    # Assumes a test-case context with GeoTagger, YACParser and parseDataTables
    # in scope (not shown here). The GeoTagger is pointed at localhost:27017
    # (presumably a local MongoDB, as in the other tests); tagger.string_column
    # is run over every column of the parsed sample CSV.
    def test_string_column_labelling(self):
        tagger = GeoTagger('localhost', 27017)

        yacp = YACParser(filename='testdata/AdressenJHB.csv', sample_size=1800)
        tables = parseDataTables(yacp)
        t = tables[0]
        for i, row in enumerate(t.columnIter()):
            tagger.string_column(row)
Example #2
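# Context assumed but not shown in this snippet: the profiler classes
# (ColumnPatternProfiler, ColumnStatsProfiler, DataTypeDetection,
# ColumnByCellProfilerSet, ColumnProfilerSet, CharacterDistributionProfiler,
# BenfordsLawDistribution, DataTypeInterpretation), apply_profilers, to_html,
# AdvanceStructureDetector, pyyacp.datatable (as datatable) and the stdlib
# modules traceback and sys are imported at module level.
# The function walks every URI registered in a CSVCatalog, parses each CSV
# with YACParser, applies the profiler sets to the resulting table and writes
# an HTML report per table.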
def from_csv_catalog(catalog_base):
    from csvcatalog import CSVCatalog
    c = CSVCatalog(catalog_base)

    profilers = [
        ColumnPatternProfiler(),
        DataTypeDetection(),
        ColumnStatsProfiler()
    ]  # note: replaced by the profiler sets built inside the loop below;
       # other profilers: FDProfiler, XSDTypeDetection, ColumnRegexProfiler

    cnt = 0
    for uri_info in c.get_uris():
        print(uri_info)
        csv_file = uri_info['disk_location']
        if uri_info['exception'] is None:
            print("none")
        uri = uri_info['uri']
        cnt += 1
        print("{}, {} -> {}".format(cnt, uri, csv_file))
        try:
            from pyyacp import YACParser

            yacp = YACParser(filename=csv_file,
                             sample_size=1800,
                             structure_detector=AdvanceStructureDetector())
            table = datatable.parseDataTables(yacp, url=uri)

            profilers = [
                ColumnByCellProfilerSet([
                    ColumnPatternProfiler, ColumnStatsProfiler,
                    CharacterDistributionProfiler, BenfordsLawDistribution
                ]),
                ColumnProfilerSet([DataTypeDetection, DataTypeInterpretation])
            ]
            apply_profilers(table, profilers=profilers)

            to_html(table, cnt, dir='.')

        except Exception as e:
            print(traceback.format_exc())
            print(sys.exc_info()[0])
            print(e)
        print('next')
Example #3
    def test_csv(self):
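        # Assumes a test-case context with MongoClient, OSMTagger,
        # get_geonames_id, YACParser and parseDataTables in scope (not shown
        # here). The test collects GeoNames IDs for admin_level-6 regions under
        # a fixed parent, adds their admin_level-8 sub-regions, and then runs
        # tagger.label_values over every column of the parsed CSV with that
        # region list.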
        client = MongoClient('localhost', 27017)
        tagger = OSMTagger(client)

        db = client.geostore
        q = db.geonames.find({'admin_level': 6,
                              'parent': "http://sws.geonames.org/2769848/",
                              'country': "http://sws.geonames.org/2782113/"})

        r_tmp = [get_geonames_id(r['_id']) for r in q]
        regions = []
        for r in r_tmp:
            regions.append(r)
            q = db.geonames.find({'admin_level': 8, 'parent': r,
                                  "country": "http://sws.geonames.org/2782113/"})
            for sub_r in q:
                regions.append(get_geonames_id(sub_r['_id']))

        yacp = YACParser(filename='testdata/AdressenJHB.csv', sample_size=1800)
        tables = parseDataTables(yacp)
        t = tables[0]
        for i, row in enumerate(t.columnIter()):
            tagger.label_values(row, regions)
Example #4
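# Same assumed setup as from_csv_catalog above, but driven by csvContent_iter
# (imported elsewhere), which yields (uri, csv_file) pairs for the given data
# portal; the loop stops once the counter exceeds 10.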
def from_csv_iter(portalID='data_wu_ac_at'):
    profilers = [
        ColumnPatternProfiler(),
        DataTypeDetection(),
        ColumnStatsProfiler()
    ]  # note: replaced by the profiler sets built inside the loop below;
       # other profilers: FDProfiler, XSDTypeDetection, ColumnRegexProfiler
    cnt = 0
    for uri, csv_file in csvContent_iter(portalID):
        cnt += 1
        print("{}, {} -> {}".format(cnt, uri, csv_file))
        try:
            from pyyacp import YACParser

            yacp = YACParser(filename=csv_file,
                             sample_size=1800,
                             structure_detector=AdvanceStructureDetector())
            table = datatable.parseDataTables(yacp, url=uri)

            profilers = [
                ColumnByCellProfilerSet([
                    ColumnPatternProfiler, ColumnStatsProfiler,
                    CharacterDistributionProfiler, BenfordsLawDistribution
                ]),
                ColumnProfilerSet([DataTypeDetection, DataTypeInterpretation])
            ]
            apply_profilers(table, profilers=profilers)

            to_html(table, cnt, dir='.')

        except Exception as e:
            print(traceback.format_exc())
            print(sys.exc_info()[0])
            print(e)
        print('next')
        if cnt > 10:
            break
Example #5
# -*- coding: utf-8 -*-
import pyyacp.datatable as datatable
from pyyacp.table_structure_helper import AdvanceStructureDetector
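# Demo script: walk the CSV files in sample_csvs/, parse the
# multi-header/multi-table sample with YACParser and print, for each table
# that pyyacp detects, its shape, first rows, comments and header rows.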

SAMPLES_PATH = "sample_csvs"
from os import listdir
from os.path import isfile, join
onlyfiles = [join(SAMPLES_PATH, f) for f in listdir(SAMPLES_PATH) if isfile(join(SAMPLES_PATH, f))]



for csv_file in onlyfiles:
    if 'multi_head_milti_table.csv' not in csv_file:
        continue

    from pyyacp import YACParser

    yacp = YACParser(filename=csv_file,
                     structure_detector=AdvanceStructureDetector(),
                     sample_size=1800)
    print(yacp)

    tables = datatable.parseDataTables(yacp, url='http://example.org/test', max_tables=10)

    for table in tables:
        print(table.data.shape)
        print(table.data.head(5))

        print('Comments', table.comments)
        print('Headers', table.header_rows)
Example #6
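# Fragment of a larger script: portalID, csvContent_iter, the profiler classes,
# AdvanceStructureDetector, pyyacp.datatable (as datatable) and traceback are
# assumed to be defined or imported earlier in the original module.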
profilers = [
    ColumnPatternProfiler(),
    ColumnStatsProfiler(),
    DataTypeDetection()
]  # other profilers: XSDTypeDetection, ColumnRegexProfiler
cnt = 0
t = []
for uri, csv_file in csvContent_iter(portalID):
    cnt += 1
    print("{}, {} -> {}".format(cnt, uri, csv_file))
    try:
        from pyyacp import YACParser

        yacp = YACParser(filename=csv_file,
                         sample_size=1800,
                         structure_detector=AdvanceStructureDetector())
        table = datatable.parseDataTables(yacp, url=uri)
        table.apply_profiler(profilers=profilers)

        print(">>>>>TABLE{}".format(cnt))

        print('_' * 80)
        print('Headers', table.header_rows)
        print('_' * 30, 'DATA {}'.format(table.data.shape), '_' * 30)
        print('_' * 30, 'META', '_' * 30)
        for k in table.meta:
            print('[{}] {} '.format(k, table.meta[k]))
        print(table.colum_profiles())

        t.append(table)
    except Exception as e:
        print(traceback.format_exc())