def from_csv_catalog(catalog_base): from csvcatalog import CSVCatalog c = CSVCatalog(catalog_base) profilers = [ ColumnPatternProfiler(), DataTypeDetection(), ColumnStatsProfiler() ] # FDProfiler(), ,ColumnStatsProfiler(), DataTypeDetection()]#,XSDTypeDetection()] #,ColumnRegexProfiler()]#,XSDTypeDetection()],,ColumnStatsProfiler() cnt = 0 for uri_info in c.get_uris(): print uri_info csv_file = uri_info['disk_location'] if uri_info['exception'] is None: print "none" uri = uri_info['uri'] cnt += 1 print "{}, {} -> {}".format(cnt, uri, csv_file) try: from pyyacp import YACParser yacp = YACParser(filename=csv_file, sample_size=1800, structure_detector=AdvanceStructureDetector()) table = datatable.parseDataTables(yacp, url=uri) profilers = [ ColumnByCellProfilerSet([ ColumnPatternProfiler, ColumnStatsProfiler, CharacterDistributionProfiler, BenfordsLawDistribution ]), ColumnProfilerSet([DataTypeDetection, DataTypeInterpretation]) ] apply_profilers(table, profilers=profilers) to_html(table, cnt, dir='.') except Exception as e: print(traceback.format_exc()) print(sys.exc_info()[0]) print e print 'next'
class TestDescriptionDetectionAdvanceStructureDetector(unittest.TestCase):
    """Header-guessing behaviour of AdvanceStructureDetector."""

    def setUp(self):
        # fresh detector per test; verbose output enabled for diagnostics
        self.structure_detector = AdvanceStructureDetector()
        self.verbose = True

    def _guess(self, table):
        # shared shortcut around the detector call under test
        return self.structure_detector.guess_headers(table,
                                                     verbose=self.verbose)

    def test_single_col_no_header(self):
        # a plain value in row 0 is not recognised as a header
        table = [['City'], ['Vienna'], ['Salzburg']]
        self.assertListEqual([], self._guess(table))

    def test_single_col_one_header(self):
        # 'Name' over person names is detected as a single header row
        table = [['Name'], ['Tim Tom'], ['Max Min']]
        self.assertListEqual([table[0]], self._guess(table))

    def test_single_col_one_header1(self):
        # a name-like first row is not treated as a header
        table = [['Vor Nachname'], ['Tim Tom'], ['Max Min']]
        self.assertListEqual([], self._guess(table))

    def test_single_col_one_header2(self):
        # textual label over numeric values is detected as a header
        table = [['Count'], ['10'], ['111']]
        self.assertListEqual([table[0]], self._guess(table))

    def test_multi_header(self):
        # two label rows over data yields a two-row header
        table = [
            ['Einwohner'],
            ['Population'],
            ['1799'],
        ]
        self.assertListEqual(table[0:2], self._guess(table))

    def test_col1_h1_mixed(self):
        # mixed-type columns with one label row yields a one-row header
        table = [
            ['Population', 'City', 'Country'],
            ['1799', 'Vienna', 'Austria'],
            ['1799', 'Salzburg', 'Austria'],
        ]
        self.assertListEqual(table[0:1], self._guess(table))
def from_csv_iter(portalID='data_wu_ac_at'): profilers = [ ColumnPatternProfiler(), DataTypeDetection(), ColumnStatsProfiler() ] #FDProfiler(), ,ColumnStatsProfiler(), DataTypeDetection()]#,XSDTypeDetection()] #,ColumnRegexProfiler()]#,XSDTypeDetection()],,ColumnStatsProfiler() cnt = 0 for uri, csv_file in csvContent_iter(portalID): cnt += 1 print "{}, {} -> {}".format(cnt, uri, csv_file) try: from pyyacp import YACParser yacp = YACParser(filename=csv_file, sample_size=1800, structure_detector=AdvanceStructureDetector()) table = datatable.parseDataTables(yacp, url=uri) profilers = [ ColumnByCellProfilerSet([ ColumnPatternProfiler, ColumnStatsProfiler, CharacterDistributionProfiler, BenfordsLawDistribution ]), ColumnProfilerSet([DataTypeDetection, DataTypeInterpretation]) ] apply_profilers(table, profilers=profilers) to_html(table, cnt, dir='.') except Exception as e: print(traceback.format_exc()) print(sys.exc_info()[0]) print e print 'next' if cnt > 10: break
# -*- coding: utf-8 -*- import pyyacp.datatable as datatable from pyyacp.table_structure_helper import AdvanceStructureDetector SAMPLES_PATH = "sample_csvs" from os import listdir from os.path import isfile, join onlyfiles = [join(SAMPLES_PATH, f) for f in listdir(SAMPLES_PATH) if isfile(join(SAMPLES_PATH, f))] for csv_file in onlyfiles: if 'multi_head_milti_table.csv' not in csv_file: continue from pyyacp import YACParser yacp = YACParser(filename=csv_file,structure_detector = AdvanceStructureDetector(),sample_size=1800) print yacp tables=datatable.parseDataTables(yacp, url='http://example.org/test', max_tables=10) for table in tables: print table.data.shape print table.data.head(5) print 'Comments', table.comments print 'Headers', table.header_rows
def setUp(self):
    """Prepare each test: verbose mode on, fresh detector instance."""
    self.verbose = True
    self.structure_detector = AdvanceStructureDetector()
def parseDataTables(yacpParser, url=None, batches=80, max_tables=1, raiseError=True, structure_detector=AdvanceStructureDetector()):
    """Split the rows produced by a YACParser into one or more DataTables.

    Rows are consumed in batches of ``batches`` lines; within a batch, runs
    of rows with equal cell counts are grouped, and group boundaries (empty
    lines, short comment lines, changed column counts) are used to detect
    table boundaries, description lines and headers.

    :param yacpParser: row source; also supplies ``.meta`` and ``.url``.
    :param url: source URL recorded on each DataTable.
    :param batches: number of rows analysed per chunk.
    :param max_tables: if more tables than this are found and ``raiseError``
        is true, a YACParserException is raised.
    :param raiseError: see ``max_tables``.
    :param structure_detector: strategy object used to guess description
        lines and header rows.
    :returns: a single DataTable when ``max_tables == 1``, else the list of
        all detected DataTables.

    NOTE(review): the default ``AdvanceStructureDetector()`` is evaluated
    once at import time and shared across calls — harmless only if the
    detector is stateless; verify.
    NOTE(review): relies on Python 2 semantics — ``map(len, g_rows)`` must
    be a list since it is consumed twice (``max`` and ``groupby``).
    """
    yacpParser.seek_line(0)
    tables = []
    cur_dt = None          # table currently being filled
    groups = []            # (cell_count, run_length) shape log over all batches

    def grouper(n, iterable):
        # yield successive n-sized tuples from iterable; last one may be short
        it = iter(iterable)
        while True:
            chunk = tuple(itertools.islice(it, n))
            if not chunk:
                return
            yield chunk

    rows = 0
    rows_to_add = []       # rows buffered for cur_dt, flushed via addRows
    skipped = 0            # count of empty lines dropped
    for g_rows in grouper(batches, yacpParser):
        rows += len(g_rows)
        # analys the shape of the rows
        r_len = map(len, g_rows)
        max_len = max(r_len)
        # most frequent row length = estimated column count for this batch
        est_colNo = _most_common_oneliner(r_len)
        # run-length encode the row lengths, e.g. [(4, 20), (0, 1), (4, 59)]
        grouped_L = [(k, sum(1 for i in g)) for k, g in itertools.groupby(r_len)]
        groups += grouped_L
        if len(grouped_L) == 1:
            # perfect, one table in this batch
            if cur_dt is None:
                # we have no table, this is hte first
                comments = structure_detector.guess_description_lines(
                    list(g_rows))
                header = structure_detector.guess_headers(list(g_rows))
                cur_dt = DataTable(yacpParser.meta, est_colNo,
                                   comments=comments, headers=header,
                                   url=url, id=len(tables))
                # data starts after the detected comment and header rows
                pos = len(comments) + len(header)
                rows_to_add.extend(g_rows[pos:])
            elif max_len == cur_dt.no_cols:
                # batch continues the current table
                rows_to_add.extend(g_rows)
            else:
                # not the same length, maybe different table , should not happen
                log.warning("NOT IMPLEMENTED", filename=yacpParser.url,
                            msg="not the same length, maybe different table")
        else:
            # lets go over the groups
            # (2,30) -> belongs to old table
            # (0,1) -> empty line
            # (1,1) -> comment line -> flag create_new
            # (4,20) -> belongs to new table -> create new table, start parsing at (1,1)
            cur_line = 0           # absolute row offset of the current group
            create_new = False     # set when comment lines announce a new table
            for i, group in enumerate(grouped_L):
                if group[0] == 0:
                    # empty line, skip
                    skipped += 1
                    pass
                elif group[0] == 1 and group[1] < 3:
                    # there is a group with one element, that should be the comment lines
                    # also this means a new table
                    if i == len(grouped_L) - 1 and [
                            sum(x) for x in zip(*grouped_L)
                    ][1] < batches:
                        # last group of a partial (final) batch: trailing
                        # comment lines, no table follows
                        # print "SUFFIX COMMENT LINES"
                        log.warning("SUFFIX COMMENT LINES")
                    else:
                        # we have more groups to come, so lets start a new table from this line
                        if not create_new:
                            parse_start = cur_line
                        create_new = True
                else:
                    # a group with more than one column
                    start = None   # set -> begin a new table at that offset
                    if cur_dt is None or create_new:
                        start = cur_line
                        if create_new:
                            # include the comment lines flagged earlier
                            start = parse_start
                    elif group[0] == cur_dt.no_cols:
                        # same width as current table: keep appending
                        #cur_dt.addRows(g_rows[cur_line:group[1]])
                        rows_to_add.extend(g_rows[cur_line:cur_line + group[1]])
                    else:
                        # seems like a new table
                        if group[1] != 1 or (
                                i == len(grouped_L) - 1 and
                                [sum(x) for x in zip(*grouped_L)][1] == batches):
                            # more than one row
                            # OR at the end of the group and still a full batch
                            start = cur_line
                        else:
                            #print ("NOT TREATED", group[0])
                            # if only one row and (at the end of the file or in the middle of a group)
                            pass
                    if start is not None:
                        # flush and close the current table before opening a new one
                        if cur_dt:
                            with Timer(key="adding {} rows".format(
                                    len(rows_to_add)), verbose=True):
                                cur_dt.addRows(rows_to_add)
                            rows_to_add = []
                            tables.append(cur_dt)
                        _rows = g_rows[start:]
                        comments = structure_detector.guess_description_lines(
                            _rows)
                        header = structure_detector.guess_headers(_rows)
                        cur_dt = DataTable(yacpParser.meta, group[0],
                                           comments=comments, headers=header,
                                           url=url, id=len(tables))
                        # data rows of the new table: after comments/header,
                        # up to the end of this group
                        pos = len(comments) + len(header) + start
                        end = cur_line + group[1]
                        rows_to_add.extend(g_rows[pos:end])
                        create_new = False
                cur_line += group[1]
    # flush the last open table
    # NOTE(review): raises AttributeError if the input had no rows at all
    # (cur_dt stays None) — confirm whether empty input is possible here.
    cur_dt.addRows(rows_to_add)
    rows_to_add = []
    tables.append(cur_dt)

    # merge adjacent shape groups with equal cell counts for the summary log
    prev_group = None
    agg_groups = []
    for group in groups:
        if prev_group is not None:
            if prev_group[0] == group[0]:
                # merge
                prev_group = (prev_group[0], prev_group[1] + group[1])
            else:
                agg_groups.append(prev_group)
                prev_group = group
        else:
            prev_group = group
    agg_groups.append(prev_group)

    log.info("TABLE SHAPE", groups=agg_groups, filename=url)
    if len(tables) > max_tables:
        if raiseError:
            raise YACParserException("Too many tables (#" +
                                     str(len(tables)) + ") shapes:" +
                                     str(agg_groups))
    log.info("Parsed table", skipped=skipped, tables=len(tables))
    if max_tables == 1:
        return tables[0]
    else:
        return tables
def setUp(self):
    """Create a fresh AdvanceStructureDetector before each test."""
    self.structure_detector = AdvanceStructureDetector()
FDProfiler(), ColumnPatternProfiler(), ColumnStatsProfiler(), DataTypeDetection() ] #,XSDTypeDetection()] #,ColumnRegexProfiler()]#,XSDTypeDetection()],,ColumnStatsProfiler() cnt = 0 t = [] for uri, csv_file in csvContent_iter(portalID): cnt += 1 print "{}, {} -> {}".format(cnt, uri, csv_file) try: from pyyacp import YACParser yacp = YACParser(filename=csv_file, sample_size=1800, structure_detector=AdvanceStructureDetector()) table = datatable.parseDataTables(yacp, url=uri) table.apply_profiler(profilers=profilers) print ">>>>>TABLE{}".format(cnt) print('_' * 80) print 'Headers', table.header_rows print '_' * 30, 'DATA {}'.format(table.data.shape), '_' * 30 print '_' * 30, 'META', '_' * 30 for k in table.meta: print '[{}] {} '.format(k, table.meta[k]) print table.colum_profiles() t.append(table) except Exception as e: