def setUp(self):
    """Build the stream fixture shared by the tests.

    One row-list source feeds two branches, each ending in its own
    record-list target:

        source ---+---> aggregate ----> aggtarget
                  |
                  +---> sample ----> map ----> target

    Sets ``self.fields``, ``self.src_list``, ``self.target_list``,
    ``self.aggtarget_list`` and ``self.stream``.
    """
    # Source data: three rows over the fields a, b, c, str.
    self.fields = ds.fieldlist(["a", "b", "c", "str"])
    self.src_list = [
        [1, 2, 3, "a"],
        [4, 5, 6, "b"],
        [7, 8, 9, "a"],
    ]

    # Lists handed to the two target nodes.
    self.target_list = []
    self.aggtarget_list = []

    node_map = {
        "source": RowListSourceNode(self.src_list, self.fields),
        "target": RecordListTargetNode(self.target_list),
        "aggtarget": RecordListTargetNode(self.aggtarget_list),
        "sample": SampleNode("sample"),
        "map": FieldMapNode(drop_fields=["c"]),
        "aggregate": AggregateNode(keys=["str"]),
    }

    # (origin, destination) pairs; kept as a set, as in the original.
    links = {
        ("source", "sample"),
        ("sample", "map"),
        ("map", "target"),
        ("source", "aggregate"),
        ("aggregate", "aggtarget"),
    }

    self.stream = Stream(node_map, links)
# Re-encode a CSV file as UTF-8 CSV on standard output.
#
# Usage: the path of the input file is taken from sys.argv[1].
# The input encoding is guessed with chardet; the delimiter is ',' unless
# that yields a single column, in which case ';' is used instead.
import sys

import brewery.ds as ds
import brewery.dq as dq
from chardet.universaldetector import UniversalDetector

filename = sys.argv[1]

# Feed raw lines to the detector until it is done or the file ends.
# `open` in a `with` block replaces the Python-2-only `file()` builtin and
# guarantees the handle is closed even if detection raises.
detector = UniversalDetector()
with open(filename, 'rb') as raw:
    for line in raw:
        detector.feed(line)
        if detector.done:
            break
detector.close()

# NOTE(review): chardet may report no encoding (None) for input it cannot
# classify -- confirm how CSVDataSource behaves in that case.
encoding = detector.result["encoding"]

# First attempt: comma-separated.
src = ds.CSVDataSource(filename,
                       read_header=True,
                       encoding=encoding,
                       delimiter=',')
src.initialize()

# Only one column came back, so retry the file as semicolon-separated.
if len(src.field_names) == 1:
    src.finalize()
    src = ds.CSVDataSource(filename,
                           read_header=True,
                           encoding=encoding,
                           delimiter=';')
    src.initialize()

# Copy every record to stdout, using the source's field names.
out = ds.CSVDataTarget(sys.stdout, encoding='utf-8')
out.fields = ds.fieldlist(src.field_names)
out.initialize()

for record in src.records():
    out.append(record)

src.finalize()
out.finalize()
def output_fields(self):
    """Return a field list holding the single field named ``i``."""
    field_names = ["i"]
    return ds.fieldlist(field_names)
# Re-encode a CSV file as UTF-8 CSV on standard output.
#
# The input path is read from sys.argv[1]. `sys` and `ds` are not imported
# in this span; they are expected to already be in scope at this point.
import brewery.dq as dq
from chardet.universaldetector import UniversalDetector

filename = sys.argv[1]

# Guess the input encoding from the raw bytes, stopping as soon as the
# detector is done. `with open(...)` replaces the Python-2-only `file()`
# builtin so the handle is always closed.
detector = UniversalDetector()
with open(filename, 'rb') as raw:
    for line in raw:
        detector.feed(line)
        if detector.done:
            break
detector.close()

# NOTE(review): the detected encoding may be None for undetectable input --
# confirm that CSVDataSource accepts that.
encoding = detector.result["encoding"]

# Try ',' as the delimiter first.
src = ds.CSVDataSource(filename, read_header=True,
                       encoding=encoding, delimiter=',')
src.initialize()

# A single resulting column means ',' was not the separator; reopen the
# file using ';'.
if len(src.field_names) == 1:
    src.finalize()
    src = ds.CSVDataSource(filename, read_header=True,
                           encoding=encoding, delimiter=';')
    src.initialize()

# Write all records to stdout as UTF-8 CSV, using the source's field names.
out = ds.CSVDataTarget(sys.stdout, encoding='utf-8')
out.fields = ds.fieldlist(src.field_names)
out.initialize()

for record in src.records():
    out.append(record)

src.finalize()
out.finalize()