def initialize(self):
    """Open the underlying CSV stream and prepare the retyped output fields."""
    self.stream = ds.CSVDataSource(self.resource, *self.args, **self.kwargs)

    # An explicit field list, when provided, overrides whatever the source
    # would detect on its own.
    if self.fields:
        self.stream.fields = self.fields

    self.stream.initialize()

    # FIXME: this is experimental form of usage
    # Retype a copy so the stream's own field list is left untouched.
    self._output_fields = self.stream.fields.copy()
    self._output_fields.retype(self._retype_dictionary)
    def transform(self):
        handle = urllib2.urlopen(self.url)

        src = ds.CSVDataSource(handle,
                               encoding=self.encoding,
                               dialect=self.dialect)
        src.initialize()

        result = self.read_source_rows(src)
        handle.close()

        return result
    def transform(self):
        handle = urllib2.urlopen(self.url)

        if not self.dialect:
            if self.url.endswith('.tsv'):
                self.dialect = 'excel-tab'
            else:
                self.dialect = 'excel'

        src = ds.CSVDataSource(handle,
                               encoding=self.encoding,
                               dialect=self.dialect)
        src.initialize()

        result = self.read_source_rows(src)
        handle.close()

        return result
Exemple #4
0
            all_fields.append(field)

# Create and initialize a data target

out = ds.CSVDataTarget("merged.csv")
out.fields = brewery.FieldList(all_fields)
out.initialize()

# Append every source file into the merged target.

for source in sources:
    path = source["file"]

    # The combined field list was assembled beforehand, so each source is
    # opened headerless and its header row is skipped.
    # (Use XLSDataSource instead for XLS inputs.)

    src = ds.CSVDataSource(path, read_header=False, skip_rows=1)
    src.fields = ds.FieldList(source["fields"])
    src.initialize()

    for record in src.records():
        # Tag every row with its originating file so provenance survives
        # the merge.
        record["file"] = path
        out.append(record)

    # Close the source stream
    src.finalize()
Exemple #5
0
def gtedgePull(gtedgeInput,profs,consts):
    """Convert GTEDGE ``pyinput*`` text files in *gtedgeInput* into per-file
    CSVs, then merge them into ``<dir>_const.csv`` and ``<dir>_profs.csv``.

    Parameters (kinds inferred from use -- TODO confirm against callers):
        gtedgeInput: directory name containing the ``pyinput*`` files; also
            used as the prefix for the merged output CSV names.
        profs: path to a profiles file, opened read-only at the end.
        consts: path to a constants file, opened read-only at the end.

    NOTE(review): several handles opened here are never closed, and the
    function ends without an explicit return -- this chunk may be truncated.
    """

    # Automatically generates .csv file of pyinput.txt files
    # to provide GTEDGE input values

    os.chdir(gtedgeInput)
    pyinputs=glob.glob('pyinput*')
    # Parallel accumulators: the *Const lists collect two-line (scalar)
    # inputs, the *Profs lists collect multi-line (profile) inputs.
    pyinputsConst=[]
    pyinputsProfs=[]
    sourcesConst=[]
    sourcesProfs=[]
    tempVarsConst=[]
    tempVarsProfs=[]
    varsConst=[]
    varsProfs=[]
    varNamesConst=[]
    varNamesProfs=[]
    constHeaders=[]
    profsHeaders=[]

    firstLine=True
    for filename in pyinputs:
        with open(filename) as openfile:
            # Count lines first, then rewind: exactly-2-line files are
            # treated as constants (header + one data row), anything else
            # as profiles.
            num_lines=sum(1 for line in openfile)
            openfile.seek(0)
            if num_lines==2:
                for line in openfile:
                    if firstLine==True:
                        # First line is a comma-separated header of names.
                        line=line.strip()
                        varNamesConst.append(line.split(','))
                        constHeaders.append(line.split(','))
                        firstLine=False
                    else:
                        # Remaining lines hold whitespace-separated values.
                        tempVarsConst.append(line.split())
                        varsConst.append(line.split())
                sourcesConst.append({"file":filename+".csv","fields":varNamesConst[0]})
            else:
                for line in openfile:
                    if firstLine==True:
                        line=line.strip()
                        varNamesProfs.append(line.split(','))
                        profsHeaders.append(line.split(','))
                        firstLine=False
                    else:
                        tempVarsProfs.append(line.split())
                        varsProfs.append(line.split())
                sourcesProfs.append({"file":filename+".csv","fields":varNamesProfs[0]})
            firstLine=True

        # Rewrite the parsed file as <filename>.csv. Mode 'wb' with
        # csv.writer is the Python 2 idiom; Python 3 would need 'w' with
        # newline=''.
        with open(filename+'.csv','wb') as csvfile:
            if num_lines==2:
                csvWriter=csv.writer(csvfile,delimiter=',')
                csvWriter.writerow(varNamesConst[0])
                for a in tempVarsConst:
                    csvWriter.writerow(a)
                # Reset the per-file buffers (varsConst/varsProfs keep the
                # accumulated rows across files).
                varNamesConst=[]
                tempVarsConst=[]
            else:
                csvWriter=csv.writer(csvfile,delimiter=',')
                csvWriter.writerow(varNamesProfs[0])
                for a in tempVarsProfs:
                    csvWriter.writerow(a)
                varNamesProfs=[]
                tempVarsProfs=[]

    # Merge all constants CSVs into a single brewery target.
    all_fields_Const=brewery.FieldList(["file"])
    for source in sourcesConst:
        for field in source["fields"]:
            pass
#            if field not in all_fields_Const.fields():
#                all_fields_Const.append(field)
    # NOTE(review): the loop above is a no-op, so `source` leaks out of it
    # and the target's fields come from the *last* constants source only.
    breweryConstOut=ds.CSVDataTarget(gtedgeInput+"_const.csv",)
    breweryConstOut.fields=brewery.FieldList(source["fields"])
    breweryConstOut.initialize()
    for source in sourcesConst:
        path=source["file"]
        src=ds.CSVDataSource(path,read_header=True,skip_rows=0)
        src.fields=ds.FieldList(source["fields"])
        src.initialize()
        breweryConstOut.field_names=(source["fields"])
        for record in src.records():
            # Tag each row with the CSV file it came from.
            record["file"]=path
            breweryConstOut.append(record)
        src.finalize()
    breweryConstOut.finalize()
    # NOTE(review): missing parentheses -- this is an attribute access, not
    # a call, so nothing happens here. Presumably close_file() was intended;
    # verify the method exists on CSVDataTarget.
    breweryConstOut.close_file

    # Merge all profile CSVs the same way.
    all_fields_Profs=brewery.FieldList(["file"])
    for source in sourcesProfs:
        for field in source["fields"]:
            # NOTE(review): membership is tested against all_fields_Const --
            # all_fields_Profs looks like the intended dedup target; confirm.
            if field not in all_fields_Const:
                all_fields_Profs.append(field)
    breweryProfsOut=ds.CSVDataTarget(gtedgeInput+"_profs_temp.csv")
    breweryProfsOut.fields=brewery.FieldList(all_fields_Profs)
    breweryProfsOut.initialize()
    for source in sourcesProfs:
        path=source["file"]
        src=ds.CSVDataSource(path,read_header=False,skip_rows=1)
        src.fields=ds.FieldList(source["fields"])
        src.initialize()
        for record in src.records():
            record["file"]=path
            breweryProfsOut.append(record)
        src.finalize()
    breweryProfsOut.finalize()
    # NOTE(review): attribute access, not a call -- see the note above.
    breweryProfsOut.close_file

    # Collapse runs of commas (empty cells) while copying the temporary
    # profiles CSV to its final name.
    in_file=open(gtedgeInput+"_profs_temp.csv")
    out_file=open(gtedgeInput+"_profs.csv","wb+")

    for line in in_file:
        line=re.sub('\,+',',',line)
        out_file.write(line)
    out_file.close()
    # NOTE(review): in_file is never closed.


#    profsHeaders=sum(profsHeaders,[])        
#    all_fields=brewery.FieldList(profsHeaders)
#    breweryProfsOut=brewery.ds.CSVDataTarget(gtedgeInput+"_profs.csv")
#    breweryProfsOut.fields=all_fields
#    breweryProfsOut.initialize()
#    for source in pyinputsProfs:
#        src=brewery.ds.CSVDataSource(source,read_header=False,skip_rows=0)
#        src.fields=brewery.ds.FieldList(profsHeaders)
#        src.initialize()
#        for record in varsProfs:
#            breweryProfsOut.append(record)
#        src.finalize()
#    breweryProfsOut.finalize()
#    breweryProfsOut.close_file

#    newConst=open(gtedgeInput+"_consts.csv","wb+")
#    headers=[]
#    for file in pyinputsCons:
#        f=open(file)
#        headers.append(f.read().split(,))


    os.chdir(os.pardir)


    file_path_profs=os.path.relpath(profs)
    file_path_const=os.path.relpath(consts)

    # NOTE(review): both handles below are opened but never closed (or used
    # in the code visible here).
    f_profs=open(file_path_profs,'r')
    f_consts=open(file_path_const,'r')
Exemple #6
0
import sys
import brewery.ds as ds
import brewery.dq as dq
from chardet.universaldetector import UniversalDetector

filename = sys.argv[1]

# Sniff the file's character encoding by feeding raw lines to chardet until
# the detector is confident (or the file is exhausted).
# Fixed: the original used the Python-2-only file() builtin and leaked the
# handle; a with-block over open() works on both 2 and 3 and closes it.
detector = UniversalDetector()
with open(filename, 'rb') as f:
    for line in f:
        detector.feed(line)
        if detector.done:
            break
detector.close()

# First attempt: comma-delimited. If the header parses as a single field,
# the real delimiter is probably a semicolon, so reopen with that instead.
src = ds.CSVDataSource(filename,
                       read_header=True,
                       encoding=detector.result["encoding"],
                       delimiter=',')
src.initialize()
if len(src.field_names) == 1:
    src.finalize()
    src = ds.CSVDataSource(filename,
                           read_header=True,
                           encoding=detector.result["encoding"],
                           delimiter=';')
    src.initialize()

# Re-emit the records as UTF-8 CSV on stdout.
out = ds.CSVDataTarget(sys.stdout, encoding='utf-8')
out.fields = ds.fieldlist(src.field_names)
out.initialize()
for record in src.records():
    out.append(record)

# Release both streams (the original left them unfinalized).
src.finalize()
out.finalize()