from tableausdk import Extract as tde # **Module change from py2 for py3 from tableausdk import Types as tdeTypes # ** import os #Step 1: Create the Extract File if os.path.isfile('trivialExample.tde'): os.remove('trivialExample.tde') tdefile = tde.Extract('trivialExample.tde') #Step 2: Create the tableDef tableDef = tde.TableDefinition() tableDef.addColumn('rowID', tdeTypes.Type.CHAR_STRING) # Column index 0. ** tableDef.addColumn('value', tdeTypes.Type.DOUBLE) #Column index 1. ** #Step 3: Create the table in the image of the tableDef table = tdefile.addTable('Extract', tableDef) #Step 4: Create some rows and insert them into the table newrow = tde.Row(tableDef) for i in range(1, 101): newrow.setCharString(0, 'Row ' + str(i)) newrow.setDouble(1, i) table.insert(newrow) #Step 5: Close the tde tdefile.close()
def build(self): # Set filename tdefileName = self.tde_settings_ins.tde_file if os.path.isfile(tdefileName): os.remove(tdefileName) #set the extract object instance tdefile = tdeEx.Extract(tdefileName) # create the column schema tableDef = tdeEx.TableDefinition() columnsListedInOrder = [] outputColIndex = {} outputColNumber = 0 for columnName, index in sorted( self.tde_settings_ins.columnIndex.items(), key=lambda x: x[1]): BasicDataType = self.tde_settings_ins.columns[columnName] tableDef.addColumn(columnName, tdeSettings.schemaIniTypeMap[BasicDataType]) outputColIndex[columnName] = outputColNumber outputColNumber += 1 #add the column schema to the ojbect instance of EXTRACT table = tdefile.addTable("Extract", tableDef) rowNo = 0 rowsIterator = self.tde_settings_ins.inputInfo.fileInformation.yieldRowsBase( ) printedBool = False startTime = datetime.datetime.now() print "Build start time:", datetime.datetime.strftime( startTime, '%Y-%m-%d %H:%M:%S') prevPct = -1 for myReaderRow in rowsIterator: putRow = tdeEx.Row(tableDef) for columnName, index in self.tde_settings_ins.columnIndex.items(): #if not column headers need to change this to index of the column if (tdeSettings.schemaIniTypeMap[ self.tde_settings_ins.columns[columnName]] != tde.Type.UNICODE_STRING and myReaderRow[index] == ""): putRow.setNull(outputColIndex[columnName]) else: try: #valueDecoded=myReaderRow[index].decode(self.tde_settings_ins.inputInfo.fileInformation.encodeing) self.fieldSetterMap[tdeSettings.schemaIniTypeMap[ self.tde_settings_ins.columns[columnName]]]( putRow, outputColIndex[columnName], myReaderRow[index]) except: print "column name", columnName, "contains invalid data" print "value: ", myReaderRow[index] print sys.exc_info()[0] raise table.insert(putRow) # Output progress line #only print percent one time when we hit that percent rowNo += 1 pct = math.floor( self.tde_settings_ins.inputInfo.fileInformation.getPercent( rowNo)) # print "row:",rowNo # print "pct:",pct # print "value: ",pct %2 if pct % 2 == 0: if printedBool == False or prevPct != pct: print "Percent:", round(pct, 0), "%, row:", rowNo printedBool = True prevPct = pct else: printedBool = False print "tde file built to:", tdefileName endTime = datetime.datetime.now() print "End Time:", endTime diff = endTime - startTime print "Build elapsed time:", diff
def create(target_filepath, dataframe): column = [] ## 宣告Dataframe(左)pandas資料格式 對應 到TDE的資料格式(右)。 fieldMap = { 'float64': tde.Types.Type.DOUBLE, 'float32': tde.Types.Type.DOUBLE, 'int64': tde.Types.Type.INTEGER, 'int8': tde.Types.Type.INTEGER, 'object': tde.Types.Type.UNICODE_STRING, 'bool': tde.Types.Type.DOUBLE, 'datetime64[ns]': tde.Types.Type.DATETIME } ## 若TDE已存在,則刪除。 if os.path.isfile(target_filepath): os.remove(target_filepath) tdefile = tde.Extract(target_filepath) schema = tde.TableDefinition() # define the table definition colnames = dataframe.columns # dataframe all column name coltypes = dataframe.dtypes # dataframe all column's datatype ## 根據Dataframe建立Tde Schema for i in range(0, len(colnames)): cname = colnames[i] ctype = fieldMap.get(str(coltypes[i])) schema.addColumn(cname, ctype) # 加入一行欄位(名稱,資料格式) column.append(cname) ## 開啟TDE FILE寫入資料 with tdefile as extract: table = extract.addTable("Extract", schema) for r in range(0, dataframe.shape[0]): row = tde.Row(schema) ## Loop 顯示剩餘筆數 count = dataframe.shape[0] - r - 1 print('剩' + str(count) + '筆', end="\r") ## 每筆資料依照格式寫入TDE內 for c in range(0, len(coltypes)): if str(coltypes[c]) == 'float64': if dataframe.iloc[r, c] is None: row.setNull(c) else: row.setDouble(c, dataframe.iloc[r, c]) elif str(coltypes[c]) == 'float32': row.setDouble(c, dataframe.iloc[r, c]) elif str(coltypes[c]) == 'int64': if dataframe.iloc[r, c] is None: row.setNull(c) else: row.setInteger(c, dataframe.iloc[r, c]) elif str(coltypes[c]) == 'int8': row.setInteger(c, dataframe.iloc[r, c]) elif str(coltypes[c]) == 'object': if dataframe.iloc[r, c] is None: row.setNull(c) else: row.setString(c, dataframe.iloc[r, c]) elif str(coltypes[c]) == 'bool': row.setBoolean(c, dataframe.iloc[r, c]) elif str(coltypes[c]) == 'datetime64[ns]': try: dt = dataframe.iloc[r, c] row.setDateTime(c, dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, 0) except: row.setNull(c) else: row.setNull(c) ## insert the row table.insert(row) ## close the tdefile tdefile.close()
def extract(file_name): # move file to /extract # cd to /extract # if there is no extract called TRACK_TERM then create one, otherwise append to TRACK_TERM # define data model for extract global WORKING_DIRECTORY, TRACK_TERM if not os.path.exists(WORKING_DIRECTORY + '/extract/'): os.makedirs(WORKING_DIRECTORY + '/extract/') from_path = WORKING_DIRECTORY + '/' + file_name to_path = WORKING_DIRECTORY + '/extract/' + file_name os.rename(from_path, to_path) os.chdir(WORKING_DIRECTORY + '/extract') # define the extract with data.Extract(TRACK_TERM + '.tde') as extract: tableDef = data.TableDefinition() # define the columns and the data types in the extract tableDef.addColumn('lang', types.Type.CHAR_STRING) #0 tableDef.addColumn('sentiment', types.Type.DOUBLE) #1 tableDef.addColumn('country', types.Type.CHAR_STRING) #2 tableDef.addColumn('created_at', types.Type.DATETIME) #3 tableDef.addColumn('tweet_text', types.Type.CHAR_STRING) #4 tableDef.addColumn('Longitude', types.Type.DOUBLE) #5 tableDef.addColumn('source', types.Type.CHAR_STRING) #6 tableDef.addColumn('user', types.Type.CHAR_STRING) #7 tableDef.addColumn('Latitude', types.Type.DOUBLE) #8 table = None if not extract.hasTable('Extract'): # Table does not exist, so create it. print "Creating a new extract" table = extract.addTable('Extract', tableDef) else: # Table exists, so append the new data. print "Appending to an existing extract" table = extract.openTable('Extract') new_row = data.Row(tableDef) # read the data from the CSV into the extract row object with open(file_name, 'r') as inf: reader = csv.DictReader(inf, delimiter=',', lineterminator='\n') for row in reader: # insert data into the row object in the correct order as defined above new_row.setCharString(0, row['lang']) sentiment = float(row['sentiment']) new_row.setDouble(1, sentiment) new_row.setCharString(2, row['country']) # parse the twitter date string: # Mon Sep 21 11:03:53 +0000 2015 # %a %b %d %H:%M:%S +0000 %Y date_object = datetime.strptime(row['created_at'], '%a %b %d %H:%M:%S +0000 %Y') year = int(datetime.strftime(date_object, '%Y')) month = int(datetime.strftime(date_object, '%m')) day = int(datetime.strftime(date_object, '%d')) hour = int(datetime.strftime(date_object, '%H')) min = int(datetime.strftime(date_object, '%M')) sec = int(datetime.strftime(date_object, '%S')) frac = 0 # fractions of a second aka milliseconds new_row.setDateTime(3, year, month, day, hour, min, sec, frac) new_row.setCharString(4, row['tweet_text']) # check if there is a value for longitude, otherwise write a 0 try: longitude = float(row['longitude']) except: longitude = 0 new_row.setDouble(5, longitude) new_row.setCharString(6, row['source']) new_row.setCharString(7, row['user']) # check if there is a value for latitude, otherwise write a 0 try: latitude = float(row['latitude']) except: latitude = 0 new_row.setDouble(8, latitude) table.insert(new_row) # if the process fails we want to be able to re-run it without collisions between file names # so give each file a unique name (unix time stamp in this case). os.rename(file_name, str(time.time()).split('.')[0] + '.csv') # cd back to working directory os.chdir(WORKING_DIRECTORY) return