def writeTDE(q, outputFileName, types):
    """Drain rows from ``q`` into the 'Extract' table of a Tableau extract.

    The first item taken from ``q`` is treated as the list of column names.
    Every following item is treated as one data row, until a falsy item
    (e.g. ``None`` or an empty list) is received, which ends the transfer.

    If the extract already contains an 'Extract' table, rows are appended to
    it and the column names are only used to determine the column count.
    Otherwise the table is created from the column names and ``types``.

    Args:
        q: Object exposing ``get()``; presumably a ``Queue`` fed by a
            producer -- confirm against the caller.
        outputFileName: Path of the extract without the '.tde' suffix.
        types: Sequence of type names (keys of the type map below), one per
            column, in column order.

    Raises:
        KeyError: If an entry of ``types`` is not a known type name.
    """
    # Map the textual type names onto the Tableau SDK type constants.
    typesTable = {
        'Bool': tde.Type.BOOLEAN,
        'Int': tde.Type.INTEGER,
        'Double': tde.Type.DOUBLE,
        'Date': tde.Type.DATE,
        'DateTime': tde.Type.DATETIME,
        'Duration': tde.Type.DURATION,
        'CharString': tde.Type.CHAR_STRING,
        'UnicodeString': tde.Type.UNICODE_STRING,
    }

    # Step 1: Create (or open) the extract file.
    tdefile = tde.Extract(outputFileName + '.tde')
    try:
        # The header item is consumed in both branches so that the next
        # q.get() always yields the first data row.
        if tdefile.hasTable('Extract'):
            table = tdefile.openTable('Extract')
            tableDef = table.getTableDefinition()
            columns = q.get()
        else:
            # Step 2: Build the table definition from names and types.
            tableDef = tde.TableDefinition()
            columns = q.get()
            for i, columnName in enumerate(columns):
                tableDef.addColumn(columnName, typesTable[types[i]])
            # Step 3: Create the table in the image of the definition.
            table = tdefile.addTable('Extract', tableDef)
        numColumns = len(columns)

        # Step 4: Pull rows until the falsy sentinel arrives. A single Row
        # object is reused; every column is overwritten for each record.
        newrow = tde.Row(tableDef)
        while True:
            nextLine = q.get()
            if not nextLine:
                break
            for columnIndex in range(numColumns):
                insertRow(newrow, columnIndex, nextLine[columnIndex],
                          types[columnIndex])
            table.insert(newrow)
    finally:
        # Step 5: Always close the extract, even if a row failed to convert,
        # so the file handle is not left open.
        tdefile.close()
def write_tde(table_df, tde_fullpath, arg_append):
    """Write a pandas DataFrame to the Tableau Data Extract at tde_fullpath.

    Args:
        table_df: DataFrame to export. Column dtypes are translated to
            Tableau types through the module-level ``fieldMap``.
        tde_fullpath: Full path of the .tde file to write.
        arg_append: When true, rows are appended to the existing 'Extract'
            table. If the file does not exist, a message is printed and a
            new extract is written instead. When false, any existing file
            at ``tde_fullpath`` is removed first.

    Raises:
        KeyError: If a column dtype has no entry in ``fieldMap``.
    """
    if arg_append and not os.path.isfile(tde_fullpath):
        print("Couldn't append -- file doesn't exist")
        arg_append = False

    # Remove it if already exists
    if not arg_append and os.path.exists(tde_fullpath):
        os.remove(tde_fullpath)

    tdefile = tde.Extract(tde_fullpath)
    try:
        # Column names and dtype names, both in positional column order.
        # Iterating dtypes (rather than indexing the Series with an int)
        # stays positional even when the column labels are integers.
        colnames = list(table_df.columns)
        dtype_names = [str(dtype) for dtype in table_df.dtypes]

        # for each column, add the appropriate info to the Table Definition
        table_def = tde.TableDefinition()
        for cname, dtype_name in zip(colnames, dtype_names):
            table_def.addColumn(cname, fieldMap[dtype_name])

        # create the extract table from the Table Definition, or reopen it
        if arg_append:
            tde_table = tdefile.openTable('Extract')
        else:
            tde_table = tdefile.addTable('Extract', table_def)

        # All numeric dtypes handled here are written as doubles.
        double_dtypes = ('float64', 'float32', 'int64', 'int32')

        # One Row object is reused; every column is overwritten per record.
        row = tde.Row(table_def)
        for r in range(table_df.shape[0]):
            for c, dtype_name in enumerate(dtype_names):
                if dtype_name in double_dtypes:
                    row.setDouble(c, table_df.iloc[r, c])
                elif dtype_name == 'object':
                    row.setString(c, table_df.iloc[r, c])
                elif dtype_name == 'bool':
                    row.setBoolean(c, table_df.iloc[r, c])
                else:
                    # Unsupported dtype: store a null for this cell.
                    row.setNull(c)
            # insert the row
            tde_table.insert(row)
    finally:
        # Close the extract even when a conversion fails part-way through.
        tdefile.close()

    print("Wrote %d lines to %s" % (len(table_df), tde_fullpath))
def pd_tde(df, fname):
    """Export a pandas DataFrame to a fresh Tableau Data Extract.

    Any existing file at ``fname`` and any ``DataExtract*.log`` files in the
    current directory are removed first, then a new extract containing a
    single 'Extract' table is written.

    Args:
        df: DataFrame to export. Column dtypes are translated to Tableau
            types through the module-level ``fieldMap``.
        fname: Path of the .tde file to create.
    """
    # Local import: keeps this block self-contained without touching the
    # file's import section.
    import glob

    # Start from a clean slate. Files are removed directly instead of via a
    # shell command so that the file name is never interpreted by a shell.
    for stale_path in [fname] + glob.glob('DataExtract*.log'):
        try:
            os.remove(stale_path)
        except OSError:
            # Best effort, like `rm -f`: a missing or undeletable file is
            # not an error here; Extract() below reports real problems.
            pass

    tdefile = tde.Extract(fname)

    # Column dtype names in positional column order.
    dtype_names = [str(dtype) for dtype in df.dtypes]

    # for each column, add the appropriate info to the Table Definition
    tableDef = tde.TableDefinition()
    for cname, dtype_name in zip(df.columns, dtype_names):
        tableDef.addColumn(cname, fieldMap.get(dtype_name))

    # All numeric dtypes handled here are written as doubles.
    double_dtypes = ('float64', 'float32', 'int64', 'int32')

    with tdefile as extract:
        table = extract.addTable("Extract", tableDef)
        for r in range(df.shape[0]):
            row = tde.Row(tableDef)
            for c, dtype_name in enumerate(dtype_names):
                if dtype_name in double_dtypes:
                    row.setDouble(c, df.iloc[r, c])
                elif dtype_name == 'object':
                    row.setString(c, df.iloc[r, c])
                elif dtype_name == 'bool':
                    row.setBoolean(c, df.iloc[r, c])
                else:
                    # Unsupported dtype: store a null for this cell.
                    row.setNull(c)
            # insert the row
            table.insert(row)
# if we have a header, we don't want to try and process it if hasheader == True: csvreader.next() print '[', for row in csvreader: if rowoutput == True: # row deets, else just '.' print '************** INSERTING ROW NUMBER: ' + str( rowsinserted) + '**************' # debug output else: # only print dot every 50 records if (rowsinserted % dotsevery) == 0: print '=', columnposition = 0 newrow = tde.Row(tableDef) for t in range(numfields): fieldtype = dtypes[t].replace("<type '", "").replace( "'>", "").replace("<class '", "").replace('NoneType', 'str').replace('uuid.UUID', 'str') fieldname = dfields[t] if rowoutput == True: # column deets print str(columnposition) + ' ' + fieldname + ': ' + str( row[fieldname]) + ' (' + str(fieldtype).split( '.')[0] + ')' # debug output if fieldtype == 'str':
def _float_or_zero(record, key):
    """Return record[key] as a float, or 0 when it is missing or unparsable."""
    try:
        return float(record[key])
    except (KeyError, TypeError, ValueError):
        return 0


def extract(file_name):
    """Load one CSV of tweets into the TRACK_TERM Tableau extract.

    The CSV is moved from WORKING_DIRECTORY into its 'extract' subdirectory,
    its rows are written to the 'Extract' table of ``TRACK_TERM + '.tde'``
    (created if absent, appended to otherwise), and the CSV is then renamed
    to ``<unix timestamp>.csv``. The process working directory is switched to
    the 'extract' subdirectory for the duration and reset to
    WORKING_DIRECTORY afterwards, including when an error occurs.

    Args:
        file_name: Name of the CSV file, relative to WORKING_DIRECTORY. It
            must provide the columns lang, sentiment, country, created_at,
            tweet_text, source and user; longitude and latitude are
            optional and default to 0.
    """
    global WORKING_DIRECTORY, TRACK_TERM

    extract_dir = WORKING_DIRECTORY + '/extract'
    os.rename(WORKING_DIRECTORY + '/' + file_name,
              extract_dir + '/' + file_name)
    os.chdir(extract_dir)

    try:
        with tde.Extract(TRACK_TERM + '.tde') as tde_extract:
            # Column order matters: rows below are filled by position.
            tableDef = tde.TableDefinition()
            tableDef.addColumn('lang', tde.Type.CHAR_STRING)        # 0
            tableDef.addColumn('sentiment', tde.Type.DOUBLE)        # 1
            tableDef.addColumn('country', tde.Type.CHAR_STRING)     # 2
            tableDef.addColumn('created_at', tde.Type.DATETIME)     # 3
            tableDef.addColumn('tweet_text', tde.Type.CHAR_STRING)  # 4
            tableDef.addColumn('Longitude', tde.Type.DOUBLE)        # 5
            tableDef.addColumn('source', tde.Type.CHAR_STRING)      # 6
            tableDef.addColumn('user', tde.Type.CHAR_STRING)        # 7
            tableDef.addColumn('Latitude', tde.Type.DOUBLE)         # 8

            if tde_extract.hasTable('Extract'):
                # Table exists, so append the new data.
                print("Appending to an existing extract")
                table = tde_extract.openTable('Extract')
            else:
                # Table does not exist, so create it.
                print("Creating a new extract")
                table = tde_extract.addTable('Extract', tableDef)

            # One Row object is reused; every column is overwritten per
            # record.
            new_row = tde.Row(tableDef)

            with open(file_name, 'r') as inf:
                reader = csv.DictReader(inf, delimiter=',',
                                        lineterminator='\n')
                for row in reader:
                    new_row.setCharString(0, row['lang'])
                    new_row.setDouble(1, float(row['sentiment']))
                    new_row.setCharString(2, row['country'])

                    # Twitter date string, e.g.
                    # Mon Sep 21 11:03:53 +0000 2015
                    created = datetime.strptime(
                        row['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                    # The source format has whole-second resolution, so the
                    # fractional-second component is always 0.
                    new_row.setDateTime(3, created.year, created.month,
                                        created.day, created.hour,
                                        created.minute, created.second, 0)

                    new_row.setCharString(4, row['tweet_text'])
                    # Coordinates are optional; write 0 when absent.
                    new_row.setDouble(5, _float_or_zero(row, 'longitude'))
                    new_row.setCharString(6, row['source'])
                    new_row.setCharString(7, row['user'])
                    new_row.setDouble(8, _float_or_zero(row, 'latitude'))
                    table.insert(new_row)

        # Give the processed file a unique name (unix timestamp) so that a
        # re-run cannot collide with an earlier file of the same name.
        os.rename(file_name, str(int(time.time())) + '.csv')
    finally:
        # cd back to working directory, even if the load failed
        os.chdir(WORKING_DIRECTORY)
def createTDEFile():
    """Convert ``outputfile + '.csv'`` into ``outputfile + '.tde'``.

    The first CSV row is treated as a header and skipped. Every other row is
    written to a new 'Extract' table whose columns are described by the
    hard-coded ``csvSchema`` below. Relies on the module-level names
    ``outputfile``, ``csvName``, ``tdeTypes``, ``startTime`` and
    ``add_tde_col``.
    """
    # This part is unique for each extract.
    # Will need to modify..
    # TODO: Make this more dynamic
    ##########################################################################
    csvSchema = []
    csvSchema.append({'<COLUMNNAME>': tdeTypes['INTEGER']})
    ##########################################################################

    tdePath = outputfile + '.tde'
    csvPath = outputfile + '.csv'

    # Try to create the TDE file; if that fails, delete the existing file
    # and try once more. Only ordinary errors trigger the delete, so a
    # Ctrl-C does not remove the output file.
    try:
        tdeFile = tde.Extract(tdePath)
    except Exception:
        os.remove(tdePath)
        tdeFile = tde.Extract(tdePath)

    try:
        with open(csvPath, "r") as csvFile:
            reader = csv.reader(csvFile)
            print('Reading records from %s' % csvPath)

            # Reverse lookup tables for printing a type's name.
            typeNames = list(tdeTypes.keys())
            typeValues = list(tdeTypes.values())

            # Build TDE Table Definition from csv schema above
            tdeTableDef = tde.TableDefinition()
            print('Defined table schema:')
            for index, item in enumerate(csvSchema):
                for k, v in item.items():
                    print('Column %i: %s <%s>' % (
                        index, k, typeNames[typeValues.index(v)]))
                    tdeTableDef.addColumn(k, v)

            # Add table to TDE File
            tdeTable = tdeFile.addTable("Extract", tdeTableDef)

            # Type of each schema column, looked up once instead of per cell.
            columnTypes = [list(item.values())[0] for item in csvSchema]

            # NOTE(review): this message uses `csvName` while every path
            # above is derived from `outputfile` -- confirm it is intended.
            print('Writing records to %s' % (csvName + '.tde'))

            rowsAdded = 0
            for rownum, row in enumerate(reader):
                if rownum == 0:
                    # Header row: nothing to insert.
                    continue
                tdeRow = tde.Row(tdeTableDef)
                for colnum, value in enumerate(row):
                    if colnum >= len(columnTypes):
                        # More cells than schema columns; ignore the rest.
                        print('Something is missing here.')
                        break
                    add_tde_col(colnum, tdeRow, value, columnTypes[colnum])
                tdeTable.insert(tdeRow)
                tdeRow.close()
                rowsAdded += 1

            print('%i rows added in total in %f seconds' % (
                rowsAdded, time.clock() - startTime))
            print('Closing TDE and CSV File...')
    finally:
        # The CSV is closed by the `with` block; always close the extract
        # too, even if a row could not be converted.
        tdeFile.close()
# step 03 create a blank extract
# #########################################################
dataExtract = tde.Extract('irisExtract.tde')

# step 04 define schema
# #########################################################
# Four numeric measurements (columns 0-3) followed by the class label (4).
dataSchema = tde.TableDefinition()
for measureName in ('sepal-length', 'sepal-width',
                    'petal-length', 'petal-width'):
    dataSchema.addColumn(measureName, tde.Type.DOUBLE)
dataSchema.addColumn('class', tde.Type.CHAR_STRING)

# step 05 connect schema with blank extract
# #########################################################
table = dataExtract.addTable('Extract', dataSchema)

# step 06 fill extract with data
# #########################################################
# A single row object is reused; all five columns are overwritten for
# every record before it is inserted.
newRow = tde.Row(dataSchema)
for i in range(len(dataset)):
    for position, measureName in enumerate(('sepal-length', 'sepal-width',
                                            'petal-length', 'petal-width')):
        newRow.setDouble(position, dataset[measureName][i])
    newRow.setCharString(4, dataset['class'][i])
    table.insert(newRow)

# step 07 close the extract
# #########################################################
dataExtract.close()