def writeTDE(q, outputFileName, types): # Define type maps typesTable = { 'Bool': tde.Type.BOOLEAN, 'Int': tde.Type.INTEGER, 'Double': tde.Type.DOUBLE, 'Date': tde.Type.DATE, 'DateTime': tde.Type.DATETIME, 'Duration': tde.Type.DURATION, 'CharString': tde.Type.CHAR_STRING, 'UnicodeString': tde.Type.UNICODE_STRING } # Step 1: Create the Extract file and open the .csv tdefile = tde.Extract(outputFileName + '.tde') if tdefile.hasTable('Extract'): table = tdefile.openTable('Extract') tableDef = table.getTableDefinition() columns = q.get() numColumns = len(columns) else: # Step 2: Create the tableDef tableDef = tde.TableDefinition() columns = q.get() numColumns = len(columns) for i in range(numColumns): tableDef.addColumn(columns[i], typesTable[types[i]]) # Step 3: Create the table in the image of the tableDef table = tdefile.addTable('Extract', tableDef) # Step 4: Loop through the csv, grab all the data, put it into rows # and insert the rows into the table newrow = tde.Row(tableDef) while True: nextLine = q.get() if nextLine: for columnIndex in range(numColumns): insertRow(newrow, columnIndex, nextLine[columnIndex], types[columnIndex]) table.insert(newrow) else: break # Step 5: Close the tde tdefile.close()
def write_tde(table_df, tde_fullpath, arg_append): """ Writes the given pandas dataframe to the Tableau Data Extract given by tde_fullpath """ if arg_append and not os.path.isfile(tde_fullpath): print "Couldn't append -- file doesn't exist" arg_append = False # Remove it if already exists if not arg_append and os.path.exists(tde_fullpath): os.remove(tde_fullpath) tdefile = tde.Extract(tde_fullpath) # define the table definition table_def = tde.TableDefinition() # create a list of column names colnames = table_df.columns # create a list of column types coltypes = table_df.dtypes # for each column, add the appropriate info the Table Definition for col_idx in range(0, len(colnames)): cname = colnames[col_idx] ctype = fieldMap[str(coltypes[col_idx])] table_def.addColumn(cname, ctype) # create the extract from the Table Definition if arg_append: tde_table = tdefile.openTable('Extract') else: tde_table = tdefile.addTable('Extract', table_def) row = tde.Row(table_def) for r in range(0, table_df.shape[0]): for c in range(0, len(coltypes)): if str(coltypes[c]) == 'float64': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'float32': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'int64': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'int32': row.setDouble(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'object': row.setString(c, table_df.iloc[r, c]) elif str(coltypes[c]) == 'bool': row.setBoolean(c, table_df.iloc[r, c]) else: row.setNull(c) # insert the row tde_table.insert(row) tdefile.close() print "Wrote %d lines to %s" % (len(table_df), tde_fullpath)
def pd_tde(df,fname): try: os.system('rm -f ' + fname) os.system('rm -f DataExtract*.log') tdefile = tde.Extract(fname) except: tdefile = tde.Extract(fname) tableDef = tde.TableDefinition() colnames = df.columns coltypes = df.dtypes # for each column, add the appropriate info the Table Definition for i in range(0, len(colnames)): cname = colnames[i] ctype = fieldMap.get(str(coltypes[i])) tableDef.addColumn(cname, ctype) with tdefile as extract: table = extract.addTable("Extract", tableDef) for r in range(0, df.shape[0]): row = tde.Row(tableDef) for c in range(0, len(coltypes)): if str(coltypes[c]) == 'float64': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'float32': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'int64': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'int32': row.setDouble(c, df.iloc[r,c]) elif str(coltypes[c]) == 'object': row.setString(c, df.iloc[r,c]) elif str(coltypes[c]) == 'bool': row.setBoolean(c, df.iloc[r,c]) else: row.setNull(c) # insert the row table.insert(row)
csvreader = csv.DictReader(csvfile, delimiter=csvdelimiter, quotechar=csvquotechar, fieldnames=dfields) # we have to make our own field names for row in csvreader: for i in dfields: dtypes.append(str(type(datatyper(row[i])))) break # got stuff, we're out csvfile.seek(0) # BACK TO THE FRONT! (AGAIN!) os.chdir(targetdir) try: # Just in case the file exists already, we don't want to bomb out tdefile = tde.Extract(tdefilename) # in CWD except: os.system('del ' + targetdir + tdefilename) os.system( 'del DataExtract.log') #might as well erase this bitch too tdefile = tde.Extract(targetdir + tdefilename) # ok lets build the table definition in TDE with our list of names and types first # replacing literals with TDE datatype integers, etc tableDef = tde.TableDefinition() #create a new table def numfields = len(dfields) #print numfields if rowoutput == True: print '*** field names list ***' # debug
if type(s) == unicode: return s.encode('utf-8').strip() else: return s # create the connection and get all my bundles c = bitly_api.Connection(access_token=config['ACCESS_TOKEN']) #j = c.bundle_bundles_by_user(config['u']) j = c.user_bundle_history() myBundles = j['bundles'] # open the extract try: # Just for testing purposes and re-running tdefile = tde.Extract('bitlyDetails.tde') #in CWD except: os.remove('bitlyDetails.tde') os.remove('DataExtract.log') tdefile = tde.Extract('bitlyDetails.tde') print 'tde ok' # make the extract and define the columns tableDef = tde.TableDefinition() tableDef.addColumn("bundle_link", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_title", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_owner", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_created", tde.Type.DOUBLE) tableDef.addColumn("bundle_description", tde.Type.CHAR_STRING) tableDef.addColumn("bundle_last_mod", tde.Type.DOUBLE)
import dataextract as tde import os import riak import tweetpony as tw client = riak.RiakClient(host="127.0.0.1", http_port=10018) # Create the Extract File try: tdefile = tde.Extract('RiakExport-WebOnly.tde') except: os.remove('RiakExport-WebOnly.tde') tdefile = tde.Extract('RiakExport-WebOnly.tde') # Create the Tableau table definition tableDef = tde.TableDefinition() tableDef.addColumn('id', tde.Type.CHAR_STRING) tableDef.addColumn('text', tde.Type.CHAR_STRING) tableDef.addColumn('source', tde.Type.CHAR_STRING) tableDef.addColumn('retweeted', tde.Type.BOOLEAN) tableDef.addColumn('retweet_count', tde.Type.INTEGER) tableDef.addColumn('user_friends_count', tde.Type.INTEGER) tableDef.addColumn('language', tde.Type.CHAR_STRING) tableDef.addColumn('user_geo_enabled', tde.Type.BOOLEAN) tableDef.addColumn('Latitude', tde.Type.DOUBLE) tableDef.addColumn('Longitude', tde.Type.DOUBLE) tableDef.addColumn('created_at', tde.Type.DATETIME) # Step 3: Create the table in the image of tableDef table = tdefile.addTable('Extract', tableDef)
def extract(file_name): # move file to /extract # cd to /extract # if there is no extract called TRACK_TERM then create one, otherwise append to TRACK_TERM # define data model for extract global WORKING_DIRECTORY, TRACK_TERM from_path = WORKING_DIRECTORY + '/' + file_name to_path = WORKING_DIRECTORY + '/extract/' + file_name os.rename(from_path, to_path) os.chdir(WORKING_DIRECTORY + '/extract') # define the extract with tde.Extract(TRACK_TERM + '.tde') as extract: tableDef = tde.TableDefinition() # define the columns and the data types in the extract tableDef.addColumn('lang', tde.Type.CHAR_STRING) #0 tableDef.addColumn('sentiment', tde.Type.DOUBLE) #1 tableDef.addColumn('country', tde.Type.CHAR_STRING) #2 tableDef.addColumn('created_at', tde.Type.DATETIME) #3 tableDef.addColumn('tweet_text', tde.Type.CHAR_STRING) #4 tableDef.addColumn('Longitude', tde.Type.DOUBLE) #5 tableDef.addColumn('source', tde.Type.CHAR_STRING) #6 tableDef.addColumn('user', tde.Type.CHAR_STRING) #7 tableDef.addColumn('Latitude', tde.Type.DOUBLE) #8 table = None if not extract.hasTable('Extract'): # Table does not exist, so create it. print "Creating a new extract" table = extract.addTable('Extract', tableDef) else: # Table exists, so append the new data. print "Appending to an existing extract" table = extract.openTable('Extract') new_row = tde.Row(tableDef) # read the data from the CSV into the extract row object with open(file_name, 'r') as inf: reader = csv.DictReader(inf, delimiter=',', lineterminator='\n') for row in reader: # insert data into the row object in the correct order as defined above new_row.setCharString(0, row['lang']) sentiment = float(row['sentiment']) new_row.setDouble(1, sentiment) new_row.setCharString(2, row['country']) # parse the twitter date string: # Mon Sep 21 11:03:53 +0000 2015 # %a %b %d %H:%M:%S +0000 %Y date_object = datetime.strptime(row['created_at'], '%a %b %d %H:%M:%S +0000 %Y') year = int(datetime.strftime(date_object, '%Y')) month = int(datetime.strftime(date_object, '%m')) day = int(datetime.strftime(date_object, '%d')) hour = int(datetime.strftime(date_object, '%H')) min = int(datetime.strftime(date_object, '%M')) sec = int(datetime.strftime(date_object, '%S')) frac = 0 # fractions of a second aka milliseconds new_row.setDateTime(3, year, month, day, hour, min, sec, frac) new_row.setCharString(4, row['tweet_text']) # check if there is a value for longitude, otherwise write a 0 try: longitude = float(row['longitude']) except: longitude = 0 new_row.setDouble(5, longitude) new_row.setCharString(6, row['source']) new_row.setCharString(7, row['user']) # check if there is a value for latitude, otherwise write a 0 try: latitude = float(row['latitude']) except: latitude = 0 new_row.setDouble(8, latitude) table.insert(new_row) # if the process fails we want to be able to re-run it without collisions between file names # so give each file a unique name (unix time stamp in this case). os.rename(file_name, str(time.time()).split('.')[0] + '.csv') # cd back to working directory os.chdir(WORKING_DIRECTORY) return
else: Cache = InText elif Wejscie[0] == 's': #save to file FileSave(Data, Wejscie) elif Wejscie[0] == 'l': #load from file FileLoad(Data, Wejscie) else: if len(Wejscie) == 1: #only command sign - whole tree if Wejscie[0] == 'd': #display print('') datatree.Showdata(Data, 0) elif Wejscie[0] == 'r': #remove Data = [] elif Wejscie[ 0] == 'i': #insert, no index given so on the end of Data Data.append(Cache) else: # command sign and at least one index try: Arg = [int(elem) for elem in Wejscie[1:]] #all items of list of indexes Dat = dataextract.Extract(Data, Arg, Wejscie[0], Cache) except IndexError: print("\nError: wrong index") continue print('') if Wejscie[0] == 'r': print('Item has been removed:') elif Wejscie[0] == 'i': print('Item has been added:') datatree.Showdata(Dat, 0)
def createTDEFile(): # This part is unique for each extract. # Will need to modify.. # TODO: Make this more dynamic ################################################################################################ ################################################################################################ csvSchema = [] csvSchema.append({'<COLUMNNAME>': tdeTypes['INTEGER']}) ################################################################################################ ################################################################################################ # Try to create the TDE File try: tdeFile = tde.Extract(outputfile + '.tde') except: os.remove(outputfile + '.tde') tdeFile = tde.Extract(outputfile + '.tde') # Open newly created CSV File csvFile = open(outputfile + '.csv', "r") reader = csv.reader(csvFile) print 'Reading records from %s' % (outputfile + '.csv') # Create TDE Table definition tdeTableDef = tde.TableDefinition() # Build TDE Table Definition from csv schema above print 'Defined table schema:' for index, item in enumerate(csvSchema): for k, v in item.items(): print 'Column %i: %s <%s>' % ( index, k, tdeTypes.keys()[tdeTypes.values().index(v)]) tdeTableDef.addColumn(k, v) # Add table to TDE File tdeTable = tdeFile.addTable("Extract", tdeTableDef) # Iterate through rows and columns of csv and adding them to TDE File print 'Writing records to %s' % (csvName + '.tde') rownum = 0 for row in reader: if rownum == 0: header = row else: colnum = 0 tdeRow = tde.Row(tdeTableDef) for col in row: if colnum + 1 > len(csvSchema): print 'Something is missing here.' break add_tde_col(colnum, tdeRow, row[colnum], csvSchema[colnum].values()[0]) colnum += 1 tdeTable.insert(tdeRow) tdeRow.close() rownum += 1 print '%i rows added in total in %f seconds' % (rownum - 1, time.clock() - startTime) print 'Closing TDE and CSV File...' tdeFile.close() csvFile.close()
if arg_tde_filename: tde_filename = arg_tde_filename else: tde_filename = csv_filename.replace(".csv", ".tde") # print "Will write to [%s]" % os.path.join(tde_dirpath, tde_filename) # print # Step 1: Create the Extract file and open the csv tde_fullpath = os.path.join(tde_dirpath, tde_filename) # if the file doesn't exist, revoke the append if arg_append and not os.path.isfile(tde_fullpath): print "Couldn't append -- file doesn't exist." arg_append = False if not arg_append and os.path.isfile(tde_fullpath): os.remove(tde_fullpath) tdefile = tde.Extract(tde_fullpath) # Define the columns by the first csv table_df = pandas.read_csv(os.path.join(csv_dirpaths[0], csv_filename), names=arg_header) for join_table_file in arg_join: join_df = pandas.read_csv(join_table_file) table_df = pandas.merge(table_df, join_df, how='left') # Step 2: Create the tableDef tableDef = tde.TableDefinition() old_colnames = [] colnames_to_types = {} new_colnames_to_idx = {} for (col, dtype) in itertools.izip(table_df.columns, table_df.dtypes): colname = col.strip() # strip whitespace
# step 01 import requirements # ######################################################### import pandas import dataextract as tde # step 02 get the data into python # ######################################################### url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data' names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'] dataset = pandas.read_csv(url, names=names) # step 03 create a blank extract # ######################################################### dataExtract = tde.Extract('irisExtract.tde') # step 04 define schema # ######################################################### dataSchema = tde.TableDefinition() dataSchema.addColumn('sepal-length', tde.Type.DOUBLE) dataSchema.addColumn('sepal-width', tde.Type.DOUBLE) dataSchema.addColumn('petal-length', tde.Type.DOUBLE) dataSchema.addColumn('petal-width', tde.Type.DOUBLE) dataSchema.addColumn('class', tde.Type.CHAR_STRING) # step 05 connect schema with blank extract # ######################################################### table = dataExtract.addTable('Extract', dataSchema) # step 06 fill extract with data # ######################################################### newRow = tde.Row(dataSchema)
cursor.execute(sqlQuery) # unzip Packaged Workbook file to new folder print("unzip workbook") zip = zipfile.ZipFile(path + dashboard_file) zip.extractall(path + temp_ext) zip.close() # remove existing extract file os.remove(path + temp_ext + '/Data/Datasources/' + tdeName) # always in /Data/Datasources/extract_name.tde # create new tableau extract w/ orignal name print("create tableau extract") os.chdir(path + temp_ext + '/Data/Datasources/') tdefile = tde.Extract(tdeName) # create tde file tableDef = tde.TableDefinition() # create a new table def for column in cursor.description: tableDef.addColumn(column[0], tdeTypes[str(column[1])]) tdetable = tdefile.addTable("Extract", tableDef) # has to be Extract for some reason print("adding rows") for row in cursor.fetchall(): i = 0 newrow = tde.Row(tableDef) while i < len(row): val = row[i] t = type(row[i])
df['DATE_OF_GAME'] = df['GAME_DATE'].apply( lambda x: pd.to_datetime(str(x), format='%Y%m%d')) # g = df.groupby(['PLAYER_ID','SEASON']) # g['GAME_ID'].rank(method='dense') # df['GAME_NUMBER']=g['GAME_ID'].rank(method='dense') with open('TabNBA.csv', 'a') as f: df.to_csv(f, header=False) #Run the function above for all currently active NBA players, for all seasons in the seasons list for season in seasons: for player in players: shot_charts(season, player) #Create TDE file tdefile = tde.Extract('TabNBA.tde') #Read data from the TabNBA.csv that was created in the steps above csvReader = csv.reader(open('TabNBA.csv', 'rb'), delimiter=',', quotechar='"') #Create TDE structure if tdefile.hasTable('Extract'): table = tdefile.openTable('Extract') tableDef = table.getTableDefinition() else: tableDef = tde.TableDefinition() tableDef.addColumn('SHOT_NUMBER', tde.Type.INTEGER) tableDef.addColumn('GRID_TYPE', tde.Type.CHAR_STRING) tableDef.addColumn('GAME_ID', tde.Type.INTEGER) tableDef.addColumn('GAME_EVENT_ID', tde.Type.INTEGER) tableDef.addColumn('PLAYER_ID', tde.Type.INTEGER)
import dataextract as tde import os import riak import tweetpony as tw # Set up a Riak connection client = riak.RiakClient(host="127.0.0.1", http_port=10018) bucket = client.bucket('twitter_sample') # Create the Extract File try: tdefile = tde.Extract('RiakExport.tde') except: os.remove('RiakExport.tde') tdefile = tde.Extract('RiakExport.tde') # Create the tableDef tableDef = tde.TableDefinition() tableDef.addColumn('id', tde.Type.CHAR_STRING) tableDef.addColumn('text', tde.Type.CHAR_STRING) tableDef.addColumn('source', tde.Type.CHAR_STRING) tableDef.addColumn('retweeted', tde.Type.BOOLEAN) tableDef.addColumn('retweet_count', tde.Type.INTEGER) tableDef.addColumn('user_friends_count', tde.Type.INTEGER) tableDef.addColumn('language', tde.Type.CHAR_STRING) tableDef.addColumn('user_geo_enabled', tde.Type.BOOLEAN) tableDef.addColumn('Latitude', tde.Type.DOUBLE) tableDef.addColumn('Longitude', tde.Type.DOUBLE) tableDef.addColumn('created_at', tde.Type.DATETIME) #Step 3: Create the table in the image of tableDef