def writer(header, data, filename, option):
    """Write ("write") or update ("update") a delimited data file.

    Parameters
    ----------
    header : sequence of column names.
    data : iterable of rows; for "write" each row is a sequence, for
        "update" each row is a dict keyed by *header* names.
    filename : path of the output file.
    option : "write" or "update"; anything else prints a warning.
    """
    import csv  # local import keeps this self-contained snippet runnable

    with open(filename, "w", newline="") as outfile:
        if option == "write":
            # BUG FIX: the original called xlsxfile.writer(csvfile) and
            # xlsx.DictWriter(csvfile, ...) on undefined names; the csv
            # module provides both writer classes.
            rows = csv.writer(outfile)
            rows.writerow(header)
            for x in data:
                rows.writerow(x)
        elif option == "update":
            dict_writer = csv.DictWriter(outfile, fieldnames=header)
            dict_writer.writeheader()
            dict_writer.writerows(data)
        else:
            print("Option is not known")


# The data visualization: provide the user the ability to do two forms of
# data analysis. The first analysis should display the data in a color coded
# text format as a list in ascending or descending order (let the user
# choose) and the second should render a map to visualize the data.
# BUG FIX: pandas has no read_xlsx; read_excel is the Excel reader.
# NOTE(review): `names` is a single SQL-ish string rather than a list of
# column names — kept for compatibility, but confirm the intended columns.
iris = pd.read_excel(
    'iris.xlsx',
    names=[
        "school.city_id INT PRIMARY KEY, school.city VARCHAR() NOT NULL, school.city "
        "VARCHAR() NOT NULL, school.city_phone VARCHAR(6), school.city INT"
    ])
print(iris.head())
def trajectory(self):
    """Get the position of the cell over time.

    Reads the tracking file at ``self.path_to_data`` (.xlsx or .csv),
    keeps only the rows whose ``TrackID`` equals ``self.id``, and returns
    an (n_timepoints, 4) float numpy array of (x, y, z, t) rows.

    Raises
    ------
    ValueError
        If the file is neither an .xlsx nor a .csv file.
    """
    path = str(self.path_to_data)
    # BUG FIX: pandas has no read_xlsx — read_excel is the Excel reader.
    # endswith() checks only the extension, resolving the old TODO about
    # ".xlsx" matching anywhere in the path.
    if path.endswith(".xlsx"):
        data = pd.read_excel(self.path_to_data)
    elif path.endswith(".csv"):
        data = pd.read_csv(self.path_to_data)
    else:
        raise ValueError(
            f"'{self.path_to_data}' is not an .xlsx or .csv file.")

    # Subset the data for the given cell id.
    data = data.loc[data['TrackID'] == self.id]

    # Convert to a numpy array so plain positional indexing works below.
    data = data.to_numpy(na_value='NAN')
    trajectory = np.zeros((len(data), 4), dtype=float)
    for i, row in enumerate(data):
        # Columns 0-2 are x, y, z; column 4 is t (column 3 is presumably
        # the TrackID itself — TODO confirm against the file layout).
        trajectory[i][0:4] = row[0], row[1], row[2], row[4]
    return trajectory
def getxls(self):
    """Merge every selected Excel file into one CSV, then build a CSV of
    de-duplicated one-way station-ID pairs.

    Uses the module-global ``all_files`` (list of selected file paths) and
    writes ``csv_result.csv`` plus ``csv_result_oneway.csv`` under the
    drive root of the first selected file.
    """
    global all_files
    # First two characters of the first path, e.g. a Windows drive root "C:".
    w_dir = ''.join(list(all_files[0])[0:2])
    output_file = w_dir + '/csv_result.csv'
    # BUG FIX: output_file1 was never defined in the original; the status
    # message below says results go to csv_result_oneway.csv.
    output_file1 = w_dir + '/csv_result_oneway.csv'
    all_data_frame = []
    encoding1 = 'utf-8'  # fallback if all_files is empty
    for file in all_files:
        with open(file, 'rb') as f:
            encoding1 = chardet.detect(f.read())['encoding']
        try:
            # BUG FIX: pandas has no read_xlsx, and read_excel takes no
            # encoding argument (the xlsx format carries its own encoding).
            data_frame = pd.read_excel(file,
                                       dtype={
                                           "本端基站ID": str,
                                           "对端基站ID": str
                                       })
        except Exception:
            # BUG FIX: skip unreadable files instead of appending an
            # undefined/stale name (the original re-used the previous
            # iteration's frame after `except: pass`).
            continue
        all_data_frame.append(data_frame)
    # Stack the frames vertically (axis=0); axis=1 would join horizontally.
    data_frame_concat = pd.concat(all_data_frame, axis=0, ignore_index=True)
    data_frame_concat.to_csv(output_file, encoding=encoding1, index=False)
    with open(output_file, 'rb') as f:
        MyFileCode = chardet.detect(f.read())['encoding']
    # header=0: row 0 of the merged CSV is the header row.
    df1 = pd.read_csv(output_file,
                      encoding=MyFileCode,
                      header=0,
                      dtype={
                          "本端基站ID": str,
                          "对端基站ID": str
                      })
    df1.fillna(value="-", inplace=True)
    # Build "local;peer" and "peer;local" keys so a pair seen in both
    # directions is dropped entirely by drop_duplicates(keep=False).
    new_df1 = df1["本端基站ID"] + ";" + df1["对端基站ID"]
    new_df2 = df1["对端基站ID"] + ";" + df1["本端基站ID"]
    c = new_df1.append(new_df2)
    # keep=False removes every member of a duplicated group (keep="first" /
    # keep="last" would retain one member instead).
    c.drop_duplicates(keep=False, inplace=True)
    # drop=True discards the old index instead of saving it as a column.
    c.reset_index(drop=True)
    # Series are renamed via .name (DataFrames use .columns / .rename).
    c.name = "单向对"
    c.to_csv(output_file1, encoding=encoding1, index=False, header=True)
    self.statusbar.showMessage("结果保存在:" + w_dir + "/csv_result_oneway.csv")
def predict(self, filename):
    """Predict particle-type probabilities for the data in *filename*.

    Reads a .csv or .xlsx file, runs the MLP model, and writes a gzipped
    predictions.csv.gz with one probability column per particle class.
    Returns a FileLink to the written file.

    Raises
    ------
    ValueError
        If the filename is neither a csv nor an xlsx file.
    """
    if 'csv' in filename:
        data = pd.read_csv(filename)
    elif 'xlsx' in filename:
        # BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
        data = pd.read_excel(filename)
    else:
        # BUG FIX: the original fell through with `data` undefined,
        # crashing with an UnboundLocalError; fail explicitly instead.
        raise ValueError(f"Unsupported file type: {filename}")
    pred, _ = self.model.predict_mlp(scale(data, features).values)
    # NOTE(review): `ids`, `features`, `scale`, `FileLink` and
    # `label_class_correspondence` must come from module scope — confirm.
    # BUG FIX: the original mixed `pandas.` with the `pd` alias used above.
    prediction = pd.DataFrame({'ID': ids})
    for name in ['Ghost', 'Electron', 'Muon', 'Pion', 'Kaon', 'Proton']:
        prediction[name] = pred[:, label_class_correspondence[name]]
    prediction.to_csv('predictions.csv.gz',
                      index=False,
                      float_format='%.5f',
                      compression="gzip")
    return FileLink('predictions.csv.gz')
def file_read(path, csv, xlsx):
    """Read monthly climate data from a CSV or Excel file.

    Parameters
    ----------
    path : path to the data file.
    csv, xlsx : booleans selecting the file type; exactly one must be True.

    Returns
    -------
    (month, temp, prec) : the first three data columns as 1-D numpy arrays.

    Raises
    ------
    ValueError
        If zero or both file types are selected (after showing an error box).
    """
    if csv and xlsx:
        messagebox.showerror("ERROR", "Bitte waehlen Sie nur 1 Datentyp aus")
        # BUG FIX: the original fell through and crashed with an
        # UnboundLocalError on `data`; fail explicitly instead.
        raise ValueError("exactly one file type must be selected")
    elif csv:
        data = pd.read_csv(path, usecols=['month', 'temp', 'prec'])
    elif xlsx:
        # BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
        data = pd.read_excel(path)
    else:
        messagebox.showerror("ERROR", "Bitte waehlen Sie einen Datentyp aus")
        raise ValueError("a file type must be selected")
    # Positional extraction: columns are assumed to be month, temp, prec
    # in that order — TODO confirm for the xlsx input as well.
    data = data.to_numpy()
    month = data[:, 0]
    temp = data[:, 1]
    prec = data[:, 2]
    return month, temp, prec
def open_file(file_path):
    """Load a DataFrame from *file_path*, dispatching on its extension.

    Supports .parquet, .csv, .xlsx and .pkl/.pickle; raises for anything
    else.
    """
    # Hoist the repeated os.path.splitext call.
    ext = os.path.splitext(file_path)[-1]
    logger.info(ext)
    if ext == '.parquet':
        df = pd.read_parquet(file_path)
    elif ext == '.csv':
        logger.info('read csv')
        df = pd.read_csv(file_path)
    elif ext == '.xlsx':
        logger.info('read xlsx')
        # BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
        df = pd.read_excel(file_path)
    elif ext in ['.pkl', '.pickle']:
        logger.info('read pickle')
        df = pd.read_pickle(file_path)
    else:
        raise Exception(
            f'i dont know how to read this file extension: {os.path.splitext(file_path)[-1]}'
        )
    return df
def getxls(self):
    """Merge every selected Excel file into a single csv_result.csv placed
    under the drive root of the first selected file (paths come from the
    module-global ``all_files``).
    """
    global all_files
    # First two characters of the first path, e.g. a Windows drive root "C:".
    w_dir = ''.join(list(all_files[0])[0:2])
    output_file = w_dir + '/csv_result.csv'
    all_data_frame = []
    encoding1 = 'utf-8'  # fallback if all_files is empty
    for file in all_files:
        with open(file, 'rb') as f:
            encoding1 = chardet.detect(f.read())['encoding']
        try:
            # BUG FIX: pandas has no read_xlsx, and read_excel takes no
            # encoding argument (the xlsx format carries its own encoding).
            data_frame = pd.read_excel(file)
        except Exception:
            # BUG FIX: skip unreadable files instead of appending an
            # undefined/stale name (the original re-used the previous
            # iteration's frame after `except: pass`).
            continue
        all_data_frame.append(data_frame)
    # Stack the frames vertically (axis=0); axis=1 would join horizontally.
    data_frame_concat = pd.concat(all_data_frame, axis=0, ignore_index=True)
    data_frame_concat.to_csv(output_file, encoding=encoding1, index=False)
    self.statusbar.showMessage("结果保存在:" + w_dir + "/csv_result.csv")
password="******", database="mydatabase") # creating database_cursor to perform SQL operation db_cursor = db_connection.cursor() # executing cursor with execute method and pass SQL query db_cursor.execute("CREATE DATABASE comp490_spring_3") # get list of all databases db_cursor.execute("SHOW DATABASES") #print all databases for db in db_cursor: print(db) df = pd.read_xlsx(r'Path where the XLSX file is stored\COMP490_SPRING_3.XLSX') print(df) df = pd.read_xlsx( r'C:\Users\Electronick\OneDrive\Desktop\COMP_490\COMP490_SPRING_3.xlsx' ) #read the csv file (put 'r' before the path string to address any special characters in the path, such as '\'). print(df) conn = sqlite3.connect('TestDB.db') c = conn.cursor() # Create table - occupation occupational c.execute('''CREATE TABLE OCCUPATIONAL ([generated_id] INTEGER PRIMARY KEY,[Occupational_Name] text, [Title] text, [Date_start] dates, [Occupation] occupation, [Wage] integer, [State] text,,)''' )
import pandas as pd
import numpy as np
import glob

path = r"/Users/matthewnock/Desktop/Coding/AFSP"
# BUG FIX: without the "/" the pattern matched ".../AFSP*.xlsx" siblings of
# the folder instead of the .xlsx files inside it.
files = glob.glob(path + r"/*.xlsx")

frames = []
for filename in files:
    print(filename)
    # BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
    df = pd.read_excel(filename)
    print(df)
    new_columns = ("stroop", "afsp", "survey", "other", "valid_data")
    df['new_columns'] = np.NaN
    # NOTE(review): the original lines here were pseudocode
    # (`df.\name of column\.repalce(...)` etc.) and could not run; they are
    # preserved as TODOs until the real column name is known:
    # TODO: df[<name of column>].replace('', np.NaN)
    # TODO: df.loc[df[<name of column>].isna(), 'stroop'] = \
    #           df[df[<name of column>].isna()].Stroop.replace({np.NaN: 0})

# BUG FIX: "df_to.xlsx(path + 'r/output/compiled')" is not valid Python and
# had the raw-string prefix inside the literal; to_excel writes the output.
df.to_excel(path + r'/output/compiled')

# Create a new column named accuracy and place data from the choice_prob column in the new accuracy column
# Look at the Reversal column, in all instances where Reversal = 0 go to the accuracy column and change 25 to 0 and 75 to 1
# Look at the Reversal column, in all instances where Reversal = 1 go to the accuracy column and change 25 to 1 and 75 to 0
# frame['accuracy']=frame.choice_prob
# frame.loc[frame.Reversal==0,'accuracy']=frame[frame.Reversal==0].accuracy.replace({25:0,75:1})
# frame.loc[frame.Reversal==1,'accuracy']=frame[frame.Reversal==1].accuracy.replace({25:1,75:0})
# Look at the participants, and for each participant take the mean of their accuracy column
# axis=1 is equivalent to axis='columns'
import pandas as banana

df_banana = banana.DataFrame({"a": [11, 21, 31], "b": [21, 22, 23]})
df_banana.head()

import pandas as pd
import numpy as np

# Reading CSV files.
csv_path = 'file.csv'
df_read_csv = pd.read_csv(csv_path)

# Reading Excel files.
# BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
xlsx_path = 'file.xlsx'
df_read_xlsx = pd.read_excel(xlsx_path)

# Create a DataFrame from a dictionary.
songs = {
    "Album": ["Thriller", "Back in Black", "The Dark Side of the Moon",
              "The Bodyguard", "Bast Out of Hell",
              'Their Greatest Hits (1971-1975)', 'Saturday Night Fever',
              'Rumours'],
    "Released": [1982, 1980, 1973, 1992, 1977, 1976, 1977, 1977],
    "Length": ["00:42:19", "00:42:22", "00:42:49", "00:57:44", "00:46:33",
               "", "", ""]
}
df_songs = pd.DataFrame(songs)

# Inspect the dtypes of every column in the DataFrame.
df_songs.dtypes
# BUG FIX: .info is a method — without parentheses the original only
# referenced it and printed nothing.
df_songs.info()
import pandas as pd
import geopandas as gpd  # Import geopandas
import fiona  # library giving dict-style access to spatial data

# import warnings
# warnings.filterwarnings(action='ignore')  # ignore warning messages
# from IPython.display import display  # display() instead of print for consecutive output
# from IPython.display import HTML  # render output as HTML

# Load the CCTV spreadsheet.
# BUG FIX: pandas has no read_xlsx, and read_excel takes no encoding
# argument (the xlsx format carries its own encoding; the original passed
# encoding="EUC-KR").
df_cctv = pd.read_excel('./1.xlsx')
df_cctv.head()


def RSU(longitude, latitude, v1, v2):
    """Collect VTI data from several vehicles (geometric/weighted mean?).

    Translated from the original Korean notes: the RSU should append every
    piece of incoming vehicle info, plot the GPS data in 2-D for
    clustering, and assign a global trust value per cluster.
    """
    # latitude = []   # latitude (second coordinate)
    # longitude = []  # longitude (first coordinate)
    v1_temp = []
    v2_temp = []
    # NOTE(review): each assignment below overwrites v1_temp instead of
    # appending, and this function appears truncated in this view —
    # confirm against the full file before relying on it.
    for key, info in v1.items():
        for second_key in info:
            real_info = info['VDI']
            for third_key in real_info:
                v1_temp = real_info['createdData']
# Batch-split helper script: pick files via a Tk dialog and load each one
# (Stata/CSV/TXT/Excel) for processing.
# NOTE(review): warnings, tk, tkf, tqdm, pd, level and get_hp_type are used
# below but not imported in this chunk — they must come from elsewhere in
# the file.
import os, sys, getopt

t = 0
t1, t2 = (False, False)
warnings.filterwarnings("ignore")
TK = tk.Tk()
# Ask the user for the input files; results go in a "split" folder next to
# the first selected file.
files = list(tkf.askopenfilenames())
savepath = "%s/split/" % list(os.path.split(files[0]))[0]
os.system("mkdir %s" % savepath)
# if input('where to save? Somewhere else? {y/[n]}').lower().find('y')!= -1:
#     savepath = tkf.askdirectory()
TK.destroy()
for file in tqdm(files):
    # Dispatch on a substring of the file name.
    # NOTE(review): pandas has no read_xlsx (read_excel is the Excel
    # reader), and when no branch matches, `data` silently keeps the
    # previous file's frame — both look like bugs to fix.
    if "dta" in file:
        data = pd.read_stata(file)
    elif 'csv' in file:
        data = pd.read_csv(file)
    elif 'txt' in file:
        data = pd.read_csv(file)
    elif 'xls' in file:
        data = pd.read_xlsx(file)
    data = data.rename(columns={'agency_name': 'hp_name'})
    # Derive 'level' from hp_name where hp_type/hc/level are all missing;
    # fall back to deriving it for every row if that fails.
    try:
        data[data.eval('hp_type+hc+level').isna()]['level'] = data[
            data.eval('hp_type+hc+level').isna()]['hp_name'].apply(level)
    except:
        data['level'] = data.hp_name.apply(level)
    if all(['hp_type' in data.columns, 'hc' in data.columns]):
        # Coerce hp_type/hc/level to floats, deriving hp_type from the
        # name when the cast fails.
        # NOTE(review): this chunk appears truncated here — the outer
        # `try` has no `except` in this view.
        try:
            try:
                data.hp_type = data.hp_type.astype('float')
            except:
                data.hp_type = data.hp_name.astype('str').apply(
                    get_hp_type)
            data.hc = data.hc.astype('float')
            data.level = data.level.astype('float')
#### Build dummy variables and construct the data set ####
import requests
import pandas as pd
import numpy as np
import os
import time
import datetime
import json
from bs4 import BeautifulSoup

# BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
data_set = pd.read_excel('C:/py_saving/movie_data/total_movie.xlsx')
# Keep only the two-digit month (chars 4-5) from the openDt date value.
data_set["openDt"] = data_set["openDt"].apply(lambda x: str(x)[4:6])


def month_change(i):
    """Map a two-digit month string ('01'-'12') to its short English name.

    The original if/elif chain was truncated at '08' in this view; the
    obvious pattern is completed with a lookup table.  Unknown inputs
    return None, matching a fall-through if/elif chain.
    """
    months = {
        '01': 'jan', '02': 'feb', '03': 'mar', '04': 'apr',
        '05': 'may', '06': 'jun', '07': 'jul', '08': 'aug',
        '09': 'sep', '10': 'oct', '11': 'nov', '12': 'dec',
    }
    return months.get(i)
def __init__(self, **kwargs):
    """Build the main editor window: size and centre it, lay out the text
    area, and wire up the File/Edit/Help menus.

    Keyword Args:
        width (int): window width in pixels (class default used if absent).
        height (int): window height in pixels (class default used if absent).

    NOTE(review): self.__root, self.__thisWidth, self.__thisHeight,
    self.__thisTextArea, self.__thisMenuBar, self.__thisFileMenu,
    self.__thisEditMenu and self.__thisHelpMenu are not created here —
    they must be class attributes or be set elsewhere; confirm against
    the full class.
    """
    # Set icon (best effort — a missing Excel.ico is silently ignored).
    try:
        self.__root.wm_iconbitmap("Excel.ico")
    except:
        pass
    # Set window size (the default is 300x300).
    try:
        self.__thisWidth = kwargs['width']
    except KeyError:
        pass
    try:
        self.__thisHeight = kwargs['height']
    except KeyError:
        pass
    # Set the window title text.
    self.__root.title("Untitled - Excel")
    # Center the window on the screen.
    screenWidth = self.__root.winfo_screenwidth()
    screenHeight = self.__root.winfo_screenheight()
    # Horizontal offset so the window is centred left/right.
    left = (screenWidth / 2) - (self.__thisWidth / 2)
    # Vertical offset so the window is centred top/bottom.
    top = (screenHeight / 2) - (self.__thisHeight / 2)
    self.__root.geometry('%dx%d+%d+%d' %
                         (self.__thisWidth, self.__thisHeight, left, top))
    # Make the text area auto-resizable with the window.
    self.__root.grid_rowconfigure(0, weight=1)
    self.__root.grid_columnconfigure(0, weight=1)
    # Add controls (widgets).
    self.__thisTextArea.grid(sticky=N + E + S + W)
    # File menu: New / Open / Save / separator / Exit.
    self.__thisFileMenu.add_command(label="New", command=self.__newFile)
    self.__thisFileMenu.add_command(label="Open", command=self.__openFile)
    self.__thisFileMenu.add_command(label="Save", command=self.__saveFile)
    # A separator line in the dialog before Exit.
    self.__thisFileMenu.add_separator()
    self.__thisFileMenu.add_command(label="Exit",
                                    command=self.__quitApplication)
    self.__thisMenuBar.add_cascade(label="File", menu=self.__thisFileMenu)
    # Edit menu: Cut / Copy / Paste.
    self.__thisEditMenu.add_command(label="Cut", command=self.__cut)
    self.__thisEditMenu.add_command(label="Copy", command=self.__copy)
    self.__thisEditMenu.add_command(label="Paste", command=self.__paste)
    self.__thisMenuBar.add_cascade(label="Edit", menu=self.__thisEditMenu)
    # Help menu: About dialog.
    self.__thisHelpMenu.add_command(label="About Excel",
                                    command=self.__showAbout)
    self.__thisMenuBar.add_cascade(label="Help", menu=self.__thisHelpMenu)
    # Attach the menu bar and the scroll bar (this view begins mid-method —
    # these lines continue the window construction from the previous chunk).
    self.__root.config(menu=self.__thisMenuBar)
    self.__thisScrollBar.pack(side=RIGHT, fill=Y)
    # The scrollbar adjusts automatically according to the content.
    self.__thisScrollBar.config(command=self.__thisTextArea.yview)
    self.__thisTextArea.config(yscrollcommand=self.__thisScrollBar.set)

# ////////////////////////////////////////////////////////////////////////////////////////////////
# When your program first starts up, with the python GUI, allow the user to
# choose to either update the data or run the data visualization.
# https://towardsdatascience.com/how-to-create-an-interactive-geographic-map-using-python
# -and-bokeh-12981ca0b567
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, NumeralTickFormatter
from bokeh.palettes import brewer
from bokeh.io.doc import curdoc
from bokeh.models import Slider, HoverTool, Select
from bokeh.layouts import widgetbox, row, column

# Read the geojson map file for Realtor Neighborhoods into a GeoDataFrame.
sf = geopandas.read_file('https://raw.githubusercontent.com/JimKing100/SF_Real_Estate_Live/master'
                         '/data/Realtor%20Neighborhoods.geojson')

# Set the Coordinate Reference System (crs) for projections.
# EPSG code 4326 is also referred to as the WGS84 lat-long projection.
sf.crs = {'init': 'epsg:4326'}

# Rename columns in the geojson map file.
sf = sf.rename(columns={'geometry': 'geometry',
                        'nbrhood': 'neighborhood_name',
                        'nid': 'subdist_no'}).set_geometry('geometry')

# Change neighborhood id (subdist_no) to the correct code for Mount
# Davidson Manor and for the parks.
sf.loc[sf['neighborhood_name'] == 'Mount Davidson Manor', 'subdist_no'] = '4n'
sf.loc[sf['neighborhood_name'] == 'Golden Gate Park', 'subdist_no'] = '12a'
sf.loc[sf['neighborhood_name'] == 'Presidio', 'subdist_no'] = '12b'
sf.loc[sf['neighborhood_name'] == 'Lincoln Park', 'subdist_no'] = '12c'

# NOTE(review): sort_values returns a new frame — this result is discarded
# (use inplace=True or assign back if the sorted order is wanted).
sf.sort_values(by=['subdist_no'])

# Determine where the visualization will be
# NOTE(review): the stray word "rendered" here was the tail of a comment
# split across chunks ("Determine where the visualization will be rendered").
output_file('filename.html')
output_notebook()  # Render inline in a Jupyter Notebook

# Set up the figure(s).
fig = figure()
show(fig)

# BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
df = pd.read_excel(
    r'C:\Users\Electronick\OneDrive\Desktop\COMP_490\COMP490_SPRING_3.xlsx')

# Create an empty string called ticker_string.
ticker_string = ''

# Loop through every element of `tickers`, appending it and a comma.
for ticker in tickers:
    ticker_string += ticker
    ticker_string += ','

# Drop the last comma from `ticker_string`.
ticker_string = ticker_string[:-1]

# Create the endpoint and years strings.
endpoints = 'chart'
years = '5'


# When updating the data: let the user choose the file name for the Excel file.
def writer(header, data, filename, option):
    """Write ("write") or update ("update") a delimited data file."""
    import csv  # local import keeps this self-contained snippet runnable

    with open(filename, "w", newline="") as outfile:
        if option == "write":
            # BUG FIX: the original called writer()/DictWriter() on the
            # undefined names xlsxfile/xlsx/csvfile; the csv module
            # provides both writer classes.
            rows = csv.writer(outfile)
            rows.writerow(header)
            for x in data:
                rows.writerow(x)
        elif option == "update":
            dict_writer = csv.DictWriter(outfile, fieldnames=header)
            dict_writer.writeheader()
            dict_writer.writerows(data)
        else:
            print("Option is not known")


# The data visualization: provide the user the ability to do two forms of
# data analysis. The first analysis should display the data in a color coded
# text format as a list in ascending or descending order (let the user
# choose) and the second should render a map to visualize the data.
# BUG FIX: pandas has no read_xlsx; read_excel reads Excel files.
# NOTE(review): `names` is one SQL-ish string rather than a list of column
# names — kept as-is, but confirm the intended columns.
iris = pd.read_excel('iris.xlsx',
                     names=[
                         "school.city_id INT PRIMARY KEY, school.city VARCHAR() NOT NULL, school.city "
                         "VARCHAR() NOT NULL, school.city_phone VARCHAR(6), school.city INT"])
print(iris.head())
def import_data():
    """Interactively load a .csv/.xlsx dataset and collect the modelling
    options (time variable, output frequency, train/test split, grouping,
    and optional CPI normalisation via the BLS API).

    Prompts the user on stdin; on success returns the prepared
    ``startdata`` DataFrame (also stored in the module global of the same
    name).  Raises ``ExceededAttempts`` after ``parameters.attempts_max``
    bad inputs for a prompt.

    NOTE(review): several strings below were split across extraction
    chunks and have been rejoined; confirm against the original file.
    """
    global startdata
    # Ask for the data file path, allowing a limited number of attempts.
    filepath_attempts = parameters.zero
    while filepath_attempts < parameters.attempts_max:
        filepath = input(wrapper.fill('Please place the complete file path here with the file name for your dataset in .csv or .xlsx format.'))
        if os.path.exists(filepath):
            break
        else:
            filepath_attempts += 1
            print(wrapper.fill(f'The file path you have tried to import is not valid, please check to make sure you have the correct file path with the file name and try again. You have {3-filepath_attempts} attempts remaining.'))
            if filepath_attempts == parameters.attempts_max:
                raise(ExceededAttempts)
                exit()  # NOTE(review): unreachable after raise
    # Dispatch on the file extension.
    # NOTE(review): filepath[-3:] is three characters, so the comparison
    # with 'xlsx' (four characters) can never be True, and pandas has no
    # read_xlsx (read_excel is the Excel reader) — both need fixing
    # before .xlsx input can work.
    filepath_last_chars = filepath[-3:]
    if filepath_last_chars == 'csv':
        startdata = pd.read_csv(filepath)
    elif filepath_last_chars == 'xlsx':
        startdata = pd.read_xlsx(filepath)
    else:
        print(wrapper.fill('Please make sure your file is in .csv or .xlsx format for the import data function and try again.'))
        startdata = pd.read_csv(fr'{filepath}')
    # Normalise column names to lower case for the lookups below.
    startdata.columns = startdata.columns.str.lower()
    global y
    global timevariable
    global group
    global testdate
    global resamplefreq
    global splitdf
    global aggregate
    # Ask for the time-variable column, with limited attempts.
    timevar_attempts = parameters.zero
    while timevar_attempts < parameters.attempts_max:
        timevariablebox = input('What is the time variable you wish to use for your model?')
        timevariable = timevariablebox.lower()
        if timevariable in startdata.columns:
            break
        else:
            timevar_attempts += 1
            print(f'Time variable cannot be found in data file you imported.Please try again. You have {3-timevar_attempts} attempts remaining before the script will close.')
            if timevar_attempts == parameters.attempts_max:
                raise(ExceededAttempts)
                exit()  # NOTE(review): unreachable after raise
    # Ask for the output frequency (MS = month start, W = weekly).
    resample_attempt = parameters.zero
    while resample_attempt < parameters.attempts_max:
        resamplefreq = input('\n' + wrapper.fill('What time frequency do you want for your output? Type MS for Monthly and W for weekly results. If your data is already aggregated to a weekly or monthly level please choose that option.'))
        if resamplefreq in ['MS', 'W']:
            break
        else:
            resample_attempt += 1
            print(f'You did not select a valid frequency, please use MS for monthly time series results and W for weekly results.')
            # NOTE(review): bare `attempts_max` here — elsewhere it is
            # parameters.attempts_max; this branch would raise NameError.
            if resample_attempt == attempts_max:
                raise(ExceededAttempts)
                exit()  # NOTE(review): unreachable after raise
    # Aggregation mode ("sum" or default mean) and forecast horizon.
    aggregate = input('\n' + wrapper.fill('By default your data will be averaged for use in this model, if you need to sum your input data please type sum here otherwise press enter to continue.'))
    forecaststeps = input('\n' + wrapper.fill('How many periods into the future would you like to generate your forecast for, by default the package has set this value to 12? Keep in mind the granularity of your data. Here 12 would be a year for monthly data while 52 would be a year for weekly data. Please use integers'))
    forecaststeps = int(forecaststeps)
    # Optional train/test split date, snapped to the chosen frequency.
    splitdf = input('\n' + wrapper.fill('Will you need to split your dataset into a testing and training dataset for validating your model? y/n'))
    if splitdf.lower() == 'y':
        inputdate = input('\n' + wrapper.fill('What date do you wish to split your dataset into a training and testing dataset? This is not the date the forecast will begin.'))
        if resamplefreq == 'MS':
            testdate = first_day_of_month(inputdate)
        else:
            inputdate = datetime.datetime.strptime(inputdate, '%m-%d-%Y').date()
            testdate = last_sunday(inputdate, 6)
    # Optional grouping column (one time-series model per group).
    groupsindata = input('\n' + 'Do you have groups in your dataset? With groups you will be able to run a seperate timeseries model for each group. Please type (Y/N)')
    if groupsindata.lower() == 'y':
        group_attempt = parameters.zero
        while group_attempt < parameters.attempts_max:
            groupbox = input('\n' + 'What is the column name for the groups in your dataset?')
            group = groupbox.lower()
            if group in startdata.columns:
                startdata[group] = startdata[group].astype(str)
                break
            else:
                group_attempt += 1
                print(f'Time variable cannot be found in data file you imported.Please try again. You have {3-group_attempt} attempts remaining before the script will close.')
                if group_attempt == parameters.attempts_max:
                    raise(ExceededAttempts)
                    exit()  # NOTE(review): unreachable after raise
    # Derive period (month name) and year columns from the time variable.
    startdata[timevariable] = pd.to_datetime(startdata[timevariable])
    startdata['period'] = (startdata[timevariable].dt.strftime('%B'))
    startdata['year'] = (startdata[timevariable].dt.year)
    normalizedata = input('\n' + 'Do you want to normalize any monetary data? Yes/No')
    if normalizedata.upper() == 'YES':
        # Fetch the last 10 years of CPI data (series CUUR0000SA0) from
        # the BLS public API.
        cpi_frame = pd.DataFrame()
        headers = {'Content-type': 'application/json'}
        endyear_cpi = datetime.date.today().year
        beginyear_cpi = endyear_cpi - 10
        jsondata = json.dumps({"seriesid": ['CUUR0000SA0'], "startyear": beginyear_cpi, "endyear": endyear_cpi})
        p = requests.get('https://api.bls.gov/publicAPI/v1/timeseries/data/', data=jsondata, headers=headers, auth=HTTPBasicAuth('apikey', 'e5f82668f98943a6becb6c6dfb08841f'))
        json_data = json.loads(p.text)
        print('\n' + wrapper.fill('You are about to run a function to generate the Consumer Price Index (CPI). The CPI can be used to account for inflation in monetary data.'))
        chooseyear = input('What year do you want to index data to?')
        # Checking to make sure the year is valid, otherwise raising a custom error.
        if int(chooseyear) < beginyear_cpi or int(chooseyear) > endyear_cpi:
            raise InvalidYear(beginyear_cpi, endyear_cpi)
        choosemonth = input(f'What month of {chooseyear} do you want to index data to?')
        # Checking to make sure the month is not in the future; if it is, raising a custom error.
        if int(chooseyear) == endyear_cpi and int(datetime.datetime.strptime(choosemonth.capitalize(), "%B").strftime("%m")) > (date.today().month - 1):
            raise InvalidMonth()
        # If the user inputs an integer instead of spelling out the month,
        # this finds the proper month text to avoid errors.
        if choosemonth.isnumeric() == True:
            datetime_object = datetime.datetime.strptime(choosemonth, "%m")
            choosemonth = datetime_object.strftime("%B")
        # Flatten the BLS JSON response into cpi_frame (one row per month).
        for series in json_data['Results']['series']:
            cs = ["series id", "year", "period", "value"]
            for item in series['data']:
                data_ses = np.array([series['seriesID'], item['year'], item['periodName'], item['value']])
                row_seperator = item['year'] + '_' + item['periodName']
                cpi_f = pd.DataFrame([data_ses], [row_seperator], columns=cs)
                cpi_frame = cpi_frame.append(cpi_f)
        # Index every month's CPI to the chosen base month.
        x = cpi_frame.loc[(cpi_frame['year'] == chooseyear) & (cpi_frame['period'] == choosemonth.capitalize()), 'value'].values
        cpi_frame['CPI'] = x.astype(float) / cpi_frame['value'].astype(float)
        cpi_frame['year'] = cpi_frame['year'].astype(int)
    if normalizedata.upper() == 'YES':
        startdata = pd.merge(startdata, cpi_frame, on=["period", "year"], how='left')
        # Here make it to where the user can input which data to normalize.
        startdata['NormalizedValue'] = startdata['Cost'] * startdata['CPI']
    return startdata