def check_reformat_data(indir, outdir, startt, endt, pstatus, lack_data, rdnf, LSN2='7a', similarity=0.7, mindepth=10, min_minutes=timedelta(minutes=10), percentage_acceptable=0.25):
    """Check raw logger files and rewrite them with a repaired header.

    Every ``.csv`` file found under *indir* whose file-name timestamp lies in
    ``[startt, endt]`` is read, checked, and written to the matching location
    under *outdir*.  Files that fail a check are reported through
    ``rdm.Write_Text(lack_data, file, reason=...)`` and skipped.

    The header written to each output file looks like this:
        Probe Type      Lowell
        Serial Number   c572
        Vessel Number   28
        VP_NUM          310473
        Vessel Name     Dawn_T
        Date Format     YYYY-MM-DD
        Time Format     HH24:MI:SS
        Temperature     C
        Depth           m
    and the data rows like this:
        HEADING  Datet(GMT)           Lat        Lon        Temperature(C)  Depth(m)
        DATA     2019-03-30 10:37:00  4002.1266  7006.9986  7.71            0.79

    Parameters:
        indir: input directory that is searched for raw files.
        outdir: output directory; *indir* is replaced by *outdir* in each
            file's directory to build the output location.
        startt, endt: datetimes bounding the file-name timestamp (inclusive).
        pstatus: telemetry status file, read with ``rdm.read_telemetrystatus``.
        lack_data: passed to ``rdm.Write_Text`` to record rejected files.
        rdnf: tab-separated file with the columns VP_NUM and VESSEL_NAME.
        LSN2: first two characters of the Lowell serial number, inserted into
            the file name when the serial number there has only two
            characters (for example '7a' for serial number '7a4c').
        similarity: minimum ``zl.str_similarity_ratio`` needed to accept a
            vessel-name match when looking up VP_NUM.
        mindepth: a file needs at least one depth above this value, otherwise
            it is treated as a test file; if the first six records are all
            deeper than this the logger is reported as faulty.
        min_minutes: minimum time span (timedelta) between first and last
            record.
        percentage_acceptable: largest accepted share of consecutive records
            with an unchanged depth before a pressure problem is reported.

    Side effects:
        Writes one reformatted file per accepted input file plus
        ``items_number.txt`` (number of accepted files per boat and in total)
        in *outdir*.
    """
    # Read telemetry status file and raw data name file
    telemetrystatus_df = rdm.read_telemetrystatus(pstatus)
    raw_data_name_df = pd.read_csv(rdnf, sep='\t')

    # Table used to count the accepted files per boat; the last row holds the total.
    total_df = pd.concat([
        telemetrystatus_df.loc[:, ['Boat']],
        pd.DataFrame(data=[['Total']], columns=['Boat'])
    ], ignore_index=True)
    total_df.insert(1, 'file_total', 0)
    total_df['Boat'] = total_df['Boat'].map(lambda x: x.replace(' ', '_'))

    # Keep the '.csv' files whose name carries a timestamp inside the window.
    # The extension is tested first so that unrelated files are ignored
    # instead of breaking the timestamp parsing.
    file_lists = []
    for file in zl.list_all_files(indir):
        if not file.endswith('.csv'):
            continue
        stem_parts = os.path.split(file)[1].split('.')[0].split('_')
        try:
            time_gmt = datetime.strptime(stem_parts[2] + ' ' + stem_parts[3],
                                         "%Y%m%d %H%M%S")
        except (IndexError, ValueError):
            print('NOTE: skipping file with unexpected name: ' + file)
            continue
        if startt <= time_gmt <= endt:
            file_lists.append(file)

    fixed_header_keys = ['Date Format', 'Time Format', 'Temperature', 'Depth']
    fixed_header_values = ['YYYY-MM-DD', 'HH24:MI:SS', 'C', 'm']
    fixed_keys_lower = [key.lower() for key in fixed_header_keys]
    data_columns = ['HEADING', 'Datet(GMT)', 'Lat', 'Lon', 'Temperature(C)', 'Depth(m)']

    # Check every selected file and save the repaired version under outdir.
    for file in file_lists:
        fpath, fname = os.path.split(file)
        # If the serial number in the file name has only 2 characters, make it 4.
        if len(fname.split('_')[1]) == 2:
            new_fname = fname[:3] + LSN2 + fname[3:]
        else:
            new_fname = fname

        # Read header and data
        try:
            df_head = zl.nrows_len_to(file, 2, name=['key', 'value'])
            df = zl.skip_len_to(file, 2)
        except KeyboardInterrupt:
            sys.exit()
        except Exception:
            print("worthless file:" + file)
            continue

        # The vessel name is the directory two levels above the file.
        vessel_name = os.path.basename(os.path.dirname(os.path.normpath(fpath)))

        if df.empty:
            rdm.Write_Text(lack_data, file, reason='data is not enough')
            continue
        df_head = df_head.reset_index(drop=True)

        # Some files lack the "DATA" marker in the first column.
        if df.shape[1] == 5:
            df.insert(0, 'HEADING', 'DATA')
        df.columns = data_columns
        df['Depth(m)'] = df['Depth(m)'].map(lambda x: '{0:.2f}'.format(float(x)))
        depths = df['Depth(m)'].astype(float).to_numpy()

        # Pressure check: count the consecutive records whose depth did not
        # change.  (Taking len() of the count array capped the value at 1,
        # which switched the check off for every file with 4+ records.)
        constant_steps = int(np.count_nonzero(np.diff(depths) == 0))
        if constant_steps > len(df) * percentage_acceptable:
            print('NOTE: pressure problem in ' + fname + ' from ' + vessel_name)
            rdm.Write_Text(lack_data, file, reason='pressure problem')
            continue

        # Reject hauls that are too short.
        dts = pd.to_datetime(df['Datet(GMT)'])
        if dts.iloc[-1] - dts.iloc[0] < min_minutes:
            print('NOTE: Haul less than 10 minutes for ' + fname + ' from ' + vessel_name)
            rdm.Write_Text(lack_data, file,
                           reason='bad data! time not more than 10 minutes')
            continue

        # Count depths > mindepth, stopping at the sixth one.  If the first
        # six records are all deep the logger never recorded the surface.
        deep_count, logger_ok = 0, True
        for position, depth in enumerate(depths):
            deep_count += int(depth > mindepth)
            if deep_count > 5:
                if deep_count == position + 1:
                    print('please change the file:' + file +
                          ' make sure the logger is work well!')
                    logger_ok = False
                break
        if not logger_ok:
            print(vessel_name + ':logger have issue:' + file)
            rdm.Write_Text(lack_data, file, reason='logger have issue')
            continue
        if deep_count == 0:  # no depth above mindepth: test file
            print("test file:" + file)
            rdm.Write_Text(lack_data, file, reason="test file")
            continue

        try:
            df['Temperature(C)'] = df['Temperature(C)'].map(
                lambda x: '{0:.2f}'.format(float(x)))
            # Normalise lat/lon (e.g. 00000.0000w -> 0000.0000), four decimals.
            df['Lon'] = df['Lon'].map(
                lambda x: '{0:.4f}'.format(float(rdm.format_lat_lon(x))))
            df['Lat'] = df['Lat'].map(
                lambda x: '{0:.4f}'.format(float(rdm.format_lat_lon(x))))
        except Exception:
            rdm.Write_Text(lack_data, file, reason='data is not enough')
            continue

        # Fix the vessel number from the telemetry status table.
        keys_lower = df_head['key'].astype(str).str.lower()
        number_rows = df_head.index[keys_lower == 'vessel number']
        if len(number_rows):
            same_boat = telemetrystatus_df['Boat'].str.lower() == vessel_name.lower()
            if same_boat.any():
                df_head.at[number_rows[0], 'value'] = str(
                    telemetrystatus_df.loc[same_boat, 'Vessel#'].iloc[0])

        # Put the four fixed rows at the end of the header with their
        # canonical values, replacing whatever the file had for them.
        is_fixed = keys_lower.isin(fixed_keys_lower)
        df_head = pd.concat([
            df_head[~is_fixed],
            pd.DataFrame(data=list(zip(fixed_header_keys, fixed_header_values)),
                         columns=['key', 'value'])
        ], ignore_index=True)

        # Fix the vessel name and serial number, inserting them when missing.
        keys_lower = df_head['key'].astype(str).str.lower()
        name_rows = keys_lower == 'vessel name'
        serial_rows = keys_lower == 'serial number'
        df_head.loc[name_rows, 'value'] = vessel_name
        for row in df_head.index[serial_rows]:
            df_head.at[row, 'value'] = df_head.at[row, 'value'].replace(':', '')
        if not serial_rows.any():
            df_head = pd.concat([
                df_head[:1],
                pd.DataFrame(data=[['Serial Number', new_fname.split('_')[1]]],
                             columns=['key', 'value']),
                df_head[1:]
            ], ignore_index=True)
        if not name_rows.any():
            df_head = pd.concat([
                df_head[:2],
                pd.DataFrame(data=[['Vessel Name', vessel_name]],
                             columns=['key', 'value']),
                df_head[2:]
            ], ignore_index=True)

        # Look up VP_NUM through the best vessel-name match above `similarity`.
        best_ratio, best_row = similarity, None
        for row in raw_data_name_df.index:
            ratio = zl.str_similarity_ratio(
                vessel_name.lower(),
                raw_data_name_df.at[row, 'VESSEL_NAME'].lower())
            if ratio > best_ratio:
                best_ratio, best_row = ratio, row
        if best_row is None:
            # Without a match there is no trustworthy VP_NUM; do not reuse
            # the one found for a previous file.
            print('NOTE: no VP_NUM match for ' + fname + ' from ' + vessel_name)
            rdm.Write_Text(lack_data, file, reason='no VP_NUM match')
            continue

        # VP_NUM goes right after "Vessel Number"; when that row is missing
        # it goes right before "Vessel Name", which always exists here.
        keys_lower = df_head['key'].astype(str).str.lower()
        number_rows = df_head.index[keys_lower == 'vessel number']
        if len(number_rows):
            vp_position = int(number_rows[0]) + 1
        else:
            vp_position = int(df_head.index[keys_lower == 'vessel name'][0])
        df_head = pd.concat([
            df_head[:vp_position],
            pd.DataFrame(data=[['VP_NUM', raw_data_name_df.at[best_row, 'VP_NUM']]],
                         columns=['key', 'value']),
            df_head[vp_position:]
        ], ignore_index=True)

        # Write the header, then append the data table to the same file.
        output_path = fpath.replace(indir, outdir)
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        out_file = os.path.join(output_path, new_fname)
        df_head.to_csv(out_file, index=0, header=0)
        df.to_csv(out_file, index=0, mode='a')

        # Count the file for its boat once it has been written.
        same_boat = total_df['Boat'].str.lower() == vessel_name.lower()
        total_df.loc[same_boat, 'file_total'] += 1

    # Calculate the total over all boats and save the table.
    try:
        last_row = len(total_df) - 1
        total_df.loc[last_row, 'file_total'] += total_df['file_total'].iloc[:-1].sum()
        total_df.to_csv(os.path.join(outdir, 'items_number.txt'), index=0)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        print("no valuable file!")
else: #match the file index = telemetrystatus_df['Boat'] #set the index for dictionary raw_dict = { } #the dictinary about raw data, use to write the data about 'time','filename','mean_temp','mean_depth' tele_dict = { } #the dictionary about telementry data,use to write the data about'time','mean_temp','mean_depth' for i in range(len(index)): #loop every boat raw_dict[index[i]] = pd.DataFrame( data=None, columns=['time', 'filename', 'mean_temp', 'mean_depth']) tele_dict[index[i]] = pd.DataFrame( data=None, columns=['time', 'mean_temp', 'mean_depth']) for file in file_lists: # loop raw files fpath, fname = os.path.split(file) #get the file's path and name # now, read header and data of every file header_df = zl.nrows_len_to(file, 2, name=['key', 'value']) #only header data_df = zl.skip_len_to(file, 2) #only data #get the vessel number of every file for i in range(len(header_df)): if header_df['key'][i].lower() == 'vessel number'.lower(): vessel_number = int(header_df['value'][i]) break #caculate the number of raw files in every vessel for i in range(len(record_file_df)): if record_file_df['Vessel#'][i] == vessel_number: if record_file_df['file_number'].isnull()[i]: record_file_df['file_number'][i] = 1 else: record_file_df['file_number'][ i] = record_file_df['file_number'][i] + 1
def _init_latlon_bounds(record_df, row, lat_lo, lat_hi, lon_lo, lon_hi):
    """Store the given lat/lon box as the bounds of row *row* of *record_df*."""
    record_df.at[row, 'min_lat'] = lat_lo
    record_df.at[row, 'max_lat'] = lat_hi
    record_df.at[row, 'min_lon'] = lon_lo
    record_df.at[row, 'max_lon'] = lon_hi


def _widen_latlon_bounds(record_df, row, lat_lo, lat_hi, lon_lo, lon_hi):
    """Widen the lat/lon bounds of row *row* of *record_df* to include the given box."""
    if record_df.at[row, 'min_lat'] > lat_lo:
        record_df.at[row, 'min_lat'] = lat_lo
    if record_df.at[row, 'max_lat'] < lat_hi:
        record_df.at[row, 'max_lat'] = lat_hi
    if record_df.at[row, 'min_lon'] > lon_lo:
        record_df.at[row, 'min_lon'] = lon_lo
    if record_df.at[row, 'max_lon'] < lon_hi:
        record_df.at[row, 'max_lon'] = lon_hi


def match_tele_raw(input_dir, path_save, telemetry_status, start_time, end_time, telemetry_path='https://www.nefsc.noaa.gov/drifter/emolt.dat', accept_minutes_diff=20, acceptable_distance_diff=2, dpi=300):
    """Match the raw logger files with the telemetry records.

    For every boat this counts the raw files, the telemetry records and the
    pairs that match in vessel number, time and position, and collects the
    temperature and depth differences of the matched pairs.

    Parameters:
        input_dir: directory searched for raw '.csv' files.
        path_save: directory where the statistics file is written.
        telemetry_status: telemetry status file, read with
            ``read_telemetrystatus`` (provides the 'Boat' and 'Vessel#' columns).
        start_time, end_time: strings in '%Y-%m-%d' format; telemetry records
            are kept when ``start <= time < end`` after both bounds have gone
            through ``zl.local2utc``.
        telemetry_path: location handed to ``read_telemetry``.
        accept_minutes_diff: largest accepted time difference, in minutes,
            between a raw file and a telemetry record.
        acceptable_distance_diff: largest accepted ``zl.dist`` result between
            the last usable raw position and the telemetry position.
        dpi: not used in this function.

    Returns:
        ``(raw_dict, tele_dict, record_file_df, index, start_time_local,
        end_time_local, path_save)``: ``raw_dict`` / ``tele_dict`` map each
        boat name to a DataFrame of per-file / per-telemetry-record means,
        ``record_file_df`` is the per-boat statistics table and ``index`` is
        the 'Boat' column of the telemetry status table.

    Side effects:
        Writes '<start_time>_<end_time> statistics.csv' into *path_save* and
        calls ``sys.exit()`` when no telemetry records or no raw files are
        available.
    """
    # Read the telemetry status file.
    telemetrystatus_df = read_telemetrystatus(telemetry_status)
    # Record table: per boat the min, max and average differences of depth
    # and temperature, and the numbers of files, telemetry records and matches.
    record_file_df = telemetrystatus_df.loc[:, ['Boat', 'Vessel#']].reindex(
        columns=['Boat', 'Vessel#', 'matched_number', 'file_number', 'tele_num',
                 'max_diff_depth', 'min_diff_depth', 'average_diff_depth',
                 'max_diff_temp', 'min_diff_temp', 'average_diff_temp',
                 'sum_diff_depth', 'sum_diff_temp',
                 'min_lat', 'max_lat', 'min_lon', 'max_lon'],
        fill_value=None)
    # Transfer the time format of string to datetime.
    start_time_local = datetime.strptime(start_time, '%Y-%m-%d')
    end_time_local = datetime.strptime(end_time, '%Y-%m-%d')
    file_lists = [file for file in zl.list_all_files(input_dir)
                  if file.endswith('.csv')]

    # Download the telemetry data and keep the records inside the interval.
    tele_df = read_telemetry(telemetry_path)
    window_start = zl.local2utc(start_time_local)
    window_end = zl.local2utc(end_time_local)
    tele_rows = []
    for i in range(len(tele_df)):
        tele_time = datetime.strptime(
            str(tele_df['year'].iloc[i]) + '-' + str(tele_df['month'].iloc[i]) + '-' +
            str(tele_df['day'].iloc[i]) + ' ' + str(tele_df['Hours'].iloc[i]) + ':' +
            str(tele_df['minates'].iloc[i]) + ':' + '00', '%Y-%m-%d %H:%M:%S')
        if window_start <= tele_time < window_end:
            tele_rows.append([
                tele_df['vessel_n'].iloc[i], tele_df['esn'].iloc[i], tele_time,
                tele_df['lon'].iloc[i], tele_df['lat'].iloc[i],
                tele_df['depth'].iloc[i], tele_df['temp'].iloc[i]
            ])
    # Build the frame once; appending row by row relied on the removed
    # DataFrame.append and copied the whole frame for every record.
    valuable_tele_df = pd.DataFrame(
        data=tele_rows,
        columns=['vessel_n', 'esn', 'time', 'lon', 'lat', 'depth', 'temp'])

    # Stop when the telemetry data or the raw files are missing.
    if len(valuable_tele_df) == 0 and len(file_lists) == 0:
        print(
            'please check the data website of telementry and the directory of raw_data is exist!'
        )
        sys.exit()
    elif len(valuable_tele_df) == 0:
        print('please check the data website of telementry!')
        sys.exit()
    elif len(file_lists) == 0:
        print('please check the directory raw_data is exist!')
        sys.exit()

    # Per-boat row lists; they are turned into DataFrames at the end.
    index = telemetrystatus_df['Boat']
    raw_columns = ['time', 'filename', 'mean_temp', 'mean_depth', 'mean_lat', 'mean_lon']
    tele_columns = ['time', 'mean_temp', 'mean_depth', 'mean_lat', 'mean_lon']
    raw_rows = {boat: [] for boat in index}
    tele_out_rows = {boat: [] for boat in index}

    for file in file_lists:  # loop raw files
        fpath, fname = os.path.split(file)
        header_df = zl.nrows_len_to(file, 2, name=['key', 'value'])  # only header
        data_df = zl.skip_len_to(file, 2)  # only data

        # Vessel number of this file.  It is reset for every file so that a
        # header without the row cannot inherit the previous file's vessel.
        vessel_number = None
        for key, value in zip(header_df['key'], header_df['value']):
            if key.lower() == 'vessel number':
                vessel_number = int(value)
                break
        if vessel_number is None:
            print('NOTE: no vessel number in the header of ' + file)
            continue

        # Keep records deeper than 85% of the mean depth, slice from index
        # label 2 onwards, then drop temperatures outside mean +/- 3 std.
        bottom_df = data_df.loc[data_df['Depth(m)'] > 0.85 * mean(data_df['Depth(m)'])]
        bottom_df = bottom_df.loc[2:]
        temps = bottom_df['Temperature(C)']
        bottom_df = bottom_df.loc[(temps > mean(temps) - 3 * std(temps)) &
                                  (temps < mean(temps) + 3 * std(temps))]
        value_data_df = bottom_df.reset_index(drop=True)
        if value_data_df.empty:
            print('NOTE: no usable bottom data in ' + file)
            continue

        # Convert lat/lon with cv.dm2dd and replace both columns at once.
        converted = [cv.dm2dd(lat_dm, lon_dm) for lat_dm, lon_dm in
                     zip(value_data_df['Lat'], value_data_df['Lon'])]
        value_data_df['Lat'] = [pair[0] for pair in converted]
        value_data_df['Lon'] = [pair[1] for pair in converted]
        lats = value_data_df['Lat'].values
        lons = value_data_df['Lon'].values
        min_lat, max_lat = min(lats), max(lats)
        min_lon, max_lon = min(lons), max(lons)
        mean_lat = str(round(mean(lats), 4))
        mean_lon = str(round(mean(lons), 4))
        # The mean temperature skips the first remaining record.
        mean_temp = str(round(mean(value_data_df['Temperature(C)'].iloc[1:]), 2))
        mean_depth = str(
            abs(int(round(mean(value_data_df['Depth(m)'].values))))).zfill(3)

        # Count the raw files of the vessel and track its lat/lon range.
        vessel_rows = record_file_df.index[record_file_df['Vessel#'] == vessel_number]
        for row in vessel_rows:
            if pd.isnull(record_file_df.at[row, 'file_number']):
                _init_latlon_bounds(record_file_df, row,
                                    min_lat, max_lat, min_lon, max_lon)
                record_file_df.at[row, 'file_number'] = 1
            else:
                record_file_df.at[row, 'file_number'] = int(
                    record_file_df.at[row, 'file_number'] + 1)
                _widen_latlon_bounds(record_file_df, row,
                                     min_lat, max_lat, min_lon, max_lon)

        # Time of the file, taken from its name (GMT) and converted with
        # zl.gmt_to_eastern.
        name_parts = fname.split('.')[0].split('_')
        time_str = name_parts[2] + ' ' + name_parts[3]
        time_local = zl.gmt_to_eastern(time_str[0:4] + '-' + time_str[4:6] + '-' +
                                       time_str[6:8] + ' ' + time_str[9:11] + ':' +
                                       time_str[11:13] + ':' + time_str[13:15])
        time_gmt = datetime.strptime(time_str, "%Y%m%d %H%M%S")
        # Position of the last usable record.
        lat, lon = value_data_df['Lat'].iloc[-1], value_data_df['Lon'].iloc[-1]

        # Write the data of the raw file to the rows of its boat(s).
        same_vessel = telemetrystatus_df['Vessel#'] == vessel_number
        for boat in telemetrystatus_df.loc[same_vessel, 'Boat']:
            raw_rows[boat].append([time_local, fname, float(mean_temp),
                                   float(mean_depth), float(mean_lat),
                                   float(mean_lon)])

        # Count the successful matches and track the minimum, maximum and
        # sum of the temperature and depth differences.
        for tele in valuable_tele_df.itertuples(index=False):
            if tele.vessel_n.split('_')[1] != str(vessel_number):
                continue
            if not abs(tele.time - time_gmt) <= timedelta(minutes=accept_minutes_diff):
                continue  # time does not match
            if not zl.dist(lat1=lat, lon1=lon, lat2=float(tele.lat),
                           lon2=float(tele.lon)) <= acceptable_distance_diff:
                continue  # distance does not match
            if not len(vessel_rows):
                continue
            row = vessel_rows[0]  # only the first row of the vessel is updated
            diff_temp = round(float(mean_temp) - float(tele.temp), 4)
            diff_depth = round(float(mean_depth) - float(tele.depth), 4)
            if pd.isnull(record_file_df.at[row, 'matched_number']):
                record_file_df.at[row, 'matched_number'] = 1
                record_file_df.at[row, 'sum_diff_temp'] = diff_temp
                record_file_df.at[row, 'max_diff_temp'] = diff_temp
                record_file_df.at[row, 'min_diff_temp'] = diff_temp
                record_file_df.at[row, 'sum_diff_depth'] = diff_depth
                record_file_df.at[row, 'max_diff_depth'] = diff_depth
                record_file_df.at[row, 'min_diff_depth'] = diff_depth
            else:
                record_file_df.at[row, 'matched_number'] = int(
                    record_file_df.at[row, 'matched_number'] + 1)
                record_file_df.at[row, 'sum_diff_temp'] = (
                    record_file_df.at[row, 'sum_diff_temp'] + diff_temp)
                record_file_df.at[row, 'sum_diff_depth'] = (
                    record_file_df.at[row, 'sum_diff_depth'] + diff_depth)
                if record_file_df.at[row, 'max_diff_temp'] < diff_temp:
                    record_file_df.at[row, 'max_diff_temp'] = diff_temp
                if record_file_df.at[row, 'min_diff_temp'] > diff_temp:
                    record_file_df.at[row, 'min_diff_temp'] = diff_temp
                if record_file_df.at[row, 'max_diff_depth'] < diff_depth:
                    record_file_df.at[row, 'max_diff_depth'] = diff_depth
                if record_file_df.at[row, 'min_diff_depth'] > diff_depth:
                    record_file_df.at[row, 'min_diff_depth'] = diff_depth

    # Count the telemetry records per boat, extend the lat/lon range and
    # collect 'time', 'mean_temp', 'mean_depth' of the telemetry.
    for tele in valuable_tele_df.itertuples(index=False):
        tele_vessel = int(tele.vessel_n.split('_')[1])
        for row in telemetrystatus_df.index[telemetrystatus_df['Vessel#'] == tele_vessel]:
            if pd.isnull(record_file_df.at[row, 'tele_num']):
                record_file_df.at[row, 'tele_num'] = 1
            else:
                record_file_df.at[row, 'tele_num'] = record_file_df.at[row, 'tele_num'] + 1
            if pd.isnull(record_file_df.at[row, 'max_lat']):
                _init_latlon_bounds(record_file_df, row,
                                    tele.lat, tele.lat, tele.lon, tele.lon)
            else:
                _widen_latlon_bounds(record_file_df, row,
                                     tele.lat, tele.lat, tele.lon, tele.lon)
            tele_out_rows[telemetrystatus_df.at[row, 'Boat']].append([
                tele.time, float(tele.temp), float(tele.depth),
                float(tele.lat), float(tele.lon)
            ])
    print("finish the calculate of min_lat and min_lon!")

    # Averages of the matched differences; missing counters become 0.
    for row in record_file_df.index:
        if not pd.isnull(record_file_df.at[row, 'matched_number']):
            record_file_df.at[row, 'average_diff_depth'] = round(
                record_file_df.at[row, 'sum_diff_depth'] /
                record_file_df.at[row, 'matched_number'], 4)
            record_file_df.at[row, 'average_diff_temp'] = round(
                record_file_df.at[row, 'sum_diff_temp'] /
                record_file_df.at[row, 'matched_number'], 4)
        else:
            record_file_df.at[row, 'matched_number'] = 0
        if pd.isnull(record_file_df.at[row, 'tele_num']):
            record_file_df.at[row, 'tele_num'] = 0
        if pd.isnull(record_file_df.at[row, 'file_number']):
            record_file_df.at[row, 'file_number'] = 0

    # Build the per-boat tables; the raw table is sorted by time.
    raw_dict = {}
    tele_dict = {}
    for boat in index:
        boat_raw_df = pd.DataFrame(data=raw_rows[boat],
                                   columns=raw_columns).sort_values(by=['time'])
        boat_raw_df.index = range(len(boat_raw_df))
        raw_dict[boat] = boat_raw_df
        tele_dict[boat] = pd.DataFrame(data=tele_out_rows[boat], columns=tele_columns)

    record_file_df = record_file_df.drop(['sum_diff_depth', 'sum_diff_temp'],
                                         axis=1)
    # Save the record file.
    record_file_df.to_csv(path_save + '/' + start_time + '_' + end_time +
                          ' statistics.csv',
                          index=0)
    return raw_dict, tele_dict, record_file_df, index, start_time_local, end_time_local, path_save
file_lists = [] for file in allfile_lists: if file[len(file) - 4:] == '.csv': file_lists.append(file) #start check the data and save in the output_dir for file in file_lists: fpath, fname = os.path.split(file) #get the file's path and name #fix the file name fname = file.split('/')[len(file.split('/')) - 1] if len(fname.split('_') [1]) == 2: # if the serieal number is only 2 digits make it 4 new_fname = fname[:3] + Lowell_SN_2 + fname[3:] else: new_fname = fname df_head = zl.nrows_len_to(file, 2, name=['key', 'value']) #only read header df_data = zl.skip_len_to(file, 2) #only data #the standard data have 6 columns, sometimes the data possible lack of the column of the HEADING.If lack, fixed it if len(df_data.iloc[0] ) == 5: # some files didn't have the "DATA" in the first column df_data.insert(0, 'HEADING', 'DATA') #keep the lat and lon data format is right,such as 00000.0000w to 0000.0000 df_data.columns = [ 'HEADING', 'Datet(GMT)', 'Lat', 'Lon', 'Temperature(C)', 'Depth(m)' ] #rename the name of conlum of data for i in range(0, len(df_data)): if len(str(df_data['Lat'][i]).split('.')[0]) > 4 or 'A' <= str( df_data['Lat'][i] ).split('.')[1][len(str(df_data['Lat'][i]).split('.')[1]) - 1:] <= 'Z': df_data['Lat'][i] = str(df_data['Lat'][i]).split( '.')[0][len(str(df_data['Lat'][i]).split('.')[0]) -
def check_reformat_data(input_dir, output_dir, telemetry_status_file, raw_data_name_file, Lowell_SN_2='7a', similarity=0.7, mindepth=10):
    """Check raw logger files and rewrite them with a repaired header.

    Every '.csv' file under *input_dir* is read, its data columns are
    reformatted, and its header is checked for the vessel name, vessel number
    and serial number; VP_NUM is added.  Test files are reported and skipped.
    The result is written to the matching location under *output_dir*.

    Parameters:
        input_dir: directory searched for raw files.
        output_dir: output directory; *input_dir* is replaced by *output_dir*
            in each file's directory to build the output location.
        telemetry_status_file: telemetry status file, read with
            ``read_telemetrystatus`` ('Boat' and 'Vessel#' columns are used).
        raw_data_name_file: tab-separated file with the columns VP_NUM and
            VESSEL_NAME.
        Lowell_SN_2: first two characters of the Lowell serial number,
            inserted into the file name when the serial number there has only
            two characters.
        similarity: minimum ``zl.str_similarity_ratio`` needed to accept a
            vessel-name match when looking up VP_NUM.
        mindepth: a file with no depth above this value is a test file (its
            vessel number is 99).

    Side effects:
        Writes one reformatted file per accepted input file plus
        ``items_number.txt`` (number of accepted files per boat and in total)
        in *output_dir*.
    """
    # Read the vessel numbers and the VP_NUM table.
    telemetrystatus_df = read_telemetrystatus(telemetry_status_file)
    raw_data_name_df = pd.read_csv(raw_data_name_file, sep='\t')
    # Table used to count the files per boat; the last row holds the total.
    total_df = pd.concat([
        telemetrystatus_df.loc[:, ['Boat']],
        pd.DataFrame(data=[['Total']], columns=['Boat'])
    ], ignore_index=True)
    total_df.insert(1, 'file_total', 0)

    # All '.csv' files under the input folder.
    file_lists = [file for file in zl.list_all_files(input_dir)
                  if file.endswith('.csv')]

    fixed_header_keys = ['Date Format', 'Time Format', 'Temperature', 'Depth']
    fixed_header_values = ['YYYY-MM-DD', 'HH24:MI:SS', 'C', 'm']
    fixed_keys_lower = [key.lower() for key in fixed_header_keys]
    data_columns = ['HEADING', 'Datet(GMT)', 'Lat', 'Lon', 'Temperature(C)', 'Depth(m)']

    # Check every file and save the repaired version under output_dir.
    for file in file_lists:
        fpath, fname = os.path.split(file)
        # If the serial number in the file name has only 2 characters, make it 4.
        if len(fname.split('_')[1]) == 2:
            new_fname = fname[:3] + Lowell_SN_2 + fname[3:]
        else:
            new_fname = fname

        # Read header and data.
        try:
            df_head = zl.nrows_len_to(file, 2, name=['key', 'value'])
            df = zl.skip_len_to(file, 2)
        except Exception:
            print("unvaluable file:" + file)
            continue
        if df.empty:
            print("unvaluable file:" + file)
            continue
        df_head = df_head.reset_index(drop=True)

        # The standard data have 6 columns; some files lack the "DATA" marker
        # in the first column.
        if df.shape[1] == 5:
            df.insert(0, 'HEADING', 'DATA')
        df.columns = data_columns
        # Normalise lat/lon (e.g. 00000.0000w -> 0000.0000) with four
        # decimals; depth and temperature keep two decimals.
        df['Lat'] = df['Lat'].map(
            lambda x: '{0:.4f}'.format(float(format_lat_lon(x))))
        df['Lon'] = df['Lon'].map(
            lambda x: '{0:.4f}'.format(float(format_lat_lon(x))))
        df['Depth(m)'] = df['Depth(m)'].map(lambda x: '{0:.2f}'.format(float(x)))
        df['Temperature(C)'] = df['Temperature(C)'].map(
            lambda x: '{0:.2f}'.format(float(x)))

        # Count depths > mindepth (stop after six); zero means test data.
        deep_count = 0
        for depth in df['Depth(m)']:
            deep_count += int(float(depth) > mindepth)
            if deep_count > 5:
                break

        # The vessel name is the directory that holds the file.
        vessel_name = os.path.basename(os.path.normpath(fpath))

        # Check and fix the vessel number; test data get the number 99.
        keys_lower = df_head['key'].astype(str).str.lower()
        number_rows = df_head.index[keys_lower == 'vessel number']
        if len(number_rows):
            number_row = number_rows[0]
            if deep_count != 0:
                # Look the boat up by its name (comparing the name with the
                # 'Vessel#' column could never match).
                same_boat = telemetrystatus_df['Boat'].str.lower() == vessel_name.lower()
                if same_boat.any():
                    df_head.at[number_row, 'value'] = str(
                        telemetrystatus_df.loc[same_boat, 'Vessel#'].iloc[0])
            else:
                df_head.at[number_row, 'value'] = '99'
            is_test_file = df_head.at[number_row, 'value'] == '99'
        else:
            is_test_file = deep_count == 0
        if is_test_file:
            print("test file:" + file)
            continue

        # Put the four fixed rows at the end of the header with their
        # canonical values, replacing whatever the file had for them.
        is_fixed = keys_lower.isin(fixed_keys_lower)
        df_head = pd.concat([
            df_head[~is_fixed],
            pd.DataFrame(data=list(zip(fixed_header_keys, fixed_header_values)),
                         columns=['key', 'value'])
        ], ignore_index=True)

        # Fix the vessel name and serial number, inserting them when missing.
        keys_lower = df_head['key'].astype(str).str.lower()
        name_rows = keys_lower == 'vessel name'
        serial_rows = keys_lower == 'serial number'
        df_head.loc[name_rows, 'value'] = vessel_name
        for row in df_head.index[serial_rows]:
            df_head.at[row, 'value'] = df_head.at[row, 'value'].replace(':', '')
        if not serial_rows.any():
            df_head = pd.concat([
                df_head[:1],
                pd.DataFrame(data=[['Serial Number', new_fname.split('_')[1]]],
                             columns=['key', 'value']),
                df_head[1:]
            ], ignore_index=True)
        if not name_rows.any():
            df_head = pd.concat([
                df_head[:2],
                pd.DataFrame(data=[['Vessel Name', vessel_name]],
                             columns=['key', 'value']),
                df_head[2:]
            ], ignore_index=True)

        # Look up VP_NUM through the best vessel-name match above `similarity`.
        best_ratio, best_row = similarity, None
        for row in raw_data_name_df.index:
            ratio = zl.str_similarity_ratio(
                vessel_name.lower(),
                raw_data_name_df.at[row, 'VESSEL_NAME'].lower())
            if ratio > best_ratio:
                best_ratio, best_row = ratio, row
        if best_row is None:
            # Without a match there is no trustworthy VP_NUM; do not reuse
            # the one found for a previous file.
            print('NOTE: no VP_NUM match for ' + file)
            continue

        # VP_NUM goes right after "Vessel Number"; when that row is missing
        # it goes right before "Vessel Name", which always exists here.
        keys_lower = df_head['key'].astype(str).str.lower()
        number_rows = df_head.index[keys_lower == 'vessel number']
        if len(number_rows):
            vp_position = int(number_rows[0]) + 1
        else:
            vp_position = int(df_head.index[keys_lower == 'vessel name'][0])
        df_head = pd.concat([
            df_head[:vp_position],
            pd.DataFrame(data=[['VP_NUM', raw_data_name_df.at[best_row, 'VP_NUM']]],
                         columns=['key', 'value']),
            df_head[vp_position:]
        ], ignore_index=True)

        # Write the header, then append the data table to the same file.
        output_path = fpath.replace(input_dir, output_dir)
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        out_file = os.path.join(output_path, new_fname)
        df_head.to_csv(out_file, index=0, header=0)
        df.to_csv(out_file, index=0, mode='a')

        # Count the file for its boat once it has been written.
        same_boat = total_df['Boat'].str.lower() == vessel_name.lower()
        total_df.loc[same_boat, 'file_total'] += 1

    # Calculate the total over all boats and save the table.
    try:
        last_row = len(total_df) - 1
        total_df.loc[last_row, 'file_total'] += total_df['file_total'].iloc[:-1].sum()
        total_df.to_csv(os.path.join(output_dir, 'items_number.txt'), index=0)
    except Exception:
        print("no valuable file!")