def netcdf4_netcdf3_conv(chem):
    """Convert ``netcdf4/tno_ship_<chem>.nc`` to NETCDF3_CLASSIC.

    Every dimension, variable and variable attribute of the input file is
    copied to ``netcdf3/tno_ship_<chem>.nc``; the output file format is
    printed at the end.

    Args:
        chem: Species name inserted into both file names.
    """
    # input file
    dsin = ds("netcdf4/tno_ship_" + chem + ".nc")
    try:
        # output file
        dsout = ds("netcdf3/tno_ship_" + chem + ".nc", "w",
                   format="NETCDF3_CLASSIC")
        try:
            # Copy dimensions (unlimited ones stay unlimited).
            for dname, the_dim in dsin.dimensions.items():
                print("%s %d" % (dname, len(the_dim)))
                dsout.createDimension(
                    dname, len(the_dim) if not the_dim.isunlimited() else None)

            # Copy variables
            for v_name, varin in dsin.variables.items():
                attrs = {k: varin.getncattr(k) for k in varin.ncattrs()}
                # _FillValue can only be supplied when the variable is
                # created, so it is split off from the other attributes.
                fill_value = attrs.pop("_FillValue", None)
                outVar = dsout.createVariable(
                    v_name, varin.datatype, varin.dimensions,
                    fill_value=fill_value)
                # Copy variable attributes
                outVar.setncatts(attrs)
                outVar[:] = varin[:]

            # Read the format while the file is still open.
            out_format = dsout.file_format
        finally:
            # close the output file
            dsout.close()
    finally:
        # close the input file
        dsin.close()

    # Check netCDF version
    print(out_format)
def avgDataFilesGlobal(filedir, var, filetype, depth, unit_conv = -1, num_files = 10):
    """Average ``var`` over every ``*<filetype>*`` file in ``filedir``.

    Args:
        filedir: Directory searched with ``glob``; if it contains ``'aqua'``
            both results are rolled by half of axis 1.
        var: Name of the variable to average.
        filetype: ``'aijpc'`` (area read from ``axyp``) or ``'oijlpc'``
            (area read from ``oxyp3`` at ``depth``).
        depth: Index applied to the first axis of the data, or ``None`` to
            take the whole array.
        unit_conv: Factor applied to the summed data.
            NOTE(review): the default of -1 flips the sign - confirm intended.
        num_files: Divisor used for the mean (not the number of files found).

    Returns:
        ``(arr_avg, area_arr)``. ``area_arr`` comes from the last file read
        and is zeroed wherever that file's data is masked.

    Raises:
        ValueError: for an unknown ``filetype`` or when no file matches.
    """
    if filetype not in ('aijpc', 'oijlpc'):
        raise ValueError("Unsupported filetype: {0!r}".format(filetype))

    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        raise ValueError(
            "No '{0}' files found in {1!r}".format(filetype, filedir))

    arr_tot = 0
    for filename in results:
        # Read-only access is enough; always release the handle.
        nc_i = ds(filename, 'r')
        try:
            if depth is None:
                arr = nc_i[var][:]
            else:
                arr = nc_i[var][:][depth]

            if filetype == 'aijpc':
                area_arr = nc_i['axyp'][:]
            else:
                area_arr = nc_i['oxyp3'][:][depth]
        finally:
            nc_i.close()

        # Set area to zero in cells that have no value, excluding them from the average
        area_arr[arr.mask] = 0
        if np.where(area_arr == 0)[0].size != 0:
            print('MASK CHECK', np.where(area_arr == 0)[0].size)

        arr_tot = arr_tot + arr

    arr_avg = (arr_tot * unit_conv) / num_files

    if 'aqua' in filedir:
        # Roll data and area together so masked cells keep their coordinates;
        # rolling is needed for the side and substellar averages.
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)
        area_arr = np.roll(area_arr, (area_arr.shape[1]) // 2, axis=1)

    return arr_avg, area_arr
def getPlanAlbFromSol(col, filetype = 'aijpc', num_files = 10):
    """Compute the planetary albedo [%] from TOA shortwave fluxes.

    For each matching file the albedo is
    ``(incsw_toa - srnf_toa) / incsw_toa * 100``; the per-file results are
    summed and divided by ``num_files``.

    Args:
        col: Mapping providing ``'filedir'`` and ``'title'``.
        filetype: Substring used to select files inside ``filedir``.
        num_files: Divisor used for the mean.

    Returns:
        ``(arr_avg, area_arr, plot_row, title)``. ``area_arr`` is ``axyp``
        from the last file read, zeroed where that file's albedo is masked.
        ``plot_row`` uses the module-level ``lat`` and ``lon``.

    Raises:
        ValueError: when no file matches.
    """
    filedir = col['filedir']
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        raise ValueError(
            "No '{0}' files found in {1!r}".format(filetype, filedir))

    arr_tot = 0
    for filename in results:
        nc_i = ds(filename, 'r')
        try:
            net_i = nc_i['srnf_toa'][:]
            inc_i = nc_i['incsw_toa'][:]
            area_arr = nc_i['axyp'][:]
        finally:
            nc_i.close()

        out_i = inc_i - net_i
        albedo_i = (out_i / inc_i) * 100
        arr_tot = arr_tot + albedo_i

        # Exclude cells without an albedo value from area weighting.
        area_arr[albedo_i.mask] = 0
        print(np.where(area_arr == 0)[0].size)

    arr_avg = arr_tot / num_files

    if 'aqua' in filedir:
        # Roll data and area together so masked cells keep their coordinates;
        # rolling is needed for the side and substellar averages.
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)
        area_arr = np.roll(area_arr, (area_arr.shape[1]) // 2, axis=1)

    plot_row = {'var':'plan_alb_calc',
                'ylabel':'Calculated \n Planetary \n Albedo \n [%]',
                'title':'Calculated Planetary Albedo',
                'units':'[%]',
                'lat':lat,
                'lon':lon}
    title = col['title']
    return arr_avg, area_arr, plot_row, title
def avgDataFilesLatLon(filedir, var, num_files, filetype, unit_conv, depth, avg_coord):
    """Return the area-weighted mean of ``var`` along one horizontal axis.

    Args:
        filedir: Directory searched with ``glob``; if it contains ``'aqua'``
            the averaged data is rolled by half of axis 1.
        var: Name of the variable to average.
        num_files: Divisor used for the mean over files.
        filetype: ``'aijpc'`` (area from ``axyp``) or ``'oijlpc'`` (area from
            ``oxyp3`` at ``depth``).
        unit_conv: Factor applied to the summed data.
        depth: Index applied to the first axis, or ``None`` for the whole array.
        avg_coord: ``'lat'`` sums over axis 1, ``'lon'`` sums over axis 0.

    Returns:
        1-D array ``sum(data * area) / sum(area)`` along the chosen axis.

    Raises:
        ValueError: for an unknown ``avg_coord`` or ``filetype``, when no
            file matches, or when the averaged data is 3-D.
    """
    axis_by_coord = {'lat': 1, 'lon': 0}
    if avg_coord not in axis_by_coord:
        raise ValueError(
            "avg_coord must be 'lat' or 'lon', got {0!r}".format(avg_coord))
    avg_axis = axis_by_coord[avg_coord]

    if filetype not in ('aijpc', 'oijlpc'):
        raise ValueError("Unsupported filetype: {0!r}".format(filetype))

    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        raise ValueError(
            "No '{0}' files found in {1!r}".format(filetype, filedir))

    arr_tot = np.zeros((46, 72))
    for filename in results:
        nc_i = ds(filename, 'r')
        try:
            if filetype == 'aijpc':
                area_arr = nc_i['axyp'][:]
            else:
                area_arr = nc_i['oxyp3'][:][depth]

            if depth is None:
                arr = nc_i[var][:]
            else:
                arr = nc_i[var][:][depth]
        finally:
            nc_i.close()

        arr_tot = arr_tot + arr

    arr_avg = (arr_tot * unit_conv) / num_files

    if len(arr_avg.shape) == 3:
        # Raise a real exception instance; raising a tuple is a TypeError
        # on Python 3 and loses the message on Python 2.
        raise ValueError(
            "This array is 3D, so the axes you are averaging over are invalid."
        )

    if 'aqua' in filedir:
        # NOTE(review): only the data is rolled here, not area_arr - confirm
        # the area field is uniform along axis 1.
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)

    avg_arr = np.sum(arr_avg * area_arr, axis=avg_axis) / np.sum(area_arr, axis=avg_axis)
    return avg_arr
def iceGrowth(filedir, filename1, filename2):
    """Plot ice thickness and net radiation of two files in a 2x2 grid.

    The left column shows ``ZSI`` and the right column ``net_rad_planet``;
    the top row is ``filename1`` and the bottom row ``filename2``. Each
    column shares one colour scale.

    Args:
        filedir: Prefix concatenated directly with each file name.
        filename1: First file (top row).
        filename2: Second file (bottom row).
    """
    def readFields(path):
        """Return ``(ZSI, net_rad_planet)`` from *path* and close the file."""
        nc = ds(path, 'r')
        try:
            return nc['ZSI'][:], nc['net_rad_planet'][:]
        finally:
            nc.close()

    zsi1, net_rad1 = readFields(filedir + filename1)
    zsi2, net_rad2 = readFields(filedir + filename2)

    def getScale(arr1, arr2, div=True):
        """Return ``(min, max)`` limits shared by both arrays.

        With ``div`` the limits are symmetric about zero; otherwise the
        minimum is the smaller of the two absolute minima.
        """
        arr1_max = np.max(np.abs(arr1))
        arr2_max = np.max(np.abs(arr2))
        tot_max = max(arr1_max, arr2_max)
        if div:
            tot_min = tot_max * -1
        else:
            arr1_min = np.min(np.abs(arr1))
            arr2_min = np.min(np.abs(arr2))
            tot_min = min(arr1_min, arr2_min)
        return tot_min, tot_max

    fig, axes = plt.subplots(2, 2)

    # Ice thickness: both panels run from 0 to the common maximum.
    _, zsi_max = getScale(zsi1, zsi2)
    ax1 = axes[0, 0]
    ax1.set_title('Ice Thickness Growth [m]')
    im1 = ax1.imshow(zsi1, cmap='Blues', vmin=0, vmax=zsi_max)
    fig.colorbar(im1, ax=ax1)

    ax2 = axes[1, 0]
    im2 = ax2.imshow(zsi2, cmap='Blues', vmin=0, vmax=zsi_max)
    fig.colorbar(im2, ax=ax2)

    # Net radiation: symmetric diverging scale.
    rad_min, rad_max = getScale(net_rad1, net_rad2)
    ax3 = axes[0, 1]
    ax3.set_title('Net Radiation [Wm$^{-2}$]')
    im3 = ax3.imshow(net_rad1, cmap='seismic', vmin=rad_min, vmax=rad_max)
    fig.colorbar(im3, ax=ax3)

    ax4 = axes[1, 1]
    im4 = ax4.imshow(net_rad2, cmap='seismic', vmin=rad_min, vmax=rad_max)
    fig.colorbar(im4, ax=ax4)

    plt.tight_layout()
    plt.show()
def openNC(fdir, fname):
    """
    Open ``<input_files>/<fdir>/<fname>`` and return the Dataset.

    The dataset is printed, followed by a confirmation message. It is
    returned open (mode 'r+'); the caller is responsible for closing it.
    """
    input_root = '/home/haynes13/code/python/input_files/'
    # ssc = substellar continent
    dataset_path = input_root + fdir + '/' + fname
    dataset = ds(dataset_path, 'r+', format='NETCDF4')
    print(dataset)
    print('Dataset Loaded Successfully.')
    return dataset
def oceanPotTemp(filename):
    """Print the mean of ``pot_temp`` for each entry along its first axis.

    Args:
        filename: Path of the netCDF file to read.
    """
    # Read-only access is enough; release the handle once the data is loaded.
    nc = ds(filename, 'r')
    try:
        pot_temp = nc['pot_temp'][:]
    finally:
        nc.close()

    avg_pot_temp = np.array([])
    for o_layer in pot_temp:
        avg_pot_temp = np.append(avg_pot_temp, np.mean(o_layer))

    print('Avg Pot Temp (Celsius)')
    print(avg_pot_temp)
    return
def readdata(self):
    """Load coordinates, time and the regional data subset from the file.

    Reads ``<dataloc><datastr>_0.50deg_reg_1950-1964_v14.0.nc`` and sets
    ``full`` (the open dataset), ``lat``, ``longi``, ``lat_index``,
    ``longi_index``, ``time`` (the netCDF variable), ``jdstart`` and ``vari``.
    The dataset is opened once and kept open in ``self.full`` because
    ``self.time`` is a live variable of it.
    """
    from netCDF4 import Dataset as ds
    from backend import varfuncs as vf

    fullpath = self.dataloc + self.datastr + '_0.50deg_reg_1950-1964_v14.0.nc'

    # One handle serves every read below.
    self.full = ds(fullpath, 'r')
    variables = self.full.variables

    # add relevant variables to structure
    self.lat = variables['latitude'][:]
    self.longi = variables['longitude'][:]

    # reduce the data to the geographic region of relevance
    (self.lat_index, lat_reduced) = vf.condi_ind(self.lat, self.lat_rng)
    (self.longi_index, longi_reduced) = vf.condi_ind(self.longi, self.longi_rng)

    self.time = variables['time']
    # convert the dates into Julian day
    self.jdstart = vf.conv2jd(self.time.units)

    # reduce the main data matrix
    self.vari = variables[self.datastr][:, self.lat_index, self.longi_index]
def avgDataFiles(filedir, var, num_files=10):
    """Average ``var`` over every ``*aijpc*`` file found in ``filedir``.

    Args:
        filedir: Directory searched with ``glob``; if it contains ``'aqua'``
            the result is rolled by half of axis 1.
        var: Name of the variable to average.
        num_files: Divisor used for the mean (not the number of files found).

    Returns:
        The summed data divided by ``num_files``. The accumulator starts as
        a (46, 72) array of zeros, which is also what comes back (divided by
        ``num_files``) when no file matches.
    """
    results = glob('{0}/*aijpc*'.format(filedir))
    arr_tot = np.zeros((46, 72))

    for filename in results:
        # Read-only access is enough; always release the handle.
        nc_i = ds(filename, 'r')
        try:
            arr_tot = arr_tot + nc_i[var][:]
        finally:
            nc_i.close()

    arr_avg = arr_tot / num_files

    if 'aqua' in filedir:
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)

    return arr_avg
def avgDataFiles3D(filedir, var, num_files, filetype, unit_conv, depth):
    """Average ``var`` over every ``*<filetype>*`` file in ``filedir``.

    Args:
        filedir: Directory searched with ``glob``; if it contains ``'aqua'``
            the result is rolled by half of axis 1.
        var: Name of the variable to average.
        num_files: Divisor used for the mean.
        filetype: Substring used to select files.
        unit_conv: Factor applied to the summed data.
        depth: Index applied to the first axis, or ``None`` for the whole array.

    Returns:
        ``sum * unit_conv / num_files``; the accumulator starts as a
        (46, 72) array of zeros.
    """
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    arr_tot = np.zeros((46, 72))

    for filename in results:
        # Read-only access is enough; always release the handle.
        nc_i = ds(filename, 'r')
        try:
            if depth is None:
                arr = nc_i[var][:]
            else:
                arr = nc_i[var][:][depth]
        finally:
            nc_i.close()
        arr_tot = arr_tot + arr

    arr_avg = (arr_tot * unit_conv) / num_files

    if 'aqua' in filedir:
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)

    return arr_avg
def avgDataFiles(filedir, filetype, var, unit_conv=1, num_files=10):
    """Average a 3-D variable over every ``*<filetype>*`` file in ``filedir``.

    Args:
        filedir: Directory searched with ``glob``.
        filetype: Substring used to select files; if it contains ``'o'`` the
            result is cut to its first five entries along axis 0.
        var: Name of the variable to average.
        unit_conv: Factor applied to the summed data.
        num_files: Divisor used for the mean.

    Returns:
        ``sum * unit_conv / num_files``, rolled by half of axis 2 when
        ``filedir`` contains ``'aqua'``.
    """
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    arr_tot = 0

    for filename in results:
        # Read-only access is enough; always release the handle.
        nc_i = ds(filename, 'r')
        try:
            arr_tot = arr_tot + nc_i[var][:]
        finally:
            nc_i.close()

    arr_avg = (arr_tot * unit_conv) / num_files

    if 'aqua' in filedir:
        # if it's aquaplanet simulation you need to roll so that substell point is in middle
        arr_avg = np.roll(arr_avg, (arr_avg.shape[2]) // 2, axis=2)

    if 'o' in filetype:
        # if it's ocean file, only take the top 5 levels
        arr_avg = arr_avg[:5, :, :]

    return arr_avg
def getHeightFile(filedir, filetype, num_files=10):
    """Return the mean of ``z`` per entry of its first axis.

    ``z`` is summed over all ``*<filetype>*`` files in ``filedir`` and
    divided by ``num_files``. Each first-axis entry is then averaged over
    all of its remaining axes.

    Args:
        filedir: Directory searched with ``glob``.
        filetype: Substring used to select files.
        num_files: Divisor used for the mean over files.

    Returns:
        1-D array with one value per first-axis entry of ``z``.
    """
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    z_tot = 0

    for filename in results:
        # Read-only access is enough; always release the handle.
        nc_i = ds(filename, 'r')
        try:
            z_tot = z_tot + nc_i['z'][:]
        finally:
            nc_i.close()

    z_avg = z_tot / num_files

    if 'aqua' in filedir:
        # if it's aquaplanet simulation you need to roll so that substell point is in middle
        z_avg = np.roll(z_avg, (z_avg.shape[2]) // 2, axis=2)

    z_final = z_avg.reshape((z_avg.shape[0], -1)).mean(axis=1)
    return z_final
def main():
    """Convert column-formatted tagged text into a netCDF sequence file.

    ``INPUT_DATA`` holds one token per line (space-separated columns; column
    0 is the word, column 4 the tag) with blank lines between sentences.
    Each token gets a word id, a tag id and a one-hot capitalisation feature:
    0 = all caps, 1 = initial cap, 2 = some cap, 3 = no cap.

    Reads the module-level ``feat_dim``, ``tagdict``, ``INPUT_DATA``,
    ``OUTPUT_FILE``, ``getwordid``, ``wordcount`` and ``unkwordcount``.
    """
    assert feat_dim == 4
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                if sen:
                    sentences.append(sen)
                    sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # Keep the final sentence when the file does not end with a blank line;
    # its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    # define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            seqname = "%010d" % sen_index
            for toks in seninfo:
                word = toks[0]
                tag = toks[4]
                wordid = getwordid(word.lower())
                tagid = tagdict[tag]

                # Pick the first capitalisation class that applies.
                if word.isupper():
                    cap_class = 0
                elif word[0].isupper():
                    cap_class = 1
                elif any(ch.isupper() for ch in word):
                    cap_class = 2
                else:
                    cap_class = 3
                feats = numpy.zeros(4)
                feats[cap_class] = 1

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, :] = feats
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            # NOTE(review): a bare str assigned to a char slice may be
            # coerced to a single 'S1' element; write one character per cell.
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = list(seqname)
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    print("oov rate: %f" % (float(unkwordcount) / wordcount))
#!/usr/bin/python
#coding=utf-8
# Print the dimensions, variable metadata, sequence tags and sequence
# lengths of a netCDF file (first command-line argument, default train.nc).
from netCDF4 import Dataset as ds
import numpy as np
import sys

if len(sys.argv) <= 1:
    f1 = "train.nc"
else:
    f1 = sys.argv[1]

print("checking netCDF file: " + f1)
nc1 = ds(f1, 'r', format='NETCDF4')
try:
    print("\n-------dimension-------\n")
    for dim in nc1.dimensions:
        print(nc1.dimensions[dim])

    print("\n-------variable metainfo-------\n")
    for var in nc1.variables:
        # name, number of dimensions, shape
        print("%s: %s %s" % (var, nc1.variables[var].ndim, nc1.variables[var].shape))

    print("\n-------seq names-------\n")
    print(nc1.variables["seqTags"][:])

    print("\n-------seq lengths-------\n")
    print(nc1.variables["seqLengths"][:])
finally:
    nc1.close()
def main():
    """Convert ``word/tag`` text under ``WSJDATA_DIR`` into a netCDF file.

    Sub-directories are named by integer id; those below ``fromdataid`` are
    skipped, and those above ``todataid`` are skipped when ``todataid`` is
    positive. Every non-blank line of every file is one sentence of
    space-separated ``word/tag`` tokens.

    Reads the module-level ``WSJDATA_DIR``, ``fromdataid``, ``todataid``,
    ``worddict``, ``tagdict`` and ``OUTPUT_FILE``.
    """
    filename_maxlength = 10
    input_dim = 1
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if dataid > todataid and todataid > 0:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for f in os.listdir(dirpath):
            with open(dirpath + "/" + f, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if line == "":
                        continue
                    toks = line.split(" ")
                    seninfo = []
                    for tok in toks:
                        # NOTE(review): a word that itself contains "/" is
                        # split at its first slash - confirm against the data.
                        ts = tok.split("/")
                        seninfo.append((worddict[ts[0]], tagdict[ts[1]]))
                    sentences.append(seninfo)
                    step_num += len(toks)
    seq_num = len(sentences)

    # define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("numSeqs", seq_num)
        nc.createDimension("numTimesteps", step_num)
        nc.createDimension("inputPattSize", input_dim)
        nc.createDimension("numLabels", output_dim)
        nc.createDimension("maxSeqTagLength", filename_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_filenames = nc.createVariable("seqTags", "c", ("numSeqs", "maxSeqTagLength"))
        ncvar_samplenums = nc.createVariable("seqLengths", "i4", ("numSeqs",))
        ncvar_inputs = nc.createVariable("inputs", "i4", ("numTimesteps",))
        ncvar_outputs = nc.createVariable("targetClasses", "i4", ("numTimesteps",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            seqname = "%010d" % sen_index
            for wordid, tagid in senwords:
                ncvar_inputs[frame_index] = wordid
                ncvar_outputs[frame_index] = tagid
                frame_index += 1
            # NOTE(review): a bare str assigned to a char slice may be
            # coerced to a single 'S1' element; write one character per cell.
            ncvar_filenames[sen_index, 0:filename_maxlength] = list(seqname)
            ncvar_samplenums[sen_index] = len(senwords)
    finally:
        nc.close()
def main():
    """Convert ``word/tag`` text under ``WSJDATA_DIR`` into a netCDF file.

    Sub-directories are named by integer id; those below ``fromdataid`` are
    skipped, and those above ``todataid`` are skipped when ``todataid`` is
    positive. Every non-blank line of every file is one sentence of
    space-separated ``word/tag`` tokens.

    Reads the module-level ``WSJDATA_DIR``, ``fromdataid``, ``todataid``,
    ``worddict``, ``tagdict`` and ``OUTPUT_FILE``.
    """
    filename_maxlength = 10
    input_dim = 1
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if dataid > todataid and todataid > 0:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for f in os.listdir(dirpath):
            with open(dirpath + "/" + f, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if line == "":
                        continue
                    toks = line.split(" ")
                    seninfo = []
                    for tok in toks:
                        # NOTE(review): a word that itself contains "/" is
                        # split at its first slash - confirm against the data.
                        ts = tok.split("/")
                        seninfo.append((worddict[ts[0]], tagdict[ts[1]]))
                    sentences.append(seninfo)
                    step_num += len(toks)
    seq_num = len(sentences)

    # define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("numSeqs", seq_num)
        nc.createDimension("numTimesteps", step_num)
        nc.createDimension("inputPattSize", input_dim)
        nc.createDimension("numLabels", output_dim)
        nc.createDimension("maxSeqTagLength", filename_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_filenames = nc.createVariable("seqTags", "c", ("numSeqs", "maxSeqTagLength"))
        ncvar_samplenums = nc.createVariable("seqLengths", "i4", ("numSeqs",))
        ncvar_inputs = nc.createVariable("inputs", "i4", ("numTimesteps",))
        ncvar_outputs = nc.createVariable("targetClasses", "i4", ("numTimesteps",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            seqname = "%010d" % sen_index
            for wordid, tagid in senwords:
                ncvar_inputs[frame_index] = wordid
                ncvar_outputs[frame_index] = tagid
                frame_index += 1
            # NOTE(review): a bare str assigned to a char slice may be
            # coerced to a single 'S1' element; write one character per cell.
            ncvar_filenames[sen_index, 0:filename_maxlength] = list(seqname)
            ncvar_samplenums[sen_index] = len(senwords)
    finally:
        nc.close()
def main():
    """Convert column-formatted tagged text into a netCDF sequence file.

    ``INPUT_DATA`` holds one token per line (space-separated columns; column
    0 is the word, column 4 the tag) with blank lines between sentences.
    Each token gets five binary features: all-lowercase, all-uppercase,
    first-letter-uppercase, contains ``#``, contains ``-``.

    Reads the module-level ``feat_dim``, ``tagdict``, ``INPUT_DATA``,
    ``OUTPUT_FILE``, ``getwordid``, ``wordcount`` and ``unkwordcount``.
    """
    assert feat_dim == 5
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                if sen:
                    sentences.append(sen)
                    sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # Keep the final sentence when the file does not end with a blank line;
    # its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    # define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            seqname = "%010d" % sen_index
            for toks in seninfo:
                word = toks[0]
                tag = toks[4]
                wordl = word.lower()
                wordid = getwordid(wordl)
                tagid = tagdict[tag]

                w1 = word[0]
                allislower = 1 if word == wordl else 0
                allisupper = 1 if word == word.upper() else 0
                firstisupper = 1 if (w1 == w1.upper() and w1.isalpha()) else 0
                hasnum = 1 if "#" in word else 0
                hashypen = 1 if "-" in word else 0

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = allislower
                ncvar_inputfeats[frame_index, 1] = allisupper
                ncvar_inputfeats[frame_index, 2] = firstisupper
                ncvar_inputfeats[frame_index, 3] = hasnum
                ncvar_inputfeats[frame_index, 4] = hashypen
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            # NOTE(review): a bare str assigned to a char slice may be
            # coerced to a single 'S1' element; write one character per cell.
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = list(seqname)
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    print("oov rate: %f" % (float(unkwordcount) / wordcount))
def main():
    """Write each sentence twice: unchanged, then with random replacements.

    ``DATA_FILE`` holds one space-separated sentence per non-blank line.
    The unchanged copy labels every word 1. In the second copy each word is
    replaced with probability ``replacerate`` by a random id in
    ``[0, maxwordid]`` and labelled 0; untouched words are labelled 1.

    Reads the module-level ``DATA_FILE``, ``output_file``, ``wordid_dict``,
    ``getwordid``, ``replacerate``, ``maxwordid``, ``total_wordnum`` and
    ``unk_wordnum``.
    """
    # get basic information
    seqname_maxlength = 10
    feat_dim = 1
    output_dim = 2

    # First pass: size the dimensions (two sequences per sentence).
    seq_num = 0
    step_num = 0
    with open(DATA_FILE, "r") as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                continue
            step_num += len(line.split(" ")) * 2
            seq_num += 2

    # define netcdf file
    nc = ds(output_file, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        sen_index = 0
        with open(DATA_FILE, "r") as fi:
            for line in fi:
                line = line.strip()
                if line == "":
                    continue
                senwords = line.split(" ")
                sample_num = len(senwords)
                # NOTE(review): both copies share the tag built from the
                # first copy's index - confirm that is intended.
                # NOTE(review): a bare str assigned to a char slice may be
                # coerced to a single 'S1' element; use a character list.
                seqname = list("%010d" % sen_index)

                # all correct sentence
                for senword in senwords:
                    ncvar_inputwords[frame_index] = getwordid(senword)
                    ncvar_inputfeats[frame_index, 0] = 0
                    ncvar_outputlabels[frame_index] = 1
                    frame_index += 1
                ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
                ncvar_seqlengths[sen_index] = sample_num
                sen_index += 1

                # with replaced error sentence
                for senword in senwords:
                    if random.random() < replacerate:
                        # replace with a random word, tag is 0
                        wordid = random.randint(0, maxwordid)
                        tagid = 0
                    else:
                        # original word, tag is 1
                        wordid = getwordid(senword)
                        tagid = 1
                    ncvar_inputwords[frame_index] = wordid
                    ncvar_inputfeats[frame_index, 0] = 0
                    ncvar_outputlabels[frame_index] = tagid
                    frame_index += 1
                ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
                ncvar_seqlengths[sen_index] = sample_num
                sen_index += 1
    finally:
        nc.close()

    print("worddict size: %d" % len(wordid_dict))
    print("total_wordnum: %d" % total_wordnum)
    print("unk_wordnum: %d" % unk_wordnum)
    print("oov rate: %f" % (float(unk_wordnum) / total_wordnum))
# Collect the converted (time, temperature) pair of every TSV row.
for row in tsvreader:
    times.append(convert_time(row[0]))
    temps.append(convert_temp(row[1]))

# Express every timestamp as seconds elapsed since the first sample.
base_time = times[0]
time_values = [(t - base_time).total_seconds() for t in times]
time_units = "seconds since " + base_time.strftime('%Y-%m-%d %H:%M:%S')

dataset = ds(outfile, "w", format='NETCDF4_CLASSIC')

# Unlimited time axis with its coordinate variable.
time_dim = dataset.createDimension("time", None)
time_var = dataset.createVariable("time", np.float64, ("time", ))
time_var[:] = time_values
time_var.units = time_units
time_var.standard_name = "time"
time_var.calendar = "standard"

# Temperature series along the time axis.
temp = dataset.createVariable("temp", np.float32, ("time", ))
temp[:] = temps
temp.var_id = "temp"
temp.long_name = "Temperature of sensor (K)"
temp.units = "K"
def main():
    """Convert ``word/tag`` text under ``WSJDATA_DIR`` into a netCDF file.

    Each token is stored with its word id, tag id and a feature vector of
    length ``feat_dim``: element 0 is 1 when the word contains capital
    letters, and element ``1 + suffix2id`` is 1 for the id of the word's last
    two lower-cased characters (``"none"`` for one-character words).

    Reads the module-level ``WSJDATA_DIR``, ``fromdataid``, ``todataid``,
    ``feat_dim``, ``tagdict``, ``suffix2dict``, ``getwordid``,
    ``OUTPUT_FILE``, ``wn`` and ``unk_wn``.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)
    suffix2_dim = len(suffix2dict)
    assert feat_dim == suffix2_dim + 1

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if dataid > todataid and todataid > 0:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for f in os.listdir(dirpath):
            with open(dirpath + "/" + f, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if line == "":
                        continue
                    toks = line.split(" ")
                    seninfo = []
                    for tok in toks:
                        ts = tok.split("/")
                        word = ts[0]
                        wordl = word.lower()
                        # 1 when the word contains capital letters
                        lower_flag = 0 if word == wordl else 1
                        suffix2 = wordl[-2:] if len(wordl) >= 2 else "none"
                        suffix2id = suffix2dict[suffix2]
                        wordid = getwordid(wordl)
                        tagid = tagdict[ts[1]]
                        seninfo.append((wordid, tagid, lower_flag, suffix2id))
                    sentences.append(seninfo)
                    step_num += len(toks)
    seq_num = len(sentences)

    # define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            seqname = "%010d" % sen_index
            for wordid, tagid, lower_flag, suffix2id in senwords:
                # Build the feature row in memory and write it once.
                feats = numpy.zeros(feat_dim)
                feats[0] = lower_flag
                feats[1 + suffix2id] = 1

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, :] = feats
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            # NOTE(review): a bare str assigned to a char slice may be
            # coerced to a single 'S1' element; write one character per cell.
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = list(seqname)
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        nc.close()

    print("word num: %d" % wn)
    print("unk word num: %d" % unk_wn)
    print("oov rate: %f" % (float(unk_wn) / wn))
def main():
    """Convert column-formatted chunking data into a netCDF sequence file.

    ``INPUT_DATA`` holds one token per line (space-separated columns: word,
    POS, tag) with blank lines between sentences. The tag ``I-LST`` is mapped
    to ``O``. Feature 0 of every token is ``getlowerflag(word)``.

    Reads the module-level ``feat_dim``, ``tagdict``, ``INPUT_DATA``,
    ``OUTPUT_FILE``, ``getwordid``, ``getlowerflag``, ``wordcount`` and
    ``unkwordcount``.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                if sen:
                    sentences.append(sen)
                    sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # Keep the final sentence when the file does not end with a blank line;
    # its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    # define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            seqname = "%010d" % sen_index
            for toks in seninfo:
                word = toks[0]
                tag = toks[2]
                wordid = getwordid(word)
                if tag == "I-LST":
                    tag = "O"
                tagid = tagdict[tag]
                lower_flag = getlowerflag(word)

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = lower_flag
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            # NOTE(review): a bare str assigned to a char slice may be
            # coerced to a single 'S1' element; write one character per cell.
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = list(seqname)
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    print("oov rate: %f" % (float(unkwordcount) / wordcount))
import xray
from monthdelta import monthdelta
from datetime import datetime
from netCDF4 import num2date, date2num

dates = []

############(A) Open original TNO emissions inventory################
#tno_2011 = 'path to the TNO_MACCII' #input file
tno_2011 = '/mnt/raid/wrf-chem/emis/WRF_EMIS_UoM_UEA/preprocessor_script/TNO_MACC/TNO_MACC_III_emissions_2011.nc'
tno = nc.Dataset(tno_2011,'r')

#######################(B)Create new netcdf file#################################
#1) Create an empty NetCDF file
# The mode is the single string 'w'; an extra positional 'r' would be taken
# as the `clobber` argument.
#dataset = ds('Path_directory_to_place_the_file','w', format='NETCDF4_CLASSIC') #output file
dataset = ds('/mnt/raid/wrf-chem/emis/WRF_EMIS_UoM_UEA/preprocessor_script/OUT/CO_tno_out.nc','w', format='NETCDF4_CLASSIC') #output file

#2)Create the dimensions
dataset.createDimension('lat', 672) #number of latitudes
dataset.createDimension('lon', 720) #number of longitudes
dataset.createDimension('time', None) #unlimited

#3)Create variables
######1D variables first######
# Dimension lists are real 1-tuples.
lat = dataset.createVariable('lat',np.float32, ('lat',), fill_value=False)
lon = dataset.createVariable('lon',np.float32, ('lon',), fill_value=False)
time = dataset.createVariable('time',np.float32, ('time',), fill_value=False)

######3D variables (SNAP sectors) ########
def create_new_netcdf_file(file):
    """Create an empty NETCDF4_CLASSIC emissions file and return it open.

    The file gets the dimensions ``lat`` (672), ``lon`` (720) and an
    unlimited ``time``, their coordinate variables, and one
    ``(time, lat, lon)`` variable per SNAP sector, each with ``units`` and
    ``long_name`` attributes.

    Args:
        file: Path of the file to create.

    Returns:
        The open, writable dataset; the caller must close it.
    """
    # The mode is the single string 'w'; the former extra positional 'r'
    # was being taken as the `clobber` argument.
    dataset = ds(file, 'w', format='NETCDF4_CLASSIC')

    #2)Create the dimensions
    dataset.createDimension('lat', 672)  #number of latitudes
    dataset.createDimension('lon', 720)  #number of longitudes
    dataset.createDimension('time', None)

    #3)Create variables and 4) add attributes
    ######1D variables first######
    lat_var = dataset.createVariable('lat', np.float32, ('lat',), fill_value=False)
    lon_var = dataset.createVariable('lon', np.float32, ('lon',), fill_value=False)
    time_var = dataset.createVariable('time', np.float32, ('time',), fill_value=False)

    ######3D variables (SNAP sectors) ########
    # (name, dtype, long_name). NOTE(review): 'pow' is float64 while every
    # other sector is float32 - kept as in the original; confirm intended.
    sectors = (
        ('pow', np.float64, 'Power generation'),
        ('res', np.float32, 'Residential, comercial and other combustion'),
        ('inc', np.float32, 'Industrial combustion'),
        ('pei', np.float32, 'Processed emission industrial'),
        ('exf', np.float32, 'Extraction and distribution of fossil fuels'),
        ('sol', np.float32, 'Solvent use'),
        ('tra1', np.float32, 'Road transport, gasoline'),
        ('tra2', np.float32, 'Road transport, diesel'),
        ('tra3', np.float32, 'Road trasnport, LPG'),
        ('tra4', np.float32, 'Road trasnport, non-exhaust, volatilisation'),
        ('tra5', np.float32, 'Road transport, non-exhaust, wear'),
        ('nrt', np.float32, 'Non-road transport'),
        ('was', np.float32, 'Waste tratment and disposal'),
        ('agr', np.float32, 'Agriculture'),
    )
    sector_vars = [
        (dataset.createVariable(name, dtype, ('time', 'lat', 'lon'), fill_value=False),
         long_name)
        for name, dtype, long_name in sectors
    ]

    lat_var.units = 'degrees_north'
    lat_var.long_name = 'latitude'
    lon_var.units = 'degrees_east'
    lon_var.long_name = 'longitude'
    time_var.units = 'days since 1900-01-01 00:00'
    time_var.calendar = 'gregorian'
    time_var.long_name = 'Time'

    for sector_var, long_name in sector_vars:
        sector_var.units = 'Kg yr-1'
        sector_var.long_name = long_name

    return dataset
def append_lat_lon_data(lat_tno, long_tno, file):
    """Write the latitude and longitude values into ``file``'s 'lat'/'lon'."""
    for var_name, values in (('lat', lat_tno), ('lon', long_tno)):
        file[var_name][:] = values


# append to the data files
for _out_ds in (ds_bc1, ds_ec25, ds_ec10, ds_oc25, ds_oc10, ds_pm25, ds_pm10):
    append_lat_lon_data(latitude_tno, longitude_tno, _out_ds)

#2)Extract PM2_5 and PM10 using the NETCDF4 library########
tno_net = ds(tno_2011)
_tno_vars = tno_net.variables
emis_cat_index = _tno_vars['emission_category_index'][:]
lat_index = _tno_vars['latitude_index'][:]
lon_index = _tno_vars['longitude_index'][:]
pm25_data = _tno_vars['pm2_5'][:]
pm10_data = _tno_vars['pm10'][:]

### get the country ID info from NETCDF4 library
country_index = _tno_vars['country_index'][:]
country_id = _tno_vars['country_id'][:]

#3)Loop through every emission category (emiss_cat) and pick the emissions values by sector, latitude, and longitude
##### create a list of the sectors
## NOTE: this must be the same order as the sectors in the source file!!!!!!!
##### i) Create a 4D array of zeros
pm25_arrays = np.zeros(shape=(13, 12, 672, 720))
def main():
    """Write every sentence with randomly replaced words to a netCDF file.

    ``DATA_FILE`` holds one space-separated sentence per non-blank line.
    Each word is replaced with probability ``replacerate`` by a random id in
    ``[0, maxwordid]`` and labelled 0; untouched words are labelled 1.

    Reads the module-level ``DATA_FILE``, ``output_file``, ``wordid_dict``,
    ``getwordid``, ``replacerate``, ``maxwordid``, ``total_wordnum`` and
    ``unk_wordnum``.
    """
    # get basic information
    seqname_maxlength = 10
    feat_dim = 1
    output_dim = 2

    # First pass: size the dimensions.
    seq_num = 0
    step_num = 0
    with open(DATA_FILE, "r") as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                continue
            step_num += len(line.split(" "))
            seq_num += 1

    # define netcdf file
    nc = ds(output_file, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        # One-dimensional variables get a real 1-tuple of dimension names.
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        sen_index = 0
        with open(DATA_FILE, "r") as fi:
            for line in fi:
                line = line.strip()
                if line == "":
                    continue
                senwords = line.split(" ")
                seqname = "%010d" % sen_index

                for senword in senwords:
                    if random.random() < replacerate:
                        # replace with a random word, tag is 0
                        wordid = random.randint(0, maxwordid)
                        tagid = 0
                    else:
                        # original word, tag is 1
                        wordid = getwordid(senword)
                        tagid = 1
                    ncvar_inputwords[frame_index] = wordid
                    ncvar_inputfeats[frame_index, 0] = 0
                    ncvar_outputlabels[frame_index] = tagid
                    frame_index += 1

                # NOTE(review): a bare str assigned to a char slice may be
                # coerced to a single 'S1' element; write one char per cell.
                ncvar_seqnames[sen_index, 0:seqname_maxlength] = list(seqname)
                ncvar_seqlengths[sen_index] = len(senwords)
                sen_index += 1
    finally:
        nc.close()

    print("worddict size: %d" % len(wordid_dict))
    print("total_wordnum: %d" % total_wordnum)
    print("unk_wordnum: %d" % unk_wordnum)
    print("oov rate: %f" % (float(unk_wordnum) / total_wordnum))
import numpy as np
import xray
from monthdelta import monthdelta
from datetime import datetime
from netCDF4 import num2date, date2num

dates = []

############(A) Open original TNO emissions inventory################
tno_2011 = 'path to the TNO_MACCII' #input file
tno = nc.Dataset(tno_2011, 'r')

#######################(B)Create new netcdf file#################################
#1) Create an empty NetCDF file
# The mode is the single string 'w'; an extra positional 'r' would be taken
# as the `clobber` argument.
dataset = ds('Path_directory_to_place_the_file', 'w', format='NETCDF4_CLASSIC') #output file

#2)Create the dimensions
dataset.createDimension('lat', 672) #number of latitudes
dataset.createDimension('lon', 720) #number of longitudes
dataset.createDimension('time', None) #unlimited

#3)Create variables
######1D variables first######
# Dimension lists are real 1-tuples.
lat = dataset.createVariable('lat', np.float32, ('lat',), fill_value=False)
lon = dataset.createVariable('lon', np.float32, ('lon',), fill_value=False)
time = dataset.createVariable('time', np.float32, ('time',), fill_value=False)
def main():
    """Convert the ``word/TAG`` files under WSJDATA_DIR into a NETCDF4 file.

    Sub-directories of WSJDATA_DIR are named by an integer id; those below
    ``fromdataid`` or (when ``todataid > 0``) above ``todataid`` are skipped.
    Every non-blank line of every file is one sentence of space-separated
    ``word/TAG`` tokens.

    For each word the lower-cased form is looked up with ``getwordid``, the
    tag with ``tagdict``, and three 0/1 features are written: word equals its
    lower-cased form, word equals its upper-cased form, first character is an
    upper-case letter.  Word-count statistics are printed at the end.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if todataid > 0 and dataid > todataid:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for fname in os.listdir(dirpath):
            with open(dirpath + "/" + fname, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if not line:
                        continue
                    toks = line.split(" ")
                    seninfo = []
                    for tok in toks:
                        # Split on the LAST slash only: the word itself may
                        # contain "/" and the tag is always the final piece.
                        ts = tok.rsplit("/", 1)
                        seninfo.append((ts[0], ts[1]))
                    sentences.append(seninfo)
                    step_num += len(toks)
    seq_num = len(sentences)

    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            for word, tag in senwords:
                wordl = word.lower()
                wordid = getwordid(wordl)
                tagid = tagdict[tag]
                allislower = 1 if word == wordl else 0
                allisupper = 1 if word == word.upper() else 0
                # word[:1] instead of word[0] so an empty word cannot raise.
                w1 = word[:1]
                firstisupper = 1 if (w1 == w1.upper() and w1.isalpha()) else 0
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = allislower
                ncvar_inputfeats[frame_index, 1] = allisupper
                ncvar_inputfeats[frame_index, 2] = firstisupper
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        # Close the data set even if a lookup or write fails part-way through.
        nc.close()

    print("word num: %d" % wn)
    print("unk word num: %d" % unk_wn)
    # Guard against an empty corpus (no words counted).
    oov_rate = float(unk_wn) / wn if wn else 0.0
    print("oov rate: %f" % oov_rate)
def main():
    """Convert the column-formatted INPUT_DATA file into a NETCDF4 file.

    INPUT_DATA holds one token per line as space-separated columns (column 0
    is the word, column 2 the tag); a blank line ends a sentence.  For each
    token the word id from ``getwordid``, the flag from ``getlowerflag`` and
    the tag id from ``tagdict`` are written.  The tag "I-LST" is mapped to
    "O" before lookup.  Word-count statistics are printed at the end.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if not line:
                if sen:
                    sentences.append(sen)
                    sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # Flush the last sentence when the file does not end with a blank line;
    # its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            for toks in seninfo:
                word = toks[0]
                tag = toks[2]
                wordid = getwordid(word)
                if tag == "I-LST":
                    tag = "O"
                tagid = tagdict[tag]
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = getlowerflag(word)
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        # Close the data set even if a lookup or write fails part-way through.
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    # Guard against an empty input file (no words counted).
    oov_rate = float(unkwordcount) / wordcount if wordcount else 0.0
    print("oov rate: %f" % oov_rate)
def create_ship_emissions(chem):
    """Create ``netcdf4/tno_ship_<chem>.nc`` holding an all-zero 'ships' field.

    The NETCDF4_CLASSIC file gets fixed 'lat' (672) and 'lon' (720)
    dimensions, an unlimited 'time' dimension, the matching coordinate
    variables and a 3D 'ships' variable filled with zeros for 12 time steps.

    Args:
        chem: species name inserted into the output file name.
    """
    n_lat = 672     # number of latitudes
    n_lon = 720     # number of longitudes
    n_months = 12

    # 1) Create an empty NetCDF file.  The former stray third positional
    # argument 'r' only landed on `clobber` (truthy, i.e. the default).
    dataset = ds('netcdf4/tno_ship_' + chem + '.nc', 'w', format='NETCDF4_CLASSIC')

    # 2) Create the dimensions.
    dataset.createDimension('lat', n_lat)
    dataset.createDimension('lon', n_lon)
    dataset.createDimension('time', None)

    # 3) Create the variables: 1D coordinates first, then the 3D field.
    lat_var = dataset.createVariable('lat', np.float32, ('lat',), fill_value=False)
    lon_var = dataset.createVariable('lon', np.float32, ('lon',), fill_value=False)
    time_var = dataset.createVariable('time', np.float32, ('time',), fill_value=False)
    ships_var = dataset.createVariable('ships', np.float32, ('time', 'lat', 'lon'),
                                       fill_value=False)

    # 4) Add attributes to the 1D and 3D variables.
    lat_var.units = 'degrees_north'
    lat_var.long_name = 'latitude'
    lon_var.units = 'degrees_east'
    lon_var.long_name = 'longitude'
    time_var.units = 'days since 1900-01-01 00:00'
    time_var.calendar = 'gregorian'
    time_var.long_name = 'Time'
    ships_var.units = 'Tg?'
    ships_var.long_name = 'International shipping'

    # Cell-centre coordinates.  These reproduce the former hard-coded lists
    # exactly: 30.03125 .. 71.96875 in steps of 0.0625 and
    # -29.9375 .. 59.9375 in steps of 0.125.  All values are multiples of
    # 1/32, so the arithmetic is exact in binary floating point.
    lat_values = 30.03125 + 0.0625 * np.arange(n_lat)
    lon_values = -29.9375 + 0.125 * np.arange(n_lon)

    # Write each variable once (the old code wrote 'ships' and then copied it
    # back onto itself).
    ships_var[:] = np.zeros(shape=(n_months, n_lat, n_lon))
    lat_var[:] = lat_values
    lon_var[:] = lon_values

    # First day of each of the 12 months starting at 2000-01-01.
    # NOTE(review): nothing visible in this block writes `dates` to the time
    # variable or closes `dataset`; behaviour is left as found -- confirm
    # whether the remainder of the function was lost.
    dates = [add_months(datetime(2000, 1, 1), n) for n in range(n_months)]
def dealonefraction(fractionid):
    """Write one fraction of DATA_FILE to its own NETCDF4 file.

    With ``fractionid >= 0`` only the lines numbered
    ``[fractionid * prep_fraction_size, (fractionid + 1) * prep_fraction_size)``
    are processed and the output goes to ``output_file + "." + str(fractionid)``;
    a negative ``fractionid`` processes the whole file into ``output_file``.

    Each sentence is written twice under the same sequence tag: once
    unchanged (all labels 1) and once with every word replaced by a random
    word id with probability ``replacerate`` (label 0 for replaced words,
    1 otherwise).  Prints a message and returns without creating a file when
    the fraction contains no sentence.
    """
    seqname_maxlength = 10
    feat_dim = 1
    output_dim = 2  # label 0 = replaced word, label 1 = original word

    if fractionid >= 0:
        start_senid = fractionid * prep_fraction_size
        end_senid = (fractionid + 1) * prep_fraction_size
    else:
        start_senid = -1
        end_senid = -1

    def _sentences():
        # Yield the word list of every non-blank line inside the fraction.
        with open(DATA_FILE, "r") as fi:
            for ln, line in enumerate(fi):
                if start_senid >= 0 and ln < start_senid:
                    continue
                if end_senid >= 0 and ln >= end_senid:
                    break
                line = line.strip()
                if not line:
                    continue
                yield line.split(" ")

    # Pass 1: size the dimensions.  Every sentence is emitted twice below, so
    # both counts are doubled; sizing them for a single copy made the second
    # half of the writes fall outside the fixed dimensions.
    seq_num = 0
    step_num = 0
    for senwords in _sentences():
        step_num += 2 * len(senwords)
        seq_num += 2

    if seq_num == 0:
        print("fraction null: %d" % fractionid)
        return

    if fractionid >= 0:
        poutput_file = output_file + "." + str(fractionid)
    else:
        poutput_file = output_file

    nc = ds(poutput_file, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        sen_index = 0
        for senwords in _sentences():
            # The tag is derived from the index of the first copy and shared
            # by both copies, as before.
            if start_senid >= 0:
                seqname = "%010d" % (sen_index + start_senid)
            else:
                seqname = "%010d" % sen_index
            sample_num = len(senwords)

            # all correct sentence
            for word in senwords:
                ncvar_inputwords[frame_index] = getwordid(word)
                ncvar_inputfeats[frame_index, 0] = 0
                ncvar_outputlabels[frame_index] = 1
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = sample_num
            sen_index += 1

            # with replaced error sentence
            for word in senwords:
                if random.random() < replacerate:
                    # replace with a random word, tag is 0
                    wordid = random.randint(0, maxwordid)
                    tagid = 0
                else:
                    # original word, tag is 1
                    wordid = getwordid(word)
                    tagid = 1
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = 0
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = sample_num
            sen_index += 1
    finally:
        # Close the data set even if a write fails part-way through.
        nc.close()
def dealonefraction(fractionid, WORD_DICT, prep_fraction_size, DATA_FILE,
                    output_file, replacerate):
    """Write one fraction of DATA_FILE, with random word replacement, to NETCDF4.

    Args:
        fractionid: index of the fraction to process; a negative value
            processes the whole file.
        WORD_DICT: argument passed to ``load_lowerworddict`` to load the
            word-id dictionary.
        prep_fraction_size: number of input lines per fraction.
        DATA_FILE: input text file, one space-separated sentence per line.
        output_file: output path; ``"." + str(fractionid)`` is appended when
            ``fractionid >= 0``.
        replacerate: probability of replacing a word by a random word id.

    Replaced words get label 0, kept words label 1; the single input feature
    is always 0.  Prints a message and returns without creating a file when
    the fraction contains no sentence.
    """
    wordid_dict = load_lowerworddict(WORD_DICT)
    maxwordid = len(wordid_dict) - 1

    seqname_maxlength = 10
    feat_dim = 1
    output_dim = 2  # label 0 = replaced word, label 1 = original word

    if fractionid >= 0:
        start_senid = fractionid * prep_fraction_size
        end_senid = (fractionid + 1) * prep_fraction_size
    else:
        start_senid = -1
        end_senid = -1

    def _sentences():
        # Yield the word list of every non-blank line inside the fraction.
        with open(DATA_FILE, "r") as fi:
            for ln, line in enumerate(fi):
                if start_senid >= 0 and ln < start_senid:
                    continue
                if end_senid >= 0 and ln >= end_senid:
                    break
                line = line.strip()
                if not line:
                    continue
                yield line.split(" ")

    # Pass 1: count sentences and words to size the fixed dimensions.
    seq_num = 0
    step_num = 0
    for senwords in _sentences():
        step_num += len(senwords)
        seq_num += 1

    if seq_num == 0:
        print("fraction null: %d" % fractionid)
        return

    if fractionid >= 0:
        poutput_file = output_file + "." + str(fractionid)
    else:
        poutput_file = output_file

    nc = ds(poutput_file, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        # Pass 2: write one frame per word and one entry per sentence.
        frame_index = 0
        for sen_index, senwords in enumerate(_sentences()):
            if start_senid >= 0:
                seqname = "%010d" % (sen_index + start_senid)
            else:
                seqname = "%010d" % sen_index
            for word in senwords:
                if random.random() < replacerate:
                    # replace with a random word, tag is 0
                    wordid = random.randint(0, maxwordid)
                    tagid = 0
                else:
                    # original word, tag is 1
                    wordid = getwordid(word, wordid_dict)
                    tagid = 1
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = 0
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        # Close the data set even if a write fails part-way through.
        nc.close()
def main():
    """Convert the tab-separated files under WSJDATA_DIR into a NETCDF4 file.

    Sub-directories of WSJDATA_DIR are named by an integer id; those below
    ``fromdataid`` or (when ``todataid > 0``) above ``todataid`` are skipped.
    Each file holds one token per line as ``word<TAB>pos``; a blank line ends
    a sentence.

    For every token the id of the lower-cased word (``worddict``), the tag id
    (``tagdict``) and a flag that is 1 when the word contains capital letters
    are written to OUTPUT_FILE.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if todataid > 0 and dataid > todataid:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for fname in os.listdir(dirpath):
            seninfo = []
            with open(dirpath + "/" + fname, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if not line:
                        if seninfo:
                            sentences.append(seninfo)
                            seninfo = []
                        continue
                    toks = line.split("\t")
                    word = toks[0]
                    pos = toks[1]
                    wordl = word.lower()
                    # 1 when the word contains capital letters
                    lower_flag = 0 if word == wordl else 1
                    seninfo.append((worddict[wordl], tagdict[pos], lower_flag))
                    step_num += 1
            # Flush the last sentence of a file that does not end with a blank
            # line; its tokens are already counted in step_num.
            if seninfo:
                sentences.append(seninfo)
    seq_num = len(sentences)
    print("seq_num: %d" % seq_num)
    print("step_num: %d" % step_num)

    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            for wordid, tagid, lower_flag in senwords:
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = lower_flag
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        # Close the data set even if a write fails part-way through.
        nc.close()
#!/usr/bin/python
#coding=utf-8
from netCDF4 import Dataset as ds
import numpy as np
import sys

# Inspect a netCDF file: the path is the first command-line argument and
# falls back to "train.nc" when none is given.
f1 = sys.argv[1] if len(sys.argv) > 1 else "train.nc"
print("checking netCDF file: " + f1)
nc1 = ds(f1, 'r', format='NETCDF4')

# Every dimension, in file order.
print("\n-------dimension-------\n")
for dim_obj in nc1.dimensions.values():
    print(dim_obj)

# One line per variable: "<name>: <ndim> <shape>".
print("\n-------variable metainfo-------\n")
for var_name, var_obj in nc1.variables.items():
    print("%s: %s %s" % (var_name, var_obj.ndim, var_obj.shape))

# Full contents of the two per-sequence variables.
print("\n-------seq names-------\n")
print(nc1.variables["seqTags"][:])
print("\n-------seq lengths-------\n")
print(nc1.variables["seqLengths"][:])
def main():
    """Convert the column-formatted INPUT_DATA file into a NETCDF4 file.

    INPUT_DATA holds one token per line as space-separated columns (column 0
    is the word, column 4 the tag); a blank line ends a sentence.  The
    lower-cased word is looked up with ``getwordid`` and the tag with
    ``tagdict``.

    Exactly one of four capitalisation features is set to 1 per token:
    0 = word is all upper case, 1 = first character is upper case,
    2 = some other character is upper case, 3 = no upper-case character.
    Word-count statistics are printed at the end.
    """
    assert feat_dim == 4
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if not line:
                if sen:
                    sentences.append(sen)
                    sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # Flush the last sentence when the file does not end with a blank line;
    # its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            for toks in seninfo:
                word = toks[0]
                tag = toks[4]
                wordid = getwordid(word.lower())
                tagid = tagdict[tag]

                # Pick the single active capitalisation feature.
                if word.isupper():
                    active = 0
                elif word[0].isupper():
                    active = 1
                elif any(ch.isupper() for ch in word):
                    active = 2
                else:
                    active = 3
                feats = numpy.zeros(4)
                feats[active] = 1

                ncvar_inputwords[frame_index] = wordid
                # Write the whole feature row in one assignment.
                ncvar_inputfeats[frame_index, :] = feats
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        # Close the data set even if a lookup or write fails part-way through.
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    # Guard against an empty input file (no words counted).
    oov_rate = float(unkwordcount) / wordcount if wordcount else 0.0
    print("oov rate: %f" % oov_rate)
def main():
    """Write the word-id sequences of DATA_FILE to a NETCDF4 file.

    DATA_FILE is read twice (one sentence per line, words separated by a
    single space, blank lines skipped): the first pass sizes the fixed
    dimensions, the second pass writes the data to ``output_file``.  For
    every word the id returned by ``getwordid`` is stored both in "inputs"
    and in "targetClasses".  Afterwards the dictionary size and the word /
    unknown-word counters are printed.
    """
    filename_maxlength = 10
    input_dim = len(wordid_dict)
    output_dim = len(wordid_dict)

    # Pass 1: count sentences and words.
    seqs_num = 0
    step_num = 0
    with open(DATA_FILE, "r") as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            step_num += len(line.split(" "))
            seqs_num += 1

    nc = ds(output_file, "w", format="NETCDF4")
    try:
        nc.createDimension("numSeqs", seqs_num)
        nc.createDimension("numTimesteps", step_num)
        nc.createDimension("inputPattSize", input_dim)
        nc.createDimension("maxSeqTagLength", filename_maxlength)
        nc.createDimension("numLabels", output_dim)
        ncvar_filenames = nc.createVariable("seqTags", "c", ("numSeqs", "maxSeqTagLength"))
        ncvar_samplenums = nc.createVariable("seqLengths", "i4", ("numSeqs",))
        ncvar_inputs = nc.createVariable("inputs", "i4", ("numTimesteps",))
        ncvar_outputs = nc.createVariable("targetClasses", "i4", ("numTimesteps",))

        # Pass 2: write one frame per word and one entry per sentence.
        frame_index = 0
        sen_index = 0
        with open(DATA_FILE, "r") as fi:
            for line in fi:
                line = line.strip()
                if not line:
                    continue
                senwords = line.split(" ")
                for word in senwords:
                    wordid = getwordid(word)
                    ncvar_inputs[frame_index] = wordid
                    ncvar_outputs[frame_index] = wordid
                    frame_index += 1
                ncvar_filenames[sen_index, 0:filename_maxlength] = "%010d" % sen_index
                ncvar_samplenums[sen_index] = len(senwords)
                sen_index += 1
    finally:
        # Close the data set even if a write fails part-way through.
        nc.close()

    print("worddict size: %d" % len(wordid_dict))
    print("total_wordnum: %d" % total_wordnum)
    print("unk_wordnum: %d" % unk_wordnum)
    # Guard against an empty input file (no words counted).
    oov_rate = float(unk_wordnum) / total_wordnum if total_wordnum else 0.0
    print("oov rate: %f" % oov_rate)
def main():
    """Write the word-id sequences of DATA_FILE to a NETCDF4 file.

    DATA_FILE is read twice (one sentence per line, words separated by a
    single space, blank lines skipped): the first pass sizes the fixed
    dimensions, the second pass writes the data to ``output_file``.  For
    every word the id returned by ``getwordid`` is stored both in "inputs"
    and in "targetClasses".  Afterwards the dictionary size and the word /
    unknown-word counters are printed.
    """
    filename_maxlength = 10
    input_dim = len(wordid_dict)
    output_dim = len(wordid_dict)

    # Pass 1: count sentences and words.
    seqs_num = 0
    step_num = 0
    with open(DATA_FILE, "r") as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            step_num += len(line.split(" "))
            seqs_num += 1

    nc = ds(output_file, "w", format="NETCDF4")
    try:
        nc.createDimension("numSeqs", seqs_num)
        nc.createDimension("numTimesteps", step_num)
        nc.createDimension("inputPattSize", input_dim)
        nc.createDimension("maxSeqTagLength", filename_maxlength)
        nc.createDimension("numLabels", output_dim)
        ncvar_filenames = nc.createVariable("seqTags", "c", ("numSeqs", "maxSeqTagLength"))
        ncvar_samplenums = nc.createVariable("seqLengths", "i4", ("numSeqs",))
        ncvar_inputs = nc.createVariable("inputs", "i4", ("numTimesteps",))
        ncvar_outputs = nc.createVariable("targetClasses", "i4", ("numTimesteps",))

        # Pass 2: write one frame per word and one entry per sentence.
        frame_index = 0
        sen_index = 0
        with open(DATA_FILE, "r") as fi:
            for line in fi:
                line = line.strip()
                if not line:
                    continue
                senwords = line.split(" ")
                for word in senwords:
                    wordid = getwordid(word)
                    ncvar_inputs[frame_index] = wordid
                    ncvar_outputs[frame_index] = wordid
                    frame_index += 1
                ncvar_filenames[sen_index, 0:filename_maxlength] = "%010d" % sen_index
                ncvar_samplenums[sen_index] = len(senwords)
                sen_index += 1
    finally:
        # Close the data set even if a write fails part-way through.
        nc.close()

    print("worddict size: %d" % len(wordid_dict))
    print("total_wordnum: %d" % total_wordnum)
    print("unk_wordnum: %d" % unk_wordnum)
    # Guard against an empty input file (no words counted).
    oov_rate = float(unk_wordnum) / total_wordnum if total_wordnum else 0.0
    print("oov rate: %f" % oov_rate)
def main():
    """Convert the tab-separated files under WSJDATA_DIR into a NETCDF4 file.

    Sub-directories of WSJDATA_DIR are named by an integer id; those below
    ``fromdataid`` or (when ``todataid > 0``) above ``todataid`` are skipped.
    Each file holds one token per line as ``word<TAB>pos``; a blank line ends
    a sentence.

    For every token the id of the lower-cased word (``worddict``), the tag id
    (``tagdict``) and a flag that is 1 when the word contains capital letters
    are written to OUTPUT_FILE.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if todataid > 0 and dataid > todataid:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for fname in os.listdir(dirpath):
            seninfo = []
            with open(dirpath + "/" + fname, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if not line:
                        if seninfo:
                            sentences.append(seninfo)
                            seninfo = []
                        continue
                    toks = line.split("\t")
                    word = toks[0]
                    pos = toks[1]
                    wordl = word.lower()
                    # 1 when the word contains capital letters
                    lower_flag = 0 if word == wordl else 1
                    seninfo.append((worddict[wordl], tagdict[pos], lower_flag))
                    step_num += 1
            # Flush the last sentence of a file that does not end with a blank
            # line; its tokens are already counted in step_num.
            if seninfo:
                sentences.append(seninfo)
    seq_num = len(sentences)
    print("seq_num: %d" % seq_num)
    print("step_num: %d" % step_num)

    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)
        ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            for wordid, tagid, lower_flag in senwords:
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = lower_flag
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        # Close the data set even if a write fails part-way through.
        nc.close()