Beispiel #1
0
def netcdf4_netcdf3_conv(chem):
    """Copy ``netcdf4/tno_ship_<chem>.nc`` to ``netcdf3/tno_ship_<chem>.nc``.

    The output file is created with ``format="NETCDF3_CLASSIC"``.  Every
    dimension, variable, variable attribute and data array of the input is
    copied.  Each dimension name and length is printed while copying, and
    the output's ``file_format`` is printed at the end.

    Parameters
    ----------
    chem : str
        Name fragment inserted into both the input and the output path.
    """
    # (D) Convert from netCDF4 to netCDF3_Classic
    dsin = ds("netcdf4/tno_ship_" + chem + ".nc")
    try:
        dsout = ds("netcdf3/tno_ship_" + chem + ".nc",
                   "w",
                   format="NETCDF3_CLASSIC")
        try:
            # Copy dimensions; an unlimited dimension stays unlimited (None).
            for dname, the_dim in dsin.dimensions.items():
                print("%s %s" % (dname, len(the_dim)))
                dsout.createDimension(
                    dname,
                    len(the_dim) if not the_dim.isunlimited() else None)

            # Copy variables
            for v_name, varin in dsin.variables.items():
                attrs = {k: varin.getncattr(k) for k in varin.ncattrs()}
                # _FillValue has to be supplied when the variable is created;
                # it cannot be attached afterwards like other attributes.
                fill_value = attrs.pop("_FillValue", None)
                outVar = dsout.createVariable(v_name,
                                              varin.datatype,
                                              varin.dimensions,
                                              fill_value=fill_value)

                # Copy the remaining variable attributes, then the data
                outVar.setncatts(attrs)
                outVar[:] = varin[:]
        finally:
            # close the output file even if copying failed
            dsout.close()
    finally:
        # close the input file
        dsin.close()

    # Check netCDF version
    print(dsout.file_format)
Beispiel #2
0
def avgDataFilesGlobal(filedir, var, filetype, depth, unit_conv = -1, num_files = 10):
    """Average ``var`` over all matching files and return it with a cell-area array.

    Parameters
    ----------
    filedir : str
        Directory searched with the glob pattern ``<filedir>/*<filetype>*``.
        If it contains ``'aqua'`` both results are rolled by half of axis 1.
    var : str
        Name of the variable read from every file.
    filetype : str
        ``'aijpc'`` (area read from ``'axyp'``) or ``'oijlpc'`` (area read
        from ``'oxyp3'``).
    depth : int or None
        Index applied to the first axis of the data (and of ``'oxyp3'``);
        ``None`` keeps the arrays unindexed.
    unit_conv : number
        Factor the summed data is multiplied by before averaging.
    num_files : int
        Divisor of the sum.  NOTE: this is the caller-supplied count, not the
        number of files actually found.

    Returns
    -------
    (arr_avg, area_arr)
        The scaled average and the area array of the last file read, with
        area set to zero wherever that file's data is masked.

    Raises
    ------
    ValueError
        If ``filetype`` is not supported or no file matches the pattern.
    """
    area_names = {'aijpc': 'axyp', 'oijlpc': 'oxyp3'}
    if filetype not in area_names:
        raise ValueError('Unsupported filetype: {0!r}'.format(filetype))

    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        raise ValueError(
            'No files matching {0}/*{1}*'.format(filedir, filetype))

    arr_tot = 0
    area_arr = None
    for filename in results:
        # Nothing is written, so open read-only and always release the handle.
        nc_i = ds(filename, 'r')
        try:
            arr = nc_i[var][:]
            if depth is not None:
                arr = arr[depth]

            area_arr = nc_i[area_names[filetype]][:]
            # Indexing with None would insert a new axis, so only index the
            # ocean area when a depth level was actually requested.
            if filetype == 'oijlpc' and depth is not None:
                area_arr = area_arr[depth]
        finally:
            nc_i.close()

        # Set area to zero in cells that have no value, excluding them from the average
        area_arr[np.ma.getmaskarray(arr)] = 0
        if np.where(area_arr == 0)[0].size != 0:
            print('MASK CHECK', np.where(area_arr == 0)[0].size) # If an array has a mask, check to make sure area array is masked as well.

        arr_tot = arr_tot + arr

    arr_avg = (arr_tot * unit_conv) / num_files
    if 'aqua' in filedir:
        # Roll the data and the area together so masked cells keep their
        # coordinates; needed for the side and substellar averages.
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)
        area_arr = np.roll(area_arr, (area_arr.shape[1]) // 2, axis=1)

    return arr_avg, area_arr
def getPlanAlbFromSol(col, filetype = 'aijpc', num_files = 10):
    """Compute an averaged albedo percentage from net and incident shortwave.

    For every file matching ``<col['filedir']>/*<filetype>*`` the albedo is
    ``(incsw_toa - srnf_toa) / incsw_toa * 100``.  The per-file results are
    summed and divided by ``num_files`` (the caller-supplied count, not the
    number of files found).

    Parameters
    ----------
    col : mapping
        Must provide ``'filedir'`` and ``'title'``.
    filetype : str
        Fragment used in the glob pattern.
    num_files : int
        Divisor of the summed albedo.

    Returns
    -------
    (arr_avg, area_arr, plot_row, title)
        The averaged albedo, the ``'axyp'`` array of the last file with
        masked-albedo cells zeroed, a plot description dict (it references
        the module-level ``lat`` and ``lon``) and ``col['title']``.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    filedir = col['filedir']
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        raise ValueError(
            'No files matching {0}/*{1}*'.format(filedir, filetype))

    arr_tot = 0
    area_arr = None
    for filename in results:
        # Read-only access is enough; close the file as soon as data is read.
        nc_i = ds(filename, 'r')
        try:
            net_i = nc_i['srnf_toa'][:]
            inc_i = nc_i['incsw_toa'][:]
            area_arr = nc_i['axyp'][:]
        finally:
            nc_i.close()

        out_i = inc_i - net_i
        albedo_i = (out_i / inc_i) * 100
        arr_tot = arr_tot + albedo_i

        # Zero the area wherever the albedo has no value.
        area_arr[np.ma.getmaskarray(albedo_i)] = 0
        print(np.where(area_arr == 0)[0].size)

    arr_avg = arr_tot / num_files
    if 'aqua' in filedir:
        # Roll the data and the area together so masked cells keep their
        # coordinates; needed for the side and substellar averages.
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)
        area_arr = np.roll(area_arr, (area_arr.shape[1]) // 2, axis=1)

    plot_row = {'var':'plan_alb_calc',
                'ylabel':'Calculated \n Planetary \n Albedo \n [%]',
                'title':'Calculated Planetary Albedo',
                'units':'[%]',
                'lat':lat,
                'lon':lon}
    title = col['title']
    return arr_avg, area_arr, plot_row, title
def avgDataFilesLatLon(filedir, var, num_files, filetype, unit_conv, depth,
                       avg_coord):
    """Return the area-weighted average of ``var`` along one horizontal axis.

    Parameters
    ----------
    filedir : str
        Directory searched with the glob pattern ``<filedir>/*<filetype>*``.
        If it contains ``'aqua'`` the averaged data is rolled by half of
        axis 1.
    var : str
        Name of the variable read from every file.
    num_files : int
        Divisor of the sum (the caller-supplied count, not the number of
        files found).
    filetype : str
        ``'aijpc'`` (area from ``'axyp'``) or ``'oijlpc'`` (area from
        ``'oxyp3'``).
    unit_conv : number
        Factor applied to the summed data.
    depth : int or None
        Index applied to the first axis of the data (and of ``'oxyp3'``);
        ``None`` keeps the arrays unindexed.
    avg_coord : str
        ``'lat'`` sums over axis 1, ``'lon'`` sums over axis 0.

    Returns
    -------
    array
        ``sum(avg * area, axis) / sum(area, axis)`` using the area array of
        the last file read.

    Raises
    ------
    ValueError
        For an unknown ``avg_coord`` or ``filetype``, when no file matches,
        or when the averaged data is 3-D.
    """
    axis_for = {'lat': 1, 'lon': 0}
    if avg_coord not in axis_for:
        raise ValueError(
            "avg_coord must be 'lat' or 'lon', got {0!r}".format(avg_coord))
    avg_axis = axis_for[avg_coord]

    area_names = {'aijpc': 'axyp', 'oijlpc': 'oxyp3'}
    if filetype not in area_names:
        raise ValueError('Unsupported filetype: {0!r}'.format(filetype))

    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        raise ValueError(
            'No files matching {0}/*{1}*'.format(filedir, filetype))

    arr_tot = np.zeros((46, 72))
    area_arr = None
    for filename in results:
        # Nothing is written, so open read-only and always release the handle.
        nc_i = ds(filename, 'r')
        try:
            area_arr = nc_i[area_names[filetype]][:]
            # Indexing with None would insert a new axis, so only index the
            # ocean area when a depth level was actually requested.
            if filetype == 'oijlpc' and depth is not None:
                area_arr = area_arr[depth]

            arr = nc_i[var][:]
            if depth is not None:
                arr = arr[depth]
        finally:
            nc_i.close()

        arr_tot = arr_tot + arr

    arr_avg = (arr_tot * unit_conv) / num_files
    if len(arr_avg.shape) == 3:
        # The exception must be instantiated; raising a tuple is not valid.
        raise ValueError(
            "This array is 3D, so the axes you are averaging over are invalid."
        )
    if 'aqua' in filedir:
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)

    avg_arr = np.sum(arr_avg * area_arr, axis=avg_axis) / np.sum(area_arr,
                                                                 axis=avg_axis)
    return avg_arr
Beispiel #5
0
def iceGrowth(filedir, filename1, filename2):
    """Show ``ZSI`` and ``net_rad_planet`` of two files in a 2x2 image grid.

    The left column shows ``ZSI`` (file 1 on top, file 2 below) on a shared
    ``'Blues'`` scale from 0 to the largest absolute value of both arrays.
    The right column shows ``net_rad_planet`` on a shared symmetric
    ``'seismic'`` scale.  The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    filedir : str
        Prefix concatenated directly with each file name (no separator is
        inserted).
    filename1, filename2 : str
        Names of the two files to compare.
    """
    def readFields(path):
        # Read both fields, then close the file; nothing is written to it.
        nc = ds(path, 'r')
        try:
            return nc['ZSI'][:], nc['net_rad_planet'][:]
        finally:
            nc.close()

    def getScale(arr1, arr2, div=True):
        # Common colour limits for two arrays: symmetric about zero when
        # ``div`` is true, otherwise smallest/largest absolute value.
        arr1_max = np.max(np.abs(arr1))
        arr2_max = np.max(np.abs(arr2))
        tot_max = max(arr1_max, arr2_max)
        if div:
            tot_min = tot_max * -1
        else:
            arr1_min = np.min(np.abs(arr1))
            arr2_min = np.min(np.abs(arr2))
            tot_min = min(arr1_min, arr2_min)
        return tot_min, tot_max

    zsi1, net_rad1 = readFields(filedir + filename1)
    zsi2, net_rad2 = readFields(filedir + filename2)

    fig, axes = plt.subplots(2, 2)

    # Left column: ZSI, always plotted from zero upwards.
    _, zsi_max = getScale(zsi1, zsi2)
    ax1 = axes[0, 0]
    ax1.set_title('Ice Thickness Growth [m]')
    im1 = ax1.imshow(zsi1, cmap='Blues', vmin=0, vmax=zsi_max)
    fig.colorbar(im1, ax=ax1)

    ax2 = axes[1, 0]
    im2 = ax2.imshow(zsi2, cmap='Blues', vmin=0, vmax=zsi_max)
    fig.colorbar(im2, ax=ax2)

    # Right column: net radiation on a symmetric diverging scale.
    rad_min, rad_max = getScale(net_rad1, net_rad2)
    ax3 = axes[0, 1]
    ax3.set_title('Net Radiation [Wm$^{-2}$]')
    im3 = ax3.imshow(net_rad1, cmap='seismic', vmin=rad_min, vmax=rad_max)
    fig.colorbar(im3, ax=ax3)

    ax4 = axes[1, 1]
    im4 = ax4.imshow(net_rad2, cmap='seismic', vmin=rad_min, vmax=rad_max)
    fig.colorbar(im4, ax=ax4)

    plt.tight_layout()
    plt.show()
Beispiel #6
0
def openNC(fdir, fname):
    """
    Open ``<input_files root>/<fdir>/<fname>`` in ``'r+'`` mode, print the
    dataset and a confirmation message, and return the open dataset.
    """
    path_parts = ['/home/haynes13/code/python/input_files/', fdir, '/', fname]
    dataset = ds(''.join(path_parts), 'r+', format='NETCDF4')
    for message in (dataset, 'Dataset Loaded Successfully.'):
        print(message)
    return dataset
Beispiel #7
0
def oceanPotTemp(filename):
    """Print the mean of ``pot_temp`` for each slice along its first axis.

    Parameters
    ----------
    filename : str
        Path of the netCDF file providing the ``'pot_temp'`` variable.

    Returns
    -------
    None
        The averages are only printed.
    """
    # Nothing is written, so open read-only and release the handle once the
    # data has been loaded.
    nc = ds(filename, 'r')
    try:
        pot_temp = nc['pot_temp'][:]
    finally:
        nc.close()

    avg_pot_temp = np.array([])
    for o_layer in pot_temp:
        avg_pot_temp = np.append(avg_pot_temp, np.mean(o_layer))
    print('Avg Pot Temp (Celsius)')
    print(avg_pot_temp)
    return
    def readdata(self):
        """Load coordinates, time and the regional data subset onto ``self``.

        The file ``<dataloc><datastr>_0.50deg_reg_1950-1964_v14.0.nc`` is
        opened once and kept open as ``self.full``; ``self.time`` is a
        variable object that belongs to that dataset.

        Sets ``lat``, ``longi``, ``lat_index``, ``longi_index``, ``time``,
        ``full``, ``jdstart`` and ``vari``.  Reads ``dataloc``, ``datastr``,
        ``lat_rng`` and ``longi_rng``.
        """
        from netCDF4 import Dataset as ds
        from backend import varfuncs as vf
        fullpath = self.dataloc + self.datastr + '_0.50deg_reg_1950-1964_v14.0.nc'

        # Open the file a single time instead of once per variable, so no
        # anonymous handles are left open behind.
        self.full = ds(fullpath, 'r')
        variables = self.full.variables

        # add relevant variables to the structure
        self.lat = variables['latitude'][:]
        self.longi = variables['longitude'][:]
        # reduce the data to the geographic region of relevance
        (self.lat_index, lat_reduced) = vf.condi_ind(self.lat, self.lat_rng)
        (self.longi_index, longi_reduced) = vf.condi_ind(self.longi, self.longi_rng)
        self.time = variables['time']

        # convert the time units string with vf.conv2jd
        # (presumably to a Julian day - confirm against backend.varfuncs)
        self.jdstart = vf.conv2jd(self.time.units)

        # reduce the main data matrix to the selected lat/lon indices
        self.vari = variables[self.datastr][:, self.lat_index, self.longi_index]
def avgDataFiles(filedir, var, num_files=10):
    """Sum ``var`` over all ``*aijpc*`` files in ``filedir`` and divide by ``num_files``.

    Parameters
    ----------
    filedir : str
        Directory searched with ``<filedir>/*aijpc*``.  If it contains
        ``'aqua'`` the result is rolled by half of axis 1.
    var : str
        Name of the variable read from every file.
    num_files : int
        Divisor of the sum (the caller-supplied count, not the number of
        files found).

    Returns
    -------
    array
        The averaged data.  The accumulator starts as zeros of shape
        ``(46, 72)``, so that is what is returned when no file matches.
    """
    results = glob('{0}/*aijpc*'.format(filedir))
    arr_tot = np.zeros((46, 72))
    for filename in results:
        # Nothing is written, so open read-only and always release the handle.
        nc_i = ds(filename, 'r')
        try:
            arr = nc_i[var][:]
        finally:
            nc_i.close()
        arr_tot = arr_tot + arr
    arr_avg = arr_tot / num_files
    if 'aqua' in filedir:
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)
    return arr_avg
Beispiel #10
0
def avgDataFiles3D(filedir, var, num_files, filetype, unit_conv, depth):
    """Average ``var`` (optionally one slice of it) over all matching files.

    Parameters
    ----------
    filedir : str
        Directory searched with ``<filedir>/*<filetype>*``.  If it contains
        ``'aqua'`` the result is rolled by half of axis 1.
    var : str
        Name of the variable read from every file.
    num_files : int
        Divisor of the sum (the caller-supplied count, not the number of
        files found).
    filetype : str
        Fragment used in the glob pattern.
    unit_conv : number
        Factor applied to the summed data.
    depth : int or None
        Index applied to the first axis of the data; ``None`` uses the whole
        array.

    Returns
    -------
    array
        The scaled average.  The accumulator starts as zeros of shape
        ``(46, 72)``, so that is what is returned when no file matches.
    """
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    arr_tot = np.zeros((46, 72))
    for filename in results:
        # Nothing is written, so open read-only and always release the handle.
        nc_i = ds(filename, 'r')
        try:
            arr = nc_i[var][:]
        finally:
            nc_i.close()
        if depth is not None:
            arr = arr[depth]
        arr_tot = arr_tot + arr
    arr_avg = (arr_tot * unit_conv) / num_files
    if 'aqua' in filedir:
        arr_avg = np.roll(arr_avg, (arr_avg.shape[1]) // 2, axis=1)
    return arr_avg
Beispiel #11
0
def avgDataFiles(filedir, filetype, var, unit_conv=1, num_files=10):
    """Average ``var`` over all files matching ``<filedir>/*<filetype>*``.

    Parameters
    ----------
    filedir : str
        Directory to search.  If it contains ``'aqua'`` the result is rolled
        by half of axis 2 so the substellar point sits in the middle.
    filetype : str
        Fragment used in the glob pattern.  If it contains the letter
        ``'o'`` only the first five entries along axis 0 are returned.
    var : str
        Name of the variable read from every file.
    unit_conv : number
        Factor applied to the summed data.
    num_files : int
        Divisor of the sum (the caller-supplied count, not the number of
        files found).

    Returns
    -------
    array
        The scaled average.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        # Without data the sum stays a scalar and the array steps below are
        # meaningless, so fail with a clear message instead.
        raise ValueError(
            'No files matching {0}/*{1}*'.format(filedir, filetype))

    arr_tot = 0
    for filename in results:
        # Nothing is written, so open read-only and always release the handle.
        nc_i = ds(filename, 'r')
        try:
            arr = nc_i[var][:]
        finally:
            nc_i.close()
        arr_tot = arr_tot + arr
    arr_avg = (arr_tot * unit_conv) / num_files

    if 'aqua' in filedir:  #if it's aquaplanet simulation you need to roll so that substell point is in middle
        arr_avg = np.roll(arr_avg, (arr_avg.shape[2]) // 2, axis=2)
    if 'o' in filetype:  #if it's ocean file, only take the top 5 levels
        arr_avg = arr_avg[:5, :, :]
    return arr_avg
def getHeightFile(filedir, filetype, num_files=10):
    """Return the mean of variable ``'z'`` for each entry along its first axis.

    ``'z'`` is summed over all files matching ``<filedir>/*<filetype>*`` and
    divided by ``num_files`` (the caller-supplied count, not the number of
    files found).  Every first-axis slice of the average is then flattened
    and reduced to its mean.

    Parameters
    ----------
    filedir : str
        Directory to search.  If it contains ``'aqua'`` the average is
        rolled by half of axis 2 first (this does not change the means).
    filetype : str
        Fragment used in the glob pattern.
    num_files : int
        Divisor of the sum.

    Returns
    -------
    array
        One mean value per first-axis entry.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    results = glob('{0}/*{1}*'.format(filedir, filetype))
    if not results:
        raise ValueError(
            'No files matching {0}/*{1}*'.format(filedir, filetype))

    z_tot = 0
    for filename in results:
        # Nothing is written, so open read-only and always release the handle.
        nc_i = ds(filename, 'r')
        try:
            z_i = nc_i['z'][:]
        finally:
            nc_i.close()
        z_tot = z_tot + z_i
    z_avg = z_tot / num_files

    if 'aqua' in filedir: #if it's aquaplanet simulation you need to roll so that substell point is in middle
        z_avg = np.roll(z_avg, (z_avg.shape[2]) // 2, axis=2)

    z_final = z_avg.reshape((z_avg.shape[0], -1)).mean(axis=1)
    return z_final
def main():
    """Convert the token file INPUT_DATA into the netCDF file OUTPUT_FILE.

    Input format: one token per line with space-separated columns, column 0
    being the word and column 4 the tag; blank lines separate sentences.

    For every token the output stores the word id (``getwordid`` of the
    lower-cased word), the tag id (``tagdict``) and a one-hot
    capitalisation feature: 0 = all caps, 1 = initial capital,
    2 = capital elsewhere, 3 = no capital.  Per sentence it stores a
    zero-padded 10-digit tag and the sentence length.

    Relies on the module-level names ``feat_dim``, ``tagdict``,
    ``INPUT_DATA``, ``OUTPUT_FILE``, ``getwordid``, ``numpy``,
    ``wordcount`` and ``unkwordcount``.
    """
    assert feat_dim == 4

    seqname_maxlength = 10
    output_dim = len(tagdict)

    # Read all sentences; the context manager closes the input file.
    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                if sen:
                    sentences.append(sen)
                sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # A file that does not end with a blank line still holds one more
    # sentence; its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    #define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4",
                                             ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            seqname = "%010d" % sen_index
            for toks in seninfo:
                word = toks[0]
                tag = toks[4]

                wordid = getwordid(word.lower())
                tagid = tagdict[tag]

                # Pick the single active capitalisation feature.
                if word.isupper():
                    feat_index = 0
                elif word[0].isupper():
                    feat_index = 1
                elif any(ch.isupper() for ch in word):
                    feat_index = 2
                else:
                    feat_index = 3

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, :] = numpy.zeros(4)
                ncvar_inputfeats[frame_index, feat_index] = 1
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        # Close (and flush) the output even if a lookup fails midway.
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    print("oov rate: %f" % (float(unkwordcount) / wordcount))
Beispiel #14
0
#!/usr/bin/python
#coding=utf-8

from netCDF4 import Dataset as ds
import numpy as np
import sys

# File to inspect: first command-line argument, "train.nc" by default.
if len(sys.argv) <= 1:
    f1 = "train.nc"
else:
    f1 = sys.argv[1]

# Single-argument print calls produce the same output on Python 2 and 3.
print("checking netCDF file: " + f1)

nc1 = ds(f1, 'r', format='NETCDF4')
try:
    print("\n-------dimension-------\n")
    for dim in nc1.dimensions:
        print(nc1.dimensions[dim])

    print("\n-------variable metainfo-------\n")
    for var in nc1.variables:
        # name, number of dimensions and shape on one line
        print("%s: %s %s" % (var,
                             nc1.variables[var].ndim,
                             nc1.variables[var].shape))

    print("\n-------seq names-------\n")
    print(nc1.variables["seqTags"][:])

    print("\n-------seq lengths-------\n")
    print(nc1.variables["seqLengths"][:])
finally:
    # Release the file handle even if a variable is missing.
    nc1.close()
def main():
    """Convert the tagged corpus below WSJDATA_DIR into the netCDF file OUTPUT_FILE.

    Each sub-directory name is parsed as an integer id.  Ids below
    ``fromdataid`` are skipped, as are ids above ``todataid`` when
    ``todataid`` is positive.  Every non-blank line of every file is one
    sentence of space-separated ``word/tag`` tokens, mapped through
    ``worddict`` and ``tagdict``.

    The output stores one word id and one tag id per token, plus a
    zero-padded 10-digit tag and the length of every sentence.

    Relies on the module-level names ``os``, ``ds``, ``WSJDATA_DIR``,
    ``OUTPUT_FILE``, ``fromdataid``, ``todataid``, ``worddict`` and
    ``tagdict``.
    """
    filename_maxlength = 10
    input_dim = 1
    output_dim = len(tagdict)

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if dataid > todataid and todataid > 0:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for f in os.listdir(dirpath):
            # The context manager closes the file even if a lookup fails.
            with open(dirpath + "/" + f, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if line == "":
                        continue
                    toks = line.split(" ")
                    seninfo = []
                    for tok in toks:
                        ts = tok.split("/")
                        seninfo.append((worddict[ts[0]], tagdict[ts[1]]))
                    sentences.append(seninfo)
                    step_num += len(toks)
    seq_num = len(sentences)

    #define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("numSeqs", seq_num)
        nc.createDimension("numTimesteps", step_num)
        nc.createDimension("inputPattSize", input_dim)
        nc.createDimension("numLabels", output_dim)
        nc.createDimension("maxSeqTagLength", filename_maxlength)

        ncvar_filenames = nc.createVariable("seqTags", "c",
                                            ("numSeqs", "maxSeqTagLength"))
        ncvar_samplenums = nc.createVariable("seqLengths", "i4", ("numSeqs",))
        ncvar_inputs = nc.createVariable("inputs", "i4", ("numTimesteps",))
        ncvar_outputs = nc.createVariable("targetClasses", "i4",
                                          ("numTimesteps",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            for wordid, tagid in senwords:
                ncvar_inputs[frame_index] = wordid
                ncvar_outputs[frame_index] = tagid
                frame_index += 1

            ncvar_filenames[sen_index, 0:filename_maxlength] = "%010d" % sen_index
            ncvar_samplenums[sen_index] = len(senwords)
    finally:
        # Close (and flush) the output even if writing fails midway.
        nc.close()
def main():
    """Write the ``word/tag`` corpus found under WSJDATA_DIR to OUTPUT_FILE.

    Sub-directories are selected by their integer name (at least
    ``fromdataid``, and at most ``todataid`` when that bound is positive).
    Each non-blank line is a sentence; each token is split on ``/`` and
    mapped through ``worddict`` / ``tagdict``.  The netCDF output holds the
    per-token word and tag ids and, per sentence, a 10-digit zero-padded
    tag and the token count.
    """
    filename_maxlength = 10
    input_dim = 1
    output_dim = len(tagdict)

    # ---- pass 1: collect (wordid, tagid) pairs sentence by sentence ----
    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        below_range = dataid < fromdataid
        if below_range:
            continue
        above_range = dataid > todataid and todataid > 0
        if above_range:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for entry in os.listdir(dirpath):
            handle = open(dirpath + "/" + entry, "r")
            for raw_line in handle:
                text = raw_line.strip()
                if text != "":
                    tokens = text.split(" ")
                    pairs = []
                    for token in tokens:
                        parts = token.split("/")
                        wordid = worddict[parts[0]]
                        tagid = tagdict[parts[1]]
                        pairs.append((wordid, tagid))
                    sentences.append(pairs)
                    step_num += len(tokens)
            handle.close()
    seq_num = len(sentences)

    # ---- pass 2: define the netCDF layout and write everything ----
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    dimension_sizes = [("numSeqs", seq_num),
                       ("numTimesteps", step_num),
                       ("inputPattSize", input_dim),
                       ("numLabels", output_dim),
                       ("maxSeqTagLength", filename_maxlength)]
    for dim_name, dim_size in dimension_sizes:
        nc.createDimension(dim_name, dim_size)

    tags_var = nc.createVariable("seqTags", "c",
                                 ("numSeqs", "maxSeqTagLength"))
    lengths_var = nc.createVariable("seqLengths", "i4", "numSeqs")
    inputs_var = nc.createVariable("inputs", "i4", "numTimesteps")
    targets_var = nc.createVariable("targetClasses", "i4", "numTimesteps")

    frame_index = 0
    for sen_index, pairs in enumerate(sentences):
        seqname = "%010d" % sen_index
        for wordid, tagid in pairs:
            inputs_var[frame_index] = wordid
            targets_var[frame_index] = tagid
            frame_index += 1

        tags_var[sen_index, 0:filename_maxlength] = seqname
        lengths_var[sen_index] = len(pairs)
    nc.close()
Beispiel #17
0
def main():
    """Convert the token file INPUT_DATA into the netCDF file OUTPUT_FILE.

    Input format: one token per line with space-separated columns, column 0
    being the word and column 4 the tag; blank lines separate sentences.

    For every token the output stores the word id (``getwordid`` of the
    lower-cased word), the tag id (``tagdict``) and five 0/1 features:
    equals its lower-cased form, equals its upper-cased form, first
    character is an upper-case letter, contains ``#``, contains ``-``.
    Per sentence it stores a zero-padded 10-digit tag and the length.

    Relies on the module-level names ``feat_dim``, ``tagdict``,
    ``INPUT_DATA``, ``OUTPUT_FILE``, ``getwordid``, ``wordcount`` and
    ``unkwordcount``.
    """
    assert feat_dim == 5

    seqname_maxlength = 10
    output_dim = len(tagdict)

    # Read all sentences; the context manager closes the input file.
    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                if sen:
                    sentences.append(sen)
                sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # A file that does not end with a blank line still holds one more
    # sentence; its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    #define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4",
                                             ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            seqname = "%010d" % sen_index
            for toks in seninfo:
                word = toks[0]
                tag = toks[4]

                wordl = word.lower()
                wordid = getwordid(wordl)
                tagid = tagdict[tag]

                w1 = word[0]
                features = (
                    int(word == wordl),                      # all lower case
                    int(word == word.upper()),               # all upper case
                    int(w1 == w1.upper() and w1.isalpha()),  # initial capital
                    int("#" in word),                        # contains '#'
                    int("-" in word),                        # contains '-'
                )

                ncvar_inputwords[frame_index] = wordid
                for feat_index, value in enumerate(features):
                    ncvar_inputfeats[frame_index, feat_index] = value
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        # Close (and flush) the output even if a lookup fails midway.
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    print("oov rate: %f" % (float(unkwordcount) / wordcount))
def main():
    """Write two sequences per sentence of DATA_FILE to ``output_file``.

    Every non-blank line is one sentence of space-separated words and
    produces two sequences of equal length:

    1. the original words, each labelled 1;
    2. a copy where each word is, with probability ``replacerate``,
       replaced by a random id in ``[0, maxwordid]`` and labelled 0,
       otherwise kept and labelled 1.

    The single input feature is always written as 0.
    NOTE(review): both sequences of a pair receive the same sequence tag
    (the tag is computed once per input line) - confirm this is intended.
    """
    #get basic information
    seqname_maxlength = 10
    input_dim = len(wordid_dict)  # 1 additional dim is for startflag
    feat_dim = 1
    output_dim = 2  #additional tag to represent the end tag
    starttagid = len(wordid_dict)  #the start tag id

    # First pass: size the dimensions (each sentence is written twice).
    seq_num = 0
    step_num = 0
    counter_in = open(DATA_FILE, "r")
    for raw in counter_in:
        stripped = raw.strip()
        if stripped != "":
            step_num += len(stripped.split(" ")) * 2
            seq_num += 2
    counter_in.close()

    #define netcdf file
    nc = ds(output_file, "w", format="NETCDF4")
    for dim_name, dim_size in [("seq_num", seq_num),
                               ("step_num", step_num),
                               ("feat_dim", feat_dim),
                               ("output_dim", output_dim),
                               ("seqname_maxlength", seqname_maxlength)]:
        nc.createDimension(dim_name, dim_size)

    tags_var = nc.createVariable("seqTags", "c",
                                 ("seq_num", "seqname_maxlength"))
    lengths_var = nc.createVariable("seqLengths", "i4", "seq_num")
    feats_var = nc.createVariable("inputFeats", "f4",
                                  ("step_num", "feat_dim"))
    words_var = nc.createVariable("inputWords", "i4", "step_num")
    labels_var = nc.createVariable("outputLabels", "i4", "step_num")

    frame_index = 0
    sen_index = 0

    # Second pass: write the data.
    fi = open(DATA_FILE, "r")
    for raw in fi:
        stripped = raw.strip()
        if stripped == "":
            continue
        senwords = stripped.split(" ")
        seqname = "%010d" % sen_index
        sample_num = len(senwords)

        #all correct sentence
        for original in senwords:
            words_var[frame_index] = getwordid(original)
            feats_var[frame_index, 0] = 0
            labels_var[frame_index] = 1
            frame_index += 1
        tags_var[sen_index, 0:seqname_maxlength] = seqname
        lengths_var[sen_index] = sample_num
        sen_index += 1

        #with replaced error sentence
        for original in senwords:
            if random.random() < replacerate:
                # replace with a random word, tag is 0
                wordid = random.randint(0, maxwordid)
                tagid = 0
            else:
                # original word, tag is 1
                wordid = getwordid(original)
                tagid = 1
            words_var[frame_index] = wordid
            feats_var[frame_index, 0] = 0
            labels_var[frame_index] = tagid
            frame_index += 1
        tags_var[sen_index, 0:seqname_maxlength] = seqname
        lengths_var[sen_index] = sample_num
        sen_index += 1

    nc.close()
    fi.close()

    print("worddict size: %d" % len(wordid_dict))
    print("total_wordnum: %d" % total_wordnum)
    print("unk_wordnum: %d" % unk_wordnum)
    print("oov rate: %f" % (float(unk_wordnum) / total_wordnum))
Beispiel #19
0
    for row in tsvreader:
        times.append(convert_time(row[0]))
        temps.append(convert_temp(row[1]))

# Express every timestamp as seconds elapsed since the first sample.
base_time = times[0]
time_values = [(stamp - base_time).total_seconds() for stamp in times]

time_units = "".join(
    ["seconds since ", base_time.strftime('%Y-%m-%d %H:%M:%S')])

dataset = ds(outfile, "w", format='NETCDF4_CLASSIC')

# Unlimited time axis shared by both variables.
time_dim = dataset.createDimension("time", None)

# Time coordinate: data first, then its describing attributes.
time_var = dataset.createVariable("time", np.float64, ("time", ))
time_var[:] = time_values
for attr_name, attr_value in (("units", time_units),
                              ("standard_name", "time"),
                              ("calendar", "standard")):
    setattr(time_var, attr_name, attr_value)

# Temperature series along the time axis.
temp = dataset.createVariable("temp", np.float32, ("time", ))
temp[:] = temps

for attr_name, attr_value in (("var_id", "temp"),
                              ("long_name", "Temperature of sensor (K)"),
                              ("units", "K")):
    setattr(temp, attr_name, attr_value)
Beispiel #20
0
def main():
    """Convert the tagged corpus below WSJDATA_DIR into the netCDF file OUTPUT_FILE.

    Each sub-directory name is parsed as an integer id.  Ids below
    ``fromdataid`` are skipped, as are ids above ``todataid`` when
    ``todataid`` is positive.  Every non-blank line is one sentence of
    space-separated ``word/tag`` tokens.

    Per token the output stores the word id (``getwordid`` of the
    lower-cased word), the tag id and a feature row of ``feat_dim``
    values: position 0 is 1 when the word contains capital letters, and
    position ``1 + suffix2id`` is 1 for the id of the word's last two
    lower-cased characters (``"none"`` for words shorter than two).

    Relies on the module-level names ``os``, ``numpy``, ``ds``,
    ``feat_dim``, ``tagdict``, ``suffix2dict``, ``getwordid``,
    ``WSJDATA_DIR``, ``OUTPUT_FILE``, ``fromdataid``, ``todataid``, ``wn``
    and ``unk_wn``.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)
    suffix2_dim = len(suffix2dict)

    assert feat_dim == suffix2_dim + 1

    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if dataid > todataid and todataid > 0:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for f in os.listdir(dirpath):
            # The context manager closes the file even if a lookup fails.
            with open(dirpath + "/" + f, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if line == "":
                        continue
                    toks = line.split(" ")
                    seninfo = []
                    for tok in toks:
                        ts = tok.split("/")
                        word = ts[0]
                        wordl = word.lower()
                        # 1 when the word contains capital letters
                        lower_flag = 0 if word == wordl else 1
                        suffix2 = wordl[-2:] if len(wordl) >= 2 else "none"
                        suffix2id = suffix2dict[suffix2]
                        wordid = getwordid(wordl)
                        tagid = tagdict[ts[1]]
                        seninfo.append((wordid, tagid, lower_flag, suffix2id))
                    sentences.append(seninfo)
                    step_num += len(toks)
    seq_num = len(sentences)

    #define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4",
                                             ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num",))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            for wordid, tagid, lower_flag, suffix2id in senwords:
                # Build the feature row in memory and write it once.
                feats = numpy.zeros(feat_dim)
                feats[0] = lower_flag
                feats[1 + suffix2id] = 1

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, :] = feats
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        # Close (and flush) the output even if writing fails midway.
        nc.close()

    print("word num: %d" % wn)
    print("unk word num: %d" % unk_wn)
    print("oov rate: %f" % (float(unk_wn) / wn))
def main():
    """Convert the token file INPUT_DATA into the netCDF file OUTPUT_FILE.

    Input format: one token per line with space-separated columns, column 0
    being the word and column 2 the tag; blank lines separate sentences.
    The tag ``I-LST`` is mapped to ``O`` before the ``tagdict`` lookup.

    For every token the output stores ``getwordid(word)``, the tag id and
    ``getlowerflag(word)`` in feature column 0.  Per sentence it stores a
    zero-padded 10-digit tag and the sentence length.

    Relies on the module-level names ``feat_dim``, ``tagdict``,
    ``INPUT_DATA``, ``OUTPUT_FILE``, ``getwordid``, ``getlowerflag``,
    ``wordcount`` and ``unkwordcount``.
    """
    seqname_maxlength = 10
    output_dim = len(tagdict)

    # Read all sentences; the context manager closes the input file.
    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                if sen:
                    sentences.append(sen)
                sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # A file that does not end with a blank line still holds one more
    # sentence; its tokens are already counted in step_num.
    if sen:
        sentences.append(sen)
    seq_num = len(sentences)

    #define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4",
                                             ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num",))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            seqname = "%010d" % sen_index
            for toks in seninfo:
                word = toks[0]
                tag = toks[2]

                wordid = getwordid(word)
                if tag == "I-LST":
                    tag = "O"
                tagid = tagdict[tag]
                lower_flag = getlowerflag(word)

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = lower_flag
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        # Close (and flush) the output even if a lookup fails midway.
        nc.close()

    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    print("oov rate: %f" % (float(unkwordcount) / wordcount))
Beispiel #22
0
import xray
from monthdelta import monthdelta
from datetime import datetime
from netCDF4 import num2date, date2num
dates = []


############(A) Open original TNO emissions inventory################
#tno_2011 = 'path to the TNO_MACCII'  #input file
tno_2011 = '/mnt/raid/wrf-chem/emis/WRF_EMIS_UoM_UEA/preprocessor_script/TNO_MACC/TNO_MACC_III_emissions_2011.nc'
tno = nc.Dataset(tno_2011,'r')

#######################(B)Create new netcdf file#################################
#1) Create an empty NetCDF file
#dataset = ds('Path_directory_to_place_the_file','w', format='NETCDF4_CLASSIC') #output file
# The third positional argument of Dataset is `clobber`, not a second mode;
# the stray 'r' that used to sit there is dropped (clobber defaults to True,
# which is what a non-empty string evaluated to).
dataset = ds('/mnt/raid/wrf-chem/emis/WRF_EMIS_UoM_UEA/preprocessor_script/OUT/CO_tno_out.nc','w', format='NETCDF4_CLASSIC') #output file

#2)Create the dimensions

dataset.createDimension('lat', 672) #number of latitudes
dataset.createDimension('lon', 720) #number of longitudes
dataset.createDimension('time', None) #unlimited

#3)Create variables
######1D variables first######
# The names lat/lon/time refer to the coordinate variables from here on.

lat = dataset.createVariable('lat',np.float32, ('lat',), fill_value=False)
lon = dataset.createVariable('lon',np.float32, ('lon',), fill_value=False)
time = dataset.createVariable('time',np.float32, ('time',), fill_value=False)

######3D variables (SNAP sectors) ########
Beispiel #23
0
def create_new_netcdf_file(file):
    """Create a NETCDF4_CLASSIC emissions file and return the open dataset.

    The file gets the dimensions ``lat`` (672), ``lon`` (720) and an
    unlimited ``time``, one float32 coordinate variable per dimension, and
    one ``(time, lat, lon)`` variable per emission sector.  Every variable
    is created with ``fill_value=False`` and receives its ``units`` and
    ``long_name`` attributes (``time`` also gets ``calendar``).

    Parameters
    ----------
    file : str
        Path of the file to create.

    Returns
    -------
    The open, writable dataset; the caller is responsible for closing it.
    """
    # (name, ordered attribute pairs) for the 1D coordinate variables
    coordinates = (
        ('lat', (('units', 'degrees_north'), ('long_name', 'latitude'))),
        ('lon', (('units', 'degrees_east'), ('long_name', 'longitude'))),
        ('time', (('units', 'days since 1900-01-01 00:00'),
                  ('calendar', 'gregorian'),
                  ('long_name', 'Time'))),
    )
    # (name, dtype, long_name) for the 3D SNAP sector variables.
    # NOTE(review): 'pow' is float64 while every other sector is float32 -
    # kept as found, confirm whether that is intended.
    sectors = (
        ('pow', np.float64, 'Power generation'),
        ('res', np.float32, 'Residential, comercial and other combustion'),
        ('inc', np.float32, 'Industrial combustion'),
        ('pei', np.float32, 'Processed emission industrial'),
        ('exf', np.float32, 'Extraction and distribution of fossil fuels'),
        ('sol', np.float32, 'Solvent use'),
        ('tra1', np.float32, 'Road transport, gasoline'),
        ('tra2', np.float32, 'Road transport, diesel'),
        ('tra3', np.float32, 'Road trasnport, LPG'),
        ('tra4', np.float32, 'Road trasnport, non-exhaust, volatilisation'),
        ('tra5', np.float32, 'Road transport, non-exhaust, wear'),
        ('nrt', np.float32, 'Non-road transport'),
        ('was', np.float32, 'Waste tratment and disposal'),
        ('agr', np.float32, 'Agriculture'),
    )

    # The third positional argument of Dataset is `clobber`, not a second
    # mode; the stray 'r' that used to sit there is dropped (clobber
    # defaults to True, which is what a non-empty string evaluated to).
    dataset = ds(file, 'w', format='NETCDF4_CLASSIC')
    try:
        #2)Create the dimensions
        dataset.createDimension('lat', 672)  #number of latitudes
        dataset.createDimension('lon', 720)  #number of longitudes
        dataset.createDimension('time', None)  #unlimited

        #3)Create the 1D variables and 4) add their attributes
        for name, attributes in coordinates:
            variable = dataset.createVariable(name, np.float32, (name,),
                                              fill_value=False)
            for attr_name, attr_value in attributes:
                setattr(variable, attr_name, attr_value)

        # 3D variables (SNAP sectors), all sharing the same units
        for name, dtype, long_name in sectors:
            variable = dataset.createVariable(name, dtype,
                                              ('time', 'lat', 'lon'),
                                              fill_value=False)
            variable.units = 'Kg yr-1'
            variable.long_name = long_name
    except Exception:
        # Do not leave a half-defined file open if the setup fails.
        dataset.close()
        raise

    return dataset
Beispiel #24
0
def append_lat_lon_data(lat_tno, long_tno, file):
    """Overwrite the 'lat' and 'lon' entries of ``file`` in place.

    ``file`` is indexed by name (``file['lat']`` / ``file['lon']``) and each
    entry is filled through a full-slice assignment, so the existing objects
    are updated rather than replaced.

    Args:
        lat_tno: values written to ``file['lat'][:]``.
        long_tno: values written to ``file['lon'][:]``.
        file: mapping-like container holding the two sliceable targets.
    """
    # Latitude is written first, then longitude.
    for key, coords in (('lat', lat_tno), ('lon', long_tno)):
        file[key][:] = coords


# Write the shared TNO latitude/longitude axes into every output dataset.
for _tno_output in (ds_bc1, ds_ec25, ds_ec10, ds_oc25, ds_oc10, ds_pm25,
                    ds_pm10):
    append_lat_lon_data(latitude_tno, longitude_tno, _tno_output)
del _tno_output  # do not leave the loop variable behind at module level

#2) Read PM2.5 and PM10 plus their index arrays from the TNO inventory
tno_net = ds(tno_2011)
emis_cat_index = tno_net.variables['emission_category_index'][:]
lat_index = tno_net.variables['latitude_index'][:]
lon_index = tno_net.variables['longitude_index'][:]
pm25_data = tno_net.variables['pm2_5'][:]
pm10_data = tno_net.variables['pm10'][:]
# Country lookup information from the same file
country_index = tno_net.variables['country_index'][:]
country_id = tno_net.variables['country_id'][:]

#3) Loop through every emission category (emiss_cat) and pick the emission
#   values by sector, latitude and longitude.
#   NOTE: the list of sectors must be in the same order as the sectors in the
#   source file.

# i) Zero-filled 4D accumulator of shape (13, 12, 672, 720).
#    NOTE(review): presumably (sector, month, lat, lon) -- confirm.
pm25_arrays = np.zeros((13, 12, 672, 720))
def main():
    """Convert DATA_FILE into a NetCDF4 file of randomly corrupted sentences.

    Every non-empty line of ``DATA_FILE`` is one sequence of space-separated
    words.  Each word is, with probability ``replacerate``, replaced by a
    random id drawn from ``[0, maxwordid]`` and labelled 0; otherwise it keeps
    the id returned by ``getwordid`` and is labelled 1.  The sequences are
    written to ``output_file`` and summary counters are printed at the end.

    Uses the module-level names ``DATA_FILE``, ``output_file``,
    ``wordid_dict``, ``replacerate``, ``maxwordid``, ``getwordid``,
    ``total_wordnum``, ``unk_wordnum``, ``random`` and ``ds``.

    Raises:
        ZeroDivisionError: when ``total_wordnum`` is 0 at the final report.
    """
    seqname_maxlength = 10  # width of the "%010d" sequence tag
    feat_dim = 1
    output_dim = 2  # two labels are written below: 0 (replaced), 1 (original)

    # Pass 1: count sequences and words so the dimensions can be sized.
    seq_num = 0
    step_num = 0
    with open(DATA_FILE, "r") as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                continue
            step_num += len(line.split(" "))
            seq_num += 1

    nc = ds(output_file, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num"))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num"))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num"))

        # Pass 2: write one frame per word and one entry per sequence.
        frame_index = 0
        sen_index = 0
        with open(DATA_FILE, "r") as fi:
            for line in fi:
                line = line.strip()
                if line == "":
                    continue
                senwords = line.split(" ")
                sample_num = len(senwords)

                for word in senwords:
                    # random.random() is drawn for every word, randint only
                    # for the replaced ones.
                    if random.random() < replacerate:
                        wordid = random.randint(0, maxwordid)
                        tagid = 0
                    else:
                        wordid = getwordid(word)
                        tagid = 1

                    ncvar_inputwords[frame_index] = wordid
                    ncvar_inputfeats[frame_index, 0] = 0
                    ncvar_outputlabels[frame_index] = tagid
                    frame_index += 1

                ncvar_seqnames[sen_index, 0:seqname_maxlength] = (
                    "%010d" % sen_index)
                ncvar_seqlengths[sen_index] = sample_num
                sen_index += 1
    finally:
        # Close the dataset even if a write fails part-way through.
        nc.close()

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("worddict size: %d" % len(wordid_dict))
    print("total_wordnum: %d" % total_wordnum)
    print("unk_wordnum: %d" % unk_wordnum)
    print("oov rate: %f" % (float(unk_wordnum) / total_wordnum))
Beispiel #26
0
import numpy as np
import xray
from monthdelta import monthdelta
from datetime import datetime
from netCDF4 import num2date, date2num

dates = []

############(A) Open original TNO emissions inventory################
tno_2011 = 'path to the TNO_MACCII'  # input file (placeholder path)
# NOTE(review): this line uses `nc.Dataset` while the output file below is
# opened through `ds`; confirm both names are imported.
tno = nc.Dataset(tno_2011, 'r')

#######################(B) Create new netcdf file#################################
#1) Create an empty NetCDF file.
#   The call used to pass an extra positional 'r' after the mode; that slot is
#   Dataset's `clobber` flag, where any non-empty string is simply truthy
#   (the default), so it is dropped here.
dataset = ds('Path_directory_to_place_the_file',
             'w',
             format='NETCDF4_CLASSIC')  # output file

#2) Create the dimensions.  The returned handles are not kept because the
#   names lat/lon/time are bound to the coordinate variables just below.
dataset.createDimension('lat', 672)  # number of latitudes
dataset.createDimension('lon', 720)  # number of longitudes
dataset.createDimension('time', None)  # None -> unlimited

#3) Create variables
######1D variables first######
lat = dataset.createVariable('lat', np.float32, ('lat',), fill_value=False)
lon = dataset.createVariable('lon', np.float32, ('lon',), fill_value=False)
time = dataset.createVariable('time', np.float32, ('time',), fill_value=False)
def main():
    """Convert the "word/tag" files under WSJDATA_DIR into one NetCDF4 file.

    Sub-directories of ``WSJDATA_DIR`` are named by an integer id; only ids in
    ``[fromdataid, todataid]`` are read (no upper bound when ``todataid`` is
    not positive).  Each non-empty line of each file is one sentence of
    space-separated ``word/tag`` tokens.  For every token the lower-cased
    word id, the tag id and three casing flags (all-lower, all-upper,
    first-letter-upper) are written to ``OUTPUT_FILE``.

    Uses the module-level names ``WSJDATA_DIR``, ``OUTPUT_FILE``,
    ``fromdataid``, ``todataid``, ``tagdict``, ``feat_dim``, ``getwordid``,
    ``wn``, ``unk_wn``, ``os`` and ``ds``.  ``feat_dim`` must be at least 3
    because three feature columns are written.

    Raises:
        IndexError: for a token without "/" or with an empty word part.
        KeyError: for a tag missing from ``tagdict``.
        ZeroDivisionError: when ``wn`` is 0 at the final report.
    """
    seqname_maxlength = 10  # width of the "%010d" sequence tag
    output_dim = len(tagdict)

    # Pass 1: load every sentence as a list of (word, tag) pairs.
    sentences = []
    step_num = 0
    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        if dataid > todataid and todataid > 0:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for f in os.listdir(dirpath):
            # `with` closes each input file even when parsing a line raises.
            with open(dirpath + "/" + f, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if line == "":
                        continue
                    toks = line.split(" ")
                    seninfo = []
                    for tok in toks:
                        # NOTE(review): for a token with more than one "/"
                        # the tag is the text between the first two slashes;
                        # confirm the data never contains such tokens.
                        ts = tok.split("/")
                        seninfo.append((ts[0], ts[1]))
                    sentences.append(seninfo)
                    step_num += len(toks)
    seq_num = len(sentences)

    # Pass 2: write the NetCDF file.
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num"))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num"))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num"))

        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            for word, tag in senwords:
                wordl = word.lower()
                wordid = getwordid(wordl)
                tagid = tagdict[tag]

                # Casing features, stored as 0/1.
                allislower = int(word == wordl)
                allisupper = int(word == word.upper())
                w1 = word[0]
                firstisupper = int(w1 == w1.upper() and w1.isalpha())

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = allislower
                ncvar_inputfeats[frame_index, 1] = allisupper
                ncvar_inputfeats[frame_index, 2] = firstisupper
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = (
                "%010d" % sen_index)
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        # Close the dataset even if a write fails part-way through.
        nc.close()

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("word num: %d" % wn)
    print("unk word num: %d" % unk_wn)
    print("oov rate: %f" % (float(unk_wn) / wn))
def main():
    """Convert the blank-line separated token file INPUT_DATA to NetCDF4.

    Each non-empty line of ``INPUT_DATA`` is one token made of space-separated
    fields; field 0 is the word and field 2 is the tag (field 1 is not used).
    A blank line ends the current sentence.  For every token the word id, a
    lower-case flag and the tag id are written to ``OUTPUT_FILE``; the tag
    "I-LST" is stored as "O".

    Uses the module-level names ``INPUT_DATA``, ``OUTPUT_FILE``, ``tagdict``,
    ``feat_dim``, ``getwordid``, ``getlowerflag``, ``wordcount``,
    ``unkwordcount`` and ``ds``.

    Raises:
        IndexError: for a token line with fewer than three fields.
        KeyError: for a tag missing from ``tagdict``.
        ZeroDivisionError: when ``wordcount`` is 0 at the final report.
    """
    seqname_maxlength = 10  # width of the "%010d" sequence tag
    output_dim = len(tagdict)

    # Pass 1: group token lines into sentences.
    sentences = []
    step_num = 0
    sen = []
    with open(INPUT_DATA) as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                if sen:
                    sentences.append(sen)
                sen = []
                continue
            sen.append(line.split(" "))
            step_num += 1
    # A file that does not end with a blank line still has a pending
    # sentence; its tokens are already counted in step_num, so keep it.
    if sen:
        sentences.append(sen)

    seq_num = len(sentences)

    # Pass 2: write the NetCDF file.
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num"))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num"))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num"))

        frame_index = 0
        for sen_index, seninfo in enumerate(sentences):
            for fields in seninfo:
                word = fields[0]
                tag = fields[2]

                wordid = getwordid(word)
                if tag == "I-LST":
                    tag = "O"
                tagid = tagdict[tag]
                lower_flag = getlowerflag(word)

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = lower_flag
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = (
                "%010d" % sen_index)
            ncvar_seqlengths[sen_index] = len(seninfo)
    finally:
        # Close the dataset even if a write fails part-way through.
        nc.close()

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("wordcount: %d" % wordcount)
    print("unkwordcount: %d" % unkwordcount)
    print("oov rate: %f" % (float(unkwordcount) / wordcount))
Beispiel #29
0
def create_ship_emissions(chem):
    """Create 'netcdf4/tno_ship_<chem>.nc' holding an all-zero 'ships' field.

    The file is written as NETCDF4_CLASSIC with a 672 x 720 lat/lon grid, an
    unlimited time dimension and one float32 variable ``ships`` of shape
    (time, lat, lon) filled with twelve time steps of zeros.  The dataset is
    closed before returning.

    Args:
        chem: species name inserted into the output file name.

    Returns:
        None.
    """
    n_lat = 672  # number of latitudes
    n_lon = 720  # number of longitudes
    n_months = 12

    #1) Create an empty NetCDF file.
    #   The call used to pass an extra positional 'r' after the mode; that
    #   slot is Dataset's `clobber` flag, where any non-empty string is simply
    #   truthy (the default), so it is dropped here.
    dataset = ds('netcdf4/tno_ship_' + chem + '.nc',
                 'w',
                 format='NETCDF4_CLASSIC')  # output file
    try:
        #2) Create the dimensions
        dataset.createDimension('lat', n_lat)
        dataset.createDimension('lon', n_lon)
        dataset.createDimension('time', None)  # None -> unlimited

        #3) Create variables: 1D coordinates first, then the 3D field
        lat_var = dataset.createVariable('lat', np.float32, ('lat',),
                                         fill_value=False)
        lon_var = dataset.createVariable('lon', np.float32, ('lon',),
                                         fill_value=False)
        time_var = dataset.createVariable('time', np.float32, ('time',),
                                          fill_value=False)
        ships = dataset.createVariable('ships', np.float32,
                                       ('time', 'lat', 'lon'),
                                       fill_value=False)

        #4) Add units / long_name attributes
        lat_var.units = 'degrees_north'
        lat_var.long_name = 'latitude'
        lon_var.units = 'degrees_east'
        lon_var.long_name = 'longitude'
        time_var.units = 'days since 1900-01-01 00:00'
        time_var.calendar = 'gregorian'
        time_var.long_name = 'Time'

        ships.units = 'Tg?'
        ships.long_name = 'International shipping'

        # Coordinate values.  These reproduce the former hard-coded tables:
        # latitude runs from 30.03125 to 71.96875 in steps of 0.0625 and
        # longitude from -29.9375 to 59.9375 in steps of 0.125.  Every value
        # is a multiple of 1/32, so the arithmetic below is exact in binary
        # floating point.
        lat_values = 30.03125 + 0.0625 * np.arange(n_lat)
        lon_values = -29.9375 + 0.125 * np.arange(n_lon)

        # One write per variable (the field used to be written, read back and
        # written again).
        ships[:] = np.zeros(shape=(n_months, n_lat, n_lon))
        lat_var[:] = lat_values
        lon_var[:] = lon_values

        # First day of each of the twelve months starting January 2000.
        # NOTE(review): `dates` is computed but never written to the 'time'
        # variable in this function -- confirm whether that step is missing.
        dates = [add_months(datetime(2000, 1, 1), n) for n in range(n_months)]
    finally:
        # Nothing is returned, so this function is the only place the file
        # can be closed.
        dataset.close()
def dealonefraction(fractionid):
    """Write one fraction of DATA_FILE to NetCDF4 as clean + corrupted pairs.

    Every non-empty line inside the fraction's line window is one sentence of
    space-separated words.  Two sequences are written per sentence: first the
    unchanged sentence with every label set to 1, then a copy in which each
    word is replaced, with probability ``replacerate``, by a random id from
    ``[0, maxwordid]`` (label 0).  Both sequences share the same sequence tag.

    Because two sequences are written per sentence, the ``seq_num`` and
    ``step_num`` dimensions are sized at twice the counted sentences/words.

    Uses the module-level names ``DATA_FILE``, ``output_file``,
    ``prep_fraction_size``, ``replacerate``, ``maxwordid``, ``getwordid``,
    ``random`` and ``ds``.

    Args:
        fractionid: index of the fraction; lines
            ``[fractionid * prep_fraction_size,
            (fractionid + 1) * prep_fraction_size)`` are processed and the
            output goes to ``output_file + "." + str(fractionid)``.  A
            negative value processes the whole file into ``output_file``.

    Returns:
        None.  Nothing is written when the window has no non-empty line.
    """
    seqname_maxlength = 10  # width of the "%010d" sequence tag
    feat_dim = 1
    output_dim = 2  # two labels are written below: 0 (replaced), 1 (original)

    if fractionid >= 0:
        start_senid = fractionid * prep_fraction_size
        end_senid = (fractionid + 1) * prep_fraction_size
    else:
        start_senid = -1
        end_senid = -1

    def fraction_sentences():
        """Yield the word list of each non-empty line inside the window."""
        with open(DATA_FILE, "r") as fi:
            for ln, line in enumerate(fi):
                if start_senid >= 0 and ln < start_senid:
                    continue
                if end_senid >= 0 and ln >= end_senid:
                    break
                line = line.strip()
                if line == "":
                    continue
                yield line.split(" ")

    # Pass 1: count sentences and words so the dimensions can be sized.
    seq_num = 0
    step_num = 0
    for senwords in fraction_sentences():
        step_num += len(senwords)
        seq_num += 1
    if seq_num == 0:
        print("fraction null: %d" % fractionid)
        return

    if fractionid >= 0:
        poutput_file = output_file + "." + str(fractionid)
    else:
        poutput_file = output_file

    nc = ds(poutput_file, "w", format="NETCDF4")
    try:
        # Two sequences (clean + corrupted) are written per sentence.
        nc.createDimension("seq_num", 2 * seq_num)
        nc.createDimension("step_num", 2 * step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num"))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num"))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num"))

        # Pass 2: write the frames and sequence entries.
        frame_index = 0
        sen_index = 0
        for senwords in fraction_sentences():
            if start_senid >= 0:
                seqname = "%010d" % (sen_index + start_senid)
            else:
                seqname = "%010d" % sen_index
            sample_num = len(senwords)

            # Unchanged sentence: every word keeps its id, label 1.
            for word in senwords:
                ncvar_inputwords[frame_index] = getwordid(word)
                ncvar_inputfeats[frame_index, 0] = 0
                ncvar_outputlabels[frame_index] = 1
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = sample_num
            sen_index += 1

            # Corrupted copy: random replacement gets label 0.
            for word in senwords:
                if random.random() < replacerate:
                    wordid = random.randint(0, maxwordid)
                    tagid = 0
                else:
                    wordid = getwordid(word)
                    tagid = 1
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = 0
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1
            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = sample_num
            sen_index += 1
    finally:
        # Close the dataset even if a write fails part-way through.
        nc.close()
def dealonefraction(fractionid, WORD_DICT, prep_fraction_size, DATA_FILE,
                    output_file, replacerate):
    """Write one fraction of DATA_FILE to NetCDF4 with randomly replaced words.

    Every non-empty line inside the fraction's line window is one sentence of
    space-separated words.  Each word is, with probability ``replacerate``,
    replaced by a random id from ``[0, len(word dict) - 1]`` and labelled 0;
    otherwise it keeps the id returned by ``getwordid`` and is labelled 1.

    Uses the module-level names ``load_lowerworddict``, ``getwordid``,
    ``random`` and ``ds``.

    Args:
        fractionid: index of the fraction; a negative value processes the
            whole file and writes to ``output_file`` itself, otherwise the
            output goes to ``output_file + "." + str(fractionid)``.
        WORD_DICT: argument passed to ``load_lowerworddict``.
        prep_fraction_size: number of input lines per fraction.
        DATA_FILE: path of the input text file.
        output_file: base path of the NetCDF output.
        replacerate: probability of replacing a word.

    Returns:
        None.  Nothing is written when the window has no non-empty line.
    """
    wordid_dict = load_lowerworddict(WORD_DICT)
    maxwordid = len(wordid_dict) - 1

    seqname_maxlength = 10  # width of the "%010d" sequence tag
    feat_dim = 1
    output_dim = 2  # two labels are written below: 0 (replaced), 1 (original)

    if fractionid >= 0:
        start_senid = fractionid * prep_fraction_size
        end_senid = (fractionid + 1) * prep_fraction_size
    else:
        start_senid = -1
        end_senid = -1

    def fraction_sentences():
        """Yield the word list of each non-empty line inside the window."""
        with open(DATA_FILE, "r") as fi:
            for ln, line in enumerate(fi):
                if start_senid >= 0 and ln < start_senid:
                    continue
                if end_senid >= 0 and ln >= end_senid:
                    break
                line = line.strip()
                if line == "":
                    continue
                yield line.split(" ")

    # Pass 1: count sentences and words so the dimensions can be sized.
    seq_num = 0
    step_num = 0
    for senwords in fraction_sentences():
        step_num += len(senwords)
        seq_num += 1
    if seq_num == 0:
        print("fraction null: %d" % fractionid)
        return

    if fractionid >= 0:
        poutput_file = output_file + "." + str(fractionid)
    else:
        poutput_file = output_file

    nc = ds(poutput_file, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num"))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num"))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num"))

        # Pass 2: write one frame per word and one entry per sentence.
        frame_index = 0
        sen_index = 0
        for senwords in fraction_sentences():
            if start_senid >= 0:
                seqname = "%010d" % (sen_index + start_senid)
            else:
                seqname = "%010d" % sen_index

            for word in senwords:
                # random.random() is drawn for every word, randint only for
                # the replaced ones.
                if random.random() < replacerate:
                    wordid = random.randint(0, maxwordid)
                    tagid = 0
                else:
                    wordid = getwordid(word, wordid_dict)
                    tagid = 1

                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = 0
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = seqname
            ncvar_seqlengths[sen_index] = len(senwords)
            sen_index += 1
    finally:
        # Close the dataset even if a write fails part-way through.
        nc.close()
Beispiel #32
0
def main():
    """Convert the tab-separated files under WSJDATA_DIR into one netCDF file.

    Every entry of WSJDATA_DIR must have an integer name; entries below
    ``fromdataid`` or (when ``todataid`` is positive) above ``todataid``
    are skipped.  In each remaining file a non-blank line is split on tabs
    into a word (column 0) and a tag (column 1); a blank line ends the
    current sentence.

    Written to OUTPUT_FILE, per token: the ``worddict`` id of the
    lower-cased word (``inputWords``), a flag in column 0 of ``inputFeats``
    that is 1 when the word differs from its lower-cased form, and the
    ``tagdict`` id of the tag (``outputLabels``).  Per sentence: a
    zero-padded 10-digit running index (``seqTags``) and the token count
    (``seqLengths``).

    Raises:
        ValueError: an entry of WSJDATA_DIR does not have an integer name.
        IndexError: a non-blank line has fewer than two tab-separated fields.
        KeyError: a word or tag is missing from ``worddict`` / ``tagdict``.
    """
    sentences = []
    step_num = 0
    seqname_maxlength = 10
    output_dim = len(tagdict)

    for subdir in os.listdir(WSJDATA_DIR):
        dataid = int(subdir)
        if dataid < fromdataid:
            continue
        # A non-positive todataid means "no upper bound".
        if todataid > 0 and dataid > todataid:
            continue
        dirpath = WSJDATA_DIR + "/" + subdir
        for f in os.listdir(dirpath):
            seninfo = []
            with open(dirpath + "/" + f, "r") as fi:
                for line in fi:
                    line = line.strip()
                    if line == "":
                        if seninfo:
                            sentences.append(seninfo)
                        seninfo = []
                        continue
                    toks = line.split("\t")
                    word = toks[0]
                    pos = toks[1]
                    wordl = word.lower()
                    # 1 when the word contains capital letters
                    lower_flag = 0 if word == wordl else 1
                    seninfo.append((worddict[wordl], tagdict[pos], lower_flag))
                    step_num += 1
            # Keep the last sentence of a file that does not end with a blank
            # line; its tokens are already included in step_num, so dropping
            # it would leave part of the step_num dimension unwritten.
            if seninfo:
                sentences.append(seninfo)
    seq_num = len(sentences)

    print("seq_num: %d" % seq_num)
    print("step_num: %d" % step_num)

    # define netcdf file
    nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
    try:
        nc.createDimension("seq_num", seq_num)
        nc.createDimension("step_num", step_num)
        nc.createDimension("feat_dim", feat_dim)
        nc.createDimension("output_dim", output_dim)
        nc.createDimension("seqname_maxlength", seqname_maxlength)

        ncvar_seqnames = nc.createVariable("seqTags", "c",
                                           ("seq_num", "seqname_maxlength"))
        ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
        ncvar_inputfeats = nc.createVariable("inputFeats", "f4",
                                             ("step_num", "feat_dim"))
        ncvar_inputwords = nc.createVariable("inputWords", "i4",
                                             ("step_num",))
        ncvar_outputlabels = nc.createVariable("outputLabels", "i4",
                                               ("step_num",))

        # frame_index runs over all tokens of all sentences, in order.
        frame_index = 0
        for sen_index, senwords in enumerate(sentences):
            for wordid, tagid, lower_flag in senwords:
                ncvar_inputwords[frame_index] = wordid
                ncvar_inputfeats[frame_index, 0] = lower_flag
                ncvar_outputlabels[frame_index] = tagid
                frame_index += 1

            ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
            ncvar_seqlengths[sen_index] = len(senwords)
    finally:
        # Close the dataset even when a write fails.
        nc.close()
Beispiel #33
0
#!/usr/bin/python
#coding=utf-8

from netCDF4 import Dataset as ds
import numpy as np
import sys

# Dump the dimensions, variable shapes, sequence tags and sequence lengths of
# a netCDF file.  The file name is the first command-line argument and
# defaults to "train.nc".
if len(sys.argv) <= 1:
	f1 = "train.nc"
else:
	f1 = sys.argv[1]

print("checking netCDF file: " + f1)

nc1 = ds(f1, 'r', format='NETCDF4')
try:
	print("\n-------dimension-------\n")
	for dim in nc1.dimensions:
		print(nc1.dimensions[dim])

	print("\n-------variable metainfo-------\n")
	for var in nc1.variables:
		# One line per variable: "<name>: <ndim> <shape>"
		print("%s: %s %s" % (var, nc1.variables[var].ndim, nc1.variables[var].shape))

	print("\n-------seq names-------\n")
	print(nc1.variables["seqTags"][:])

	print("\n-------seq lengths-------\n")
	print(nc1.variables["seqLengths"][:])
finally:
	# Release the file handle even if one of the variables is missing.
	nc1.close()
def main():
	"""Convert the space-separated token file INPUT_DATA into a netCDF file.

	Each non-blank line of INPUT_DATA is one token, split on single spaces;
	column 0 is the word and column 4 is the tag.  A blank line ends the
	current sentence.

	Written to OUTPUT_FILE, per token: ``getwordid`` of the lower-cased word
	(``inputWords``), the ``tagdict`` id of the tag (``outputLabels``) and a
	one-hot row of four capitalisation features (``inputFeats``):
	column 0 all caps, 1 initial capital, 2 capital elsewhere, 3 no capital.
	Per sentence: a zero-padded 10-digit running index (``seqTags``) and the
	token count (``seqLengths``).

	Finally prints the module-level ``wordcount`` / ``unkwordcount``
	counters and their ratio.
	NOTE(review): these counters are presumably updated by getwordid - confirm.

	Raises:
		AssertionError: ``feat_dim`` is not 4.
		IndexError: a token line has fewer than five columns.
		KeyError: a tag is missing from ``tagdict``.
	"""
	# The feature row written below has exactly four one-hot slots.
	assert feat_dim == 4

	seqname_maxlength = 10
	output_dim = len(tagdict)

	sentences = []
	step_num = 0
	sen = []
	with open(INPUT_DATA) as fi:
		for line in fi:
			line = line.strip()
			if line == "":
				if sen:
					sentences.append(sen)
				sen = []
				continue
			sen.append(line.split(" "))
			step_num += 1
	# Keep the last sentence when the file does not end with a blank line;
	# its tokens are already counted in step_num.
	if sen:
		sentences.append(sen)
	seq_num = len(sentences)

	# define netcdf file
	nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
	try:
		nc.createDimension("seq_num", seq_num)
		nc.createDimension("step_num", step_num)
		nc.createDimension("feat_dim", feat_dim)
		nc.createDimension("output_dim", output_dim)
		nc.createDimension("seqname_maxlength", seqname_maxlength)

		ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
		ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
		ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
		ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
		ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

		# frame_index runs over all tokens of all sentences, in order.
		frame_index = 0
		for sen_index, seninfo in enumerate(sentences):
			for toks in seninfo:
				word = toks[0]
				tag = toks[4]

				wordid = getwordid(word.lower())
				tagid = tagdict[tag]

				# Pick the single active capitalisation feature.
				if word.isupper():
					cap_slot = 0
				elif word[0].isupper():
					cap_slot = 1
				elif any(ch.isupper() for ch in word):
					cap_slot = 2
				else:
					cap_slot = 3
				feats = [0] * feat_dim
				feats[cap_slot] = 1

				ncvar_inputwords[frame_index] = wordid
				ncvar_inputfeats[frame_index, :] = feats
				ncvar_outputlabels[frame_index] = tagid
				frame_index += 1

			ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
			ncvar_seqlengths[sen_index] = len(seninfo)
	finally:
		# Close the dataset even when a lookup or write fails.
		nc.close()

	print("wordcount: %d" % wordcount)
	print("unkwordcount: %d" % unkwordcount)
	# Avoid a ZeroDivisionError when no word was counted (e.g. empty input).
	oov_rate = float(unkwordcount) / wordcount if wordcount else 0.0
	print("oov rate: %f" % oov_rate)
def main():
	"""Convert the one-sentence-per-line text file DATA_FILE into a netCDF file.

	DATA_FILE is read twice: once to count the non-blank lines and their
	space-separated tokens (needed to size the netCDF dimensions), and once
	to write the data.  For every token, ``getwordid`` of the token is
	stored as both the input (``inputs``) and the target
	(``targetClasses``).  Per sentence: a zero-padded 10-digit running
	index (``seqTags``) and the token count (``seqLengths``).

	Finally prints the dictionary size and the module-level
	``total_wordnum`` / ``unk_wordnum`` counters with their ratio.
	NOTE(review): these counters are presumably updated by getwordid - confirm.
	"""
	filename_maxlength = 10
	# NOTE(review): earlier comments here mentioned an extra start-flag
	# dimension and an extra end tag, but both sizes are exactly
	# len(wordid_dict) - confirm which is intended.
	input_dim = len(wordid_dict)
	output_dim = len(wordid_dict)

	# First pass: get basic information (sentence and token counts).
	seqs_num = 0
	step_num = 0
	with open(DATA_FILE, "r") as fi:
		for line in fi:
			line = line.strip()
			if line == "":
				continue
			step_num += len(line.split(" "))
			seqs_num += 1

	# define netcdf file
	nc = ds(output_file, "w", format="NETCDF4")
	try:
		nc.createDimension("numSeqs", seqs_num)
		nc.createDimension("numTimesteps", step_num)
		nc.createDimension("inputPattSize", input_dim)
		nc.createDimension("maxSeqTagLength", filename_maxlength)
		nc.createDimension("numLabels", output_dim)

		ncvar_filenames = nc.createVariable("seqTags", "c", ("numSeqs", "maxSeqTagLength"))
		ncvar_samplenums = nc.createVariable("seqLengths", "i4", ("numSeqs",))
		ncvar_inputs = nc.createVariable("inputs", "i4", ("numTimesteps",))
		ncvar_outputs = nc.createVariable("targetClasses", "i4", ("numTimesteps",))

		# Second pass: frame_index runs over all tokens, sen_index over the
		# non-blank lines.
		frame_index = 0
		sen_index = 0
		with open(DATA_FILE, "r") as fi:
			for line in fi:
				line = line.strip()
				if line == "":
					continue
				senwords = line.split(" ")

				for token in senwords:
					wordid = getwordid(token)
					ncvar_inputs[frame_index] = wordid
					ncvar_outputs[frame_index] = wordid
					frame_index += 1

				ncvar_filenames[sen_index, 0:filename_maxlength] = "%010d" % sen_index
				ncvar_samplenums[sen_index] = len(senwords)
				sen_index += 1
	finally:
		# Close the dataset even when reading or writing fails.
		nc.close()

	print("worddict size: %d" % len(wordid_dict))
	print("total_wordnum: %d" % total_wordnum)
	print("unk_wordnum: %d" % unk_wordnum)
	# Avoid a ZeroDivisionError when no word was counted (e.g. empty input).
	oov_rate = float(unk_wordnum) / total_wordnum if total_wordnum else 0.0
	print("oov rate: %f" % oov_rate)
def main():
    """Convert the one-sentence-per-line text file DATA_FILE into a netCDF file.

    DATA_FILE is read twice: once to count the non-blank lines and their
    space-separated tokens (needed to size the netCDF dimensions), and once
    to write the data.  For every token, ``getwordid`` of the token is
    stored as both the input (``inputs``) and the target
    (``targetClasses``).  Per sentence: a zero-padded 10-digit running
    index (``seqTags``) and the token count (``seqLengths``).

    Finally prints the dictionary size and the module-level
    ``total_wordnum`` / ``unk_wordnum`` counters with their ratio.
    NOTE(review): these counters are presumably updated by getwordid - confirm.
    """
    filename_maxlength = 10
    # NOTE(review): earlier comments here mentioned an extra start-flag
    # dimension and an extra end tag, but both sizes are exactly
    # len(wordid_dict) - confirm which is intended.
    input_dim = len(wordid_dict)
    output_dim = len(wordid_dict)

    # First pass: get basic information (sentence and token counts).
    seqs_num = 0
    step_num = 0
    with open(DATA_FILE, "r") as fi:
        for line in fi:
            line = line.strip()
            if line == "":
                continue
            step_num += len(line.split(" "))
            seqs_num += 1

    # define netcdf file
    nc = ds(output_file, "w", format="NETCDF4")
    try:
        nc.createDimension("numSeqs", seqs_num)
        nc.createDimension("numTimesteps", step_num)
        nc.createDimension("inputPattSize", input_dim)
        nc.createDimension("maxSeqTagLength", filename_maxlength)
        nc.createDimension("numLabels", output_dim)

        ncvar_filenames = nc.createVariable("seqTags", "c",
                                            ("numSeqs", "maxSeqTagLength"))
        ncvar_samplenums = nc.createVariable("seqLengths", "i4", ("numSeqs",))
        ncvar_inputs = nc.createVariable("inputs", "i4", ("numTimesteps",))
        ncvar_outputs = nc.createVariable("targetClasses", "i4",
                                          ("numTimesteps",))

        # Second pass: frame_index runs over all tokens, sen_index over the
        # non-blank lines.
        frame_index = 0
        sen_index = 0
        with open(DATA_FILE, "r") as fi:
            for line in fi:
                line = line.strip()
                if line == "":
                    continue
                senwords = line.split(" ")

                for token in senwords:
                    wordid = getwordid(token)
                    ncvar_inputs[frame_index] = wordid
                    ncvar_outputs[frame_index] = wordid
                    frame_index += 1

                ncvar_filenames[sen_index, 0:filename_maxlength] = "%010d" % sen_index
                ncvar_samplenums[sen_index] = len(senwords)
                sen_index += 1
    finally:
        # Close the dataset even when reading or writing fails.
        nc.close()

    print("worddict size: %d" % len(wordid_dict))
    print("total_wordnum: %d" % total_wordnum)
    print("unk_wordnum: %d" % unk_wordnum)
    # Avoid a ZeroDivisionError when no word was counted (e.g. empty input).
    oov_rate = float(unk_wordnum) / total_wordnum if total_wordnum else 0.0
    print("oov rate: %f" % oov_rate)
def main():
	"""Convert the tab-separated files under WSJDATA_DIR into one netCDF file.

	Every entry of WSJDATA_DIR must have an integer name; entries below
	``fromdataid`` or (when ``todataid`` is positive) above ``todataid``
	are skipped.  In each remaining file a non-blank line is split on tabs
	into a word (column 0) and a tag (column 1); a blank line ends the
	current sentence.

	Written to OUTPUT_FILE, per token: the ``worddict`` id of the
	lower-cased word (``inputWords``), a flag in column 0 of ``inputFeats``
	that is 1 when the word differs from its lower-cased form, and the
	``tagdict`` id of the tag (``outputLabels``).  Per sentence: a
	zero-padded 10-digit running index (``seqTags``) and the token count
	(``seqLengths``).

	Raises:
		ValueError: an entry of WSJDATA_DIR does not have an integer name.
		IndexError: a non-blank line has fewer than two tab-separated fields.
		KeyError: a word or tag is missing from ``worddict`` / ``tagdict``.
	"""
	sentences = []
	step_num = 0
	seqname_maxlength = 10
	output_dim = len(tagdict)

	for subdir in os.listdir(WSJDATA_DIR):
		dataid = int(subdir)
		if dataid < fromdataid:
			continue
		# A non-positive todataid means "no upper bound".
		if todataid > 0 and dataid > todataid:
			continue
		dirpath = WSJDATA_DIR + "/" + subdir
		for f in os.listdir(dirpath):
			seninfo = []
			with open(dirpath + "/" + f, "r") as fi:
				for line in fi:
					line = line.strip()
					if line == "":
						if seninfo:
							sentences.append(seninfo)
						seninfo = []
						continue
					toks = line.split("\t")
					word = toks[0]
					pos = toks[1]
					wordl = word.lower()
					# 1 when the word contains capital letters
					lower_flag = 0 if word == wordl else 1
					seninfo.append((worddict[wordl], tagdict[pos], lower_flag))
					step_num += 1
			# Keep the last sentence of a file that does not end with a blank
			# line; its tokens are already included in step_num, so dropping
			# it would leave part of the step_num dimension unwritten.
			if seninfo:
				sentences.append(seninfo)
	seq_num = len(sentences)

	print("seq_num: %d" % seq_num)
	print("step_num: %d" % step_num)

	# define netcdf file
	nc = ds(OUTPUT_FILE, "w", format="NETCDF4")
	try:
		nc.createDimension("seq_num", seq_num)
		nc.createDimension("step_num", step_num)
		nc.createDimension("feat_dim", feat_dim)
		nc.createDimension("output_dim", output_dim)
		nc.createDimension("seqname_maxlength", seqname_maxlength)

		ncvar_seqnames = nc.createVariable("seqTags", "c", ("seq_num", "seqname_maxlength"))
		ncvar_seqlengths = nc.createVariable("seqLengths", "i4", ("seq_num",))
		ncvar_inputfeats = nc.createVariable("inputFeats", "f4", ("step_num", "feat_dim"))
		ncvar_inputwords = nc.createVariable("inputWords", "i4", ("step_num",))
		ncvar_outputlabels = nc.createVariable("outputLabels", "i4", ("step_num",))

		# frame_index runs over all tokens of all sentences, in order.
		frame_index = 0
		for sen_index, senwords in enumerate(sentences):
			for wordid, tagid, lower_flag in senwords:
				ncvar_inputwords[frame_index] = wordid
				ncvar_inputfeats[frame_index, 0] = lower_flag
				ncvar_outputlabels[frame_index] = tagid
				frame_index += 1

			ncvar_seqnames[sen_index, 0:seqname_maxlength] = "%010d" % sen_index
			ncvar_seqlengths[sen_index] = len(senwords)
	finally:
		# Close the dataset even when a write fails.
		nc.close()