def CreateTrainDev(in_direct, out_direct_train, out_direct_test, train_ratio, test_ratio, train_doc_file, out_train_tbl):
    """Split a corpus into train/test folders and build the training stats table.

    Steps, in order:
      1. Shuffle the entries of ``in_direct`` and copy each one into either
         ``out_direct_train`` or ``out_direct_test``.
      2. Run ``PseudoDocCreator.calc`` on the training folder, writing to
         ``train_doc_file``.
      3. Run ``LoadDBV1.Load`` to load ``train_doc_file`` into ``out_train_tbl``.
      4. Run ``LocalSpatialStatsV1.calc`` with the "gi" statistic.

    Args:
        in_direct: Directory holding the source files to split.
        out_direct_train: Directory receiving the training copies (created
            if it does not exist).
        out_direct_test: Directory receiving the test copies (created if it
            does not exist).
        train_ratio: Accepted and forwarded to the splitter, but not used by
            the split rule; only ``test_ratio`` influences the split.
        test_ratio: A file goes to the test folder while
            ``test_files <= train_files * test_ratio`` and at least one
            training file already exists.
        train_doc_file: Path passed to the pseudo-document creator, the DB
            loader and the stats step.
        out_train_tbl: Name of the document table; also the prefix of the
            stats table name.

    Returns:
        A ``(train_tbl, out_direct_test)`` tuple, where ``train_tbl`` is
        ``out_train_tbl + "_kernel<kern_dist // 1000>k_<kerntype>_gi"``.

    NOTE(review): ``conn_info`` and ``gtbl`` are not parameters or locals;
    they are looked up in the enclosing module scope when the function runs
    and must be defined there before this is called.
    """
    cores = 2
    kerntype = "epanech"  # presumably the Epanechnikov kernel -- confirm in LocalSpatialStatsV1
    kern_dist = 100000
    file_type = "lgl"

    def CopyUtf8(src_path, dst_path):
        """Copy one file, decoding and re-encoding its text as UTF-8."""
        # Both handles are context-managed so neither leaks if the read or
        # the write raises. The source is opened first so that an unreadable
        # input does not leave an empty output file behind.
        with io.open(src_path, 'r', encoding='utf-8') as src:
            with io.open(dst_path, 'w', encoding='utf-8') as dst:
                dst.write(src.read())

    def SplitFiles(in_direct, out_direct_train, out_direct_test, train_ratio, test_ratio):
        """Shuffle the files of in_direct and copy each to train or test."""
        train_files = 0
        test_files = 0
        for directory in (out_direct_train, out_direct_test):
            if not os.path.exists(directory):
                os.makedirs(directory)
        files = os.listdir(in_direct)
        shuffle(files)
        for f in files:
            src_path = os.path.join(in_direct, f)
            # The first file always lands in train (train_files >= 1 is
            # required before anything can be routed to test).
            if float(test_files) <= float(train_files) * test_ratio and train_files >= 1:
                CopyUtf8(src_path, os.path.join(out_direct_test, f))
                test_files += 1
            else:
                CopyUtf8(src_path, os.path.join(out_direct_train, f))
                train_files += 1
        # Single-argument form: prints the same text as the former
        # `print "label", n` statements and parses on Python 2 and 3 alike.
        print("Test Files : %d" % test_files)
        print("Train Files:  %d" % train_files)

    SplitFiles(in_direct, out_direct_train, out_direct_test, train_ratio, test_ratio)

    # Create pseudo-documents from toponyms.
    import PseudoDocCreator as PSD
    PSD.calc(out_direct_train, train_doc_file, 100, file_type)

    # Create the document table.
    import LoadDBV1 as LDB
    LDB.Load(train_doc_file, out_train_tbl, conn_info, "wiki")

    # Calculate the local stats tables.
    import LocalSpatialStatsV1 as LS
    # Floor division keeps the suffix an integer ("100k") regardless of
    # whether true division is in effect.
    train_tbl_part1 = out_train_tbl + "_kernel" + str(kern_dist // 1000) + "k" + "_" + kerntype
    # Positional order mirrors the call made elsewhere in this file:
    # LSS.calc(f, statistic, dtbl, gtbl, conn_info, outf, out_tbl, kern_dist,
    #          kerntype, traintype, listuse, whitelist_file, grid_min, cores,
    #          include_zero)
    LS.calc(train_doc_file, "gi", out_train_tbl, gtbl, conn_info, "DummyOutfile.txt",
            train_tbl_part1, kern_dist, kerntype, "wiki", False, "any", 0, cores, False)

    train_tbl = train_tbl_part1 + "_gi"
    return train_tbl, out_direct_test
except: print "Did not provide a valid kerntype option, defaulting to uniform" kerntype = "uniform" #Should probabilities of zero be written to tbl? (yes for similarity scores, no for Top Resolver) try: include_zero = args[args.index('-include_zero')+1] if include_zero.lower() == "false": include_zero = False else: include_zero = True except: print "Did not provide include zero argument, defaulting to True" include_zero = True LSS.calc(f, statistic, dtbl, gtbl, conn_info, outf, out_tbl, kern_dist, kerntype, traintype, listuse, whitelist_file, grid_min, cores, include_zero) ##########################Load a database with | Doc ID | Geometry | table##################### if mode_arg.lower() == "loaddb": import LoadDBV1 as loadDB print "Starting DB Load Process" if '-tf' in args: f = args[args.index("-tf")+1] elif '-df' in args: f = args[args.index("-df")+1] elif '-tstf' in args: f = args[args.index("-tstf")+1] try: