def RunLearnAgg(dataset, a, batchsize, epochs, mod_type, h1_sz):
    """Train a model on the given dataset and log run metadata to a text file.

    Loads train/test splits, builds the learning graph via BuildLearnAlgo,
    and runs TrainLearnAlng, streaming progress into a per-run log file
    under <tabledir>/runs/<dataset>/.

    Parameters:
        dataset:   dataset identifier used for loading and for the run dir.
        a:         algorithm hyperparameter passed to BuildLearnAlgo
                   (presumably the learning rate -- confirm against callers).
        batchsize: minibatch size for training.
        epochs:    number of training epochs.
        mod_type:  model-type tag used in the log filename and graph builder.
        h1_sz:     hidden-layer-1 size passed to BuildLearnAlgo.
    """
    run_dir = tabledir + 'runs/' + str(dataset) + '/'
    ensure_dir(run_dir)
    # BUG FIX: the log filename previously concatenated str(mod_type) twice
    # (e.g. 'NBNB_...'), which was clearly an accidental duplication.
    log_path = run_dir + str(mod_type) + '_' + str(a) + '_' + str(epochs) + '.txt'
    # Use a context manager so the log file is closed even if training raises.
    with open(log_path, 'w') as f:
        f.write('Dataset: ' + str(dataset))
        f.write('\nModel Type: ' + str(mod_type))
        f.write('\nNumber of Epochs: ' + str(epochs))
        Xt = loadX(dataset, 'train')
        Yt = loadY(dataset, 'train')
        Xd = loadX(dataset, 'test')
        Yd = loadY(dataset, 'test')
        # n = feature count, c = number of classes (from array shapes).
        n = Xt.shape[1]
        c = Yt.shape[1]
        x, y, y_, train_step = BuildLearnAlgo(n, c, a, mod_type, h1_sz)
        TrainLearnAlng(Xt, Yt, Xd, Yd, x, y, y_, train_step, epochs, batchsize, f)
def FixDates(start_date, end_date, metarchdircorr):
    """Normalize the 'date' field of every archived article in a date range.

    For each monthly JSON file in `metarchdircorr`:
      * articles whose 'date' list has more than 5 entries are collapsed
        to the entries at positions 2, 6 and 10;
      * articles with no 'date' key get a default of [year, month, '25'].
    Each file is rewritten in place.
    """
    ensure_dir(metarchdircorr)
    for date in DateList(start_date, end_date):
        year, month = date[0], date[1]
        path = metarchdircorr + str(year) + '_' + str(month) + '.json'
        with open(path) as handle:
            contents = json.load(handle)
        for doc in contents['docs']:
            if 'date' not in doc:
                # No date recorded: default to the file's year/month, day 25.
                doc['date'] = [str(year), str(month), str(25)]
            elif len(doc['date']) > 5:
                doc['date'] = [doc['date'][2], doc['date'][6], doc['date'][10]]
                print(doc['date'])
        with open(path, 'w') as out:
            json.dump(contents, out)
def MonthlyStat(start_date, end_date, trainsize):
    """Count monthly occurrences of each acceptable (stemmed) word.

    Reads the first `trainsize` articles of each monthly metadata file,
    accumulates per-month counts for every word passing AcceptableString,
    then materializes a (months x words) count matrix and saves it via
    saveMWarr.

    Returns (datelist, word list, count matrix).
    """
    datelist = DateList(start_date, end_date)
    datedict = MakeDictLabel(datelist)
    counts = dict()   # stem -> {"<year><month>": occurrence count}
    wordind = dict()  # stem -> column index in the output matrix
    ensure_dir(tabledir)
    ensure_dir(metarchdir)
    for date in datelist:
        print(date)
        print(len(counts))
        fname = str(date[0]) + "_" + str(date[1]) + ".json"
        with open(metarchdir + fname) as handle:
            metacont = json.load(handle)
        month_key = str(date[0]) + str(date[1])
        for art_idx in range(trainsize):
            for token in metacont["docs"][art_idx]["content"]:
                if AcceptableString(token):
                    stem = ps.stem(token.lower())
                    per_month = counts.setdefault(stem, {})
                    per_month[month_key] = per_month.get(month_key, 0) + 1
    print("done reading")
    MWarr = np.zeros((len(datelist), len(counts)), dtype=np.int32)
    # NOTE: '<U20' truncates stems longer than 20 characters.
    wordarray = np.empty(len(counts), dtype='<U20')
    for col, (stem, per_month) in enumerate(counts.items()):
        wordind[stem] = col
        wordarray[col] = stem
        for month_key, n in per_month.items():
            MWarr[datedict[month_key], col] = n
    saveMWarr(
        MWarr, wordarray, datelist,
        'MonthWord_' + start_date + '_' + end_date + '_'
        + str(trainsize) + '_2' + '.txt')
    return datelist, wordarray.tolist(), MWarr
def saveDataSparse(tabledir, AWarr_train, AWarr_dev, AWarr_test,
                   Ylabels_train, Ylabels_dev, Ylabels_test, ftwords,
                   start_date, end_date, count_floor, tend, method,
                   file_path, timerange):
    """Persist train/dev/test word-count matrices and labels to a new run dir.

    Creates the next sequentially numbered subdirectory under
    <tabledir>/ml_dir/ (1, 2, 3, ...), saves the three matrices as sparse
    CSR .npz files, labels and feature words as text, and writes a
    config.txt summarizing the run parameters.
    """
    mldir = tabledir + 'ml_dir/'
    ensure_dir(mldir)
    # BUG FIX: the original removed non-integer names from the directory
    # listing while iterating over that same list, which skips the element
    # after every removal. Collect integer-named dirs into a new list instead.
    numbered = []
    for fn in os.listdir(mldir):
        try:
            numbered.append(int(fn))
        except ValueError:
            pass  # ignore non-numeric entries (e.g. stray files)
    if not numbered:
        outdir = '1/'
    else:
        outdir = str(max(numbered) + 1) + '/'
    os.makedirs(mldir + outdir)
    outfile = mldir + outdir
    save_npz(outfile + 'AWarr_train.npz', csr_matrix(AWarr_train))
    save_npz(outfile + 'AWarr_dev.npz', csr_matrix(AWarr_dev))
    save_npz(outfile + 'AWarr_test.npz', csr_matrix(AWarr_test))
    np.savetxt(outfile + 'Ylabels_train.txt', Ylabels_train, fmt='%d')
    np.savetxt(outfile + 'Ylabels_dev.txt', Ylabels_dev, fmt='%d')
    np.savetxt(outfile + 'Ylabels_test.txt', Ylabels_test, fmt='%d')
    np.savetxt(outfile + 'Words.txt', ftwords, fmt='%s')
    # Context manager ensures config.txt is closed even if a write fails.
    with open(outfile + 'config.txt', 'w') as f:
        f.write('Start Date: ' + start_date)
        f.write('\nEnd Date: ' + end_date)
        f.write('\nTraining Set Size = ' + str(np.size(AWarr_train, axis=0)))
        f.write('\nDev Set Size = ' + str(np.size(AWarr_dev, axis=0)))
        f.write('\nTest Set Size = ' + str(np.size(AWarr_test, axis=0)))
        f.write('\nNumber of feature words = ' + str(len(ftwords)))
        f.write('\nFloor on total number of words =' + str(count_floor))
        f.write('\nWord selection method: ' + method)
        f.write('\nSource MonthWord File: ' + file_path)
        f.write('\nComputational Time: ' + str(tend / 60.0) + ' minutes ')
        f.write('\nTime range: ' + timerange)
def RemoveUselessWords(start_date, end_date):
    """Strip words containing disallowed characters from archived articles.

    For each month in the range, loads the article metadata, keeps only
    words whose unicode-escaped form matches ^[a-zA-Z0-9.,()$-]*$, and
    rewrites the JSON file under ./metarchdircorr/.
    """
    direc = './metarchdircorr/'
    ensure_dir(direc)
    datelist = DateList(start_date, end_date)
    # Compile once outside the loops; pattern is applied to every word.
    allowed = re.compile(r"^[a-zA-Z0-9.,()$-]*$")
    for date in datelist:
        metacont, filename = readMetacont(date)
        for article in metacont['docs']:
            # BUG FIX: the original called list.remove() while iterating the
            # same list, which skips the element after every removal (and
            # remove() deletes the first equal element, not necessarily the
            # current one). Rebuild the filtered list instead.
            article['content'] = [
                word for word in article['content']
                if allowed.match(
                    word.encode('unicode-escape').decode('ascii')) is not None
            ]
        with open(direc + filename, 'w') as outfile:
            json.dump(metacont, outfile)
def NYTmetaquery(start_date, end_date):
    """Fetch NYT Archive API monthly JSON dumps for a date range.

    Skips months already present in `archdir`; otherwise requests the
    archive endpoint, saves the decoded JSON to disk, and sleeps 3 s
    between downloads to stay under the API rate limit.
    """
    DateArray = DateList(start_date, end_date)
    ensure_dir(archdir)
    print('Running NYTmetaquery for date range: ', DateArray[0], DateArray[-1])
    for date in DateArray:
        print(date)
        name = "nyt_" + str(date[0]) + "_" + str(date[1]) + ".json"
        # Guard clause: already downloaded -> nothing to do for this month.
        if not FileNotinDir(archdir, name):
            continue
        request_string = (
            "https://api.nytimes.com/svc/archive/v1/"
            + str(date[0]) + "/" + str(date[1])
            + ".json?api-key=" + apikey)
        page = UrlRequest(request_string)
        if page:
            articles = json.loads(page.decode('utf-8'))
            with open(archdir + name, 'w') as outfile:
                json.dump(articles, outfile)
        else:
            print("page " + name + " was not read")
        time.sleep(3)
# --- Experiment configuration (script-level settings) ---
count_floor = 100  # floor on total word count used in feature selection
method = 'sumvar'  # word-selection method tag
MWfile = 'MonthWord_198701_201612_700.txt'  # source month-word count file
timerange = 'yearly'  # label granularity: 'yearly' or 'monthly'
# Flags controlling the run (0/1 switches).
load = 1
version = 52
debug = 0
save = 0
plot = 0
ML = 'NB'  # classifier choice tag (presumably Naive Bayes -- confirm)
trainsizelist = [800]  # training-set sizes to sweep over
#num_wordlist = [7000]
thresh = 10
threshlist = [30]
statdir = gdrive_dir + 'statdir/'
ensure_dir(statdir)
# Accumulators for per-trainsize results collected in the loop below.
err_train = []
err_test = []
err_dev = []
ptr = []
pte = []
pdv = []
ytr = []
yte = []
ydv = []
nullarticles_train = []
nullarticles_dev = []
# NOTE(review): `trainsize` is not defined at this point in the visible code;
# this print likely raises NameError unless trainsize is set earlier in the
# file -- confirm, or move the print inside the loop.
print(str(trainsize))
# Loop body continues beyond this chunk of the file.
for trainsize in trainsizelist:
from helpfunc import ensure_dir
from config import *  # NOTE(review): star import; supplies metarchdir, archdir, artdir, etc.

# Create the working directories at import time so downstream code can
# assume they exist.
ensure_dir(metarchdir)
ensure_dir(archdir)
ensure_dir(artdir)
def MaketableFast(ftword, start_date, end_date, trainsize, devsize, testsize, timerange):
    """Build fixed-size train/dev/test article-word count matrices.

    For every month in [start_date, end_date], the first `trainsize`
    articles go to train, the next `devsize` to dev, the next `testsize`
    to test. Words are counted only if present in `ftword` (no stemming,
    unlike Maketable). Labels come from the per-month or per-year dict
    selected by `timerange`.

    Returns (AWarr_train, AWarr_dev, AWarr_test,
             Ylabels_train, Ylabels_dev, Ylabels_test).

    Raises:
        ValueError: if `timerange` is neither 'yearly' nor 'monthly'.
    """
    datelist = DateList(start_date, end_date)
    n_words = len(ftword)
    AWarr_train = np.zeros((trainsize * len(datelist), n_words), dtype=np.int16)
    AWarr_dev = np.zeros((devsize * len(datelist), n_words), dtype=np.int16)
    AWarr_test = np.zeros((testsize * len(datelist), n_words), dtype=np.int16)
    Ylabels_train = np.zeros(trainsize * len(datelist), dtype=np.int16)
    Ylabels_dev = np.zeros(devsize * len(datelist), dtype=np.int16)
    Ylabels_test = np.zeros(testsize * len(datelist), dtype=np.int16)
    if timerange == "yearly":
        datedict = MakeDictLabelYearly(datelist)
    elif timerange == "monthly":
        datedict = MakeDictLabel(datelist)
    else:
        # BUG FIX: the original only printed a message and then crashed with
        # NameError on the undefined `datedict`; fail loudly instead.
        raise ValueError('Inexistent timerange: ' + str(timerange))
    wordind = MakeDictIndex(ftword)
    ensure_dir(metarchdir)
    print(set(wordind))
    for j, date in enumerate(datelist):
        print(date)
        metacont, filename = readMetacont(date)
        artlist = metacont['docs']
        label = datedict[str(date[0]) + str(int(date[1]))]
        # The three splits take consecutive, non-overlapping article ranges.
        _fill_split(artlist, 0, trainsize, trainsize * j,
                    label, wordind, AWarr_train, Ylabels_train)
        _fill_split(artlist, trainsize, devsize, devsize * j,
                    label, wordind, AWarr_dev, Ylabels_dev)
        _fill_split(artlist, trainsize + devsize, testsize, testsize * j,
                    label, wordind, AWarr_test, Ylabels_test)
    return AWarr_train, AWarr_dev, AWarr_test, Ylabels_train, Ylabels_dev, Ylabels_test


def _fill_split(artlist, offset, size, row0, label, wordind, AWarr, Ylabels):
    """Count feature-word occurrences for `size` articles starting at `offset`,
    writing counts into AWarr and the label into Ylabels, both from row0."""
    for i in range(size):
        wlist = artlist[offset + i]["content"]
        Ylabels[row0 + i] = label
        for word in wlist:
            if word in wordind:
                AWarr[row0 + i, wordind[word]] += 1
def Maketable(ftword, start_date, end_date, trainsize, devsize):
    """Build train/dev/test article-word matrices with variable dev/test sizes.

    For each month, caps the article list at 1000, takes the first
    `trainsize` articles for training, a proportionally scaled dev slice,
    and the remainder for test. Words are stemmed (ps.stem) before lookup
    in the feature-word index. Dev/test matrices grow by concatenation
    because their per-month sizes vary.

    Returns (AWarr_train, AWarr_dev, AWarr_test,
             Ylabels_train, Ylabels_dev, Ylabels_test).
    """
    datelist = DateList(start_date, end_date)
    # NOTE(review): the training matrix has len(ftword)+1 columns while the
    # dev/test matrices have len(ftword); the extra column is never written
    # and stays zero -- confirm whether it is intentional (e.g. bias column).
    AWarr_train = np.zeros((trainsize * len(datelist), len(ftword) + 1),
                           dtype=np.int32)
    AWarr_dev = np.array([], dtype=np.int32)
    AWarr_test = np.array([], dtype=np.int32)
    Ylabels_train = np.zeros(trainsize * len(datelist), dtype=np.int32)
    Ylabels_dev = np.array([], dtype=np.int32)
    Ylabels_test = np.array([], dtype=np.int32)
    datedict = MakeDictLabel(datelist)
    wordind = MakeDictIndex(ftword)
    ensure_dir(metarchdir)
    devsize0 = devsize  # remember the requested dev size; devsize is rescaled per month
    print(set(wordind))
    for j in range(len(datelist)):
        date = datelist[j]
        print(date)
        metacont, filename = readMetacont(date)
        # Cap each month at 1000 articles.
        if len(metacont['docs']) >= 1000:
            artlist = metacont['docs'][0:1000]
        else:
            artlist = metacont['docs']
        lenart = len(artlist)
        # BUG FIX: removed a stray pdb.set_trace() debugger breakpoint that
        # halted every iteration of this loop.
        # BUG FIX: np.int (alias removed in NumPy 1.24) replaced by builtin int.
        # Scale dev size to the fraction of non-training articles available.
        devsize = int(
            np.round(devsize0 * (lenart - trainsize) / (1000 - trainsize)))
        row0train = trainsize * j
        artnum = len(artlist)
        AWarr_test_partial = np.zeros(
            (artnum - (trainsize + devsize), len(ftword)), dtype=np.int32)
        Ylabels_test_partial = np.zeros(artnum - (trainsize + devsize),
                                        dtype=np.int32)
        AWarr_dev_partial = np.zeros((devsize, len(ftword)), dtype=np.int32)
        Ylabels_dev_partial = np.zeros(devsize, dtype=np.int32)
        # Same label for every article of this month.
        label = datedict[str(date[0]) + str(int(date[1]))]
        for i in range(trainsize):
            wlist = artlist[i]["content"]
            Ylabels_train[row0train + i] = label
            for word in wlist:
                sword = ps.stem(word.lower())
                if sword in wordind:
                    AWarr_train[i + row0train, wordind[sword]] += 1
        for i in range(devsize):
            wlist = artlist[i + trainsize]["content"]
            Ylabels_dev_partial[i] = label
            for word in wlist:
                sword = ps.stem(word.lower())
                if sword in wordind:
                    AWarr_dev_partial[i, wordind[sword]] += 1
        for i in range(artnum - devsize - trainsize):
            wlist = artlist[i + trainsize + devsize]["content"]
            Ylabels_test_partial[i] = label
            for word in wlist:
                sword = ps.stem(word.lower())
                if sword in wordind:
                    AWarr_test_partial[i, wordind[sword]] += 1
        # First month initializes the arrays; later months concatenate.
        if np.size(AWarr_test) != 0:
            AWarr_test = np.concatenate((AWarr_test, AWarr_test_partial), axis=0)
            Ylabels_test = np.concatenate((Ylabels_test, Ylabels_test_partial), axis=0)
            AWarr_dev = np.concatenate((AWarr_dev, AWarr_dev_partial), axis=0)
            Ylabels_dev = np.concatenate((Ylabels_dev, Ylabels_dev_partial), axis=0)
        else:
            AWarr_test = AWarr_test_partial
            Ylabels_test = Ylabels_test_partial
            AWarr_dev = AWarr_dev_partial
            Ylabels_dev = Ylabels_dev_partial
    return AWarr_train, AWarr_dev, AWarr_test, Ylabels_train, Ylabels_dev, Ylabels_test