# Beispiel #1
# 0
def RunLearnAgg(dataset, a, batchsize, epochs, mod_type, h1_sz):
    """Train a model on `dataset` and log the run parameters to a text file.

    Loads the train/test splits, builds the model graph with BuildLearnAlgo,
    trains it with TrainLearnAlng, and writes a short run summary into
    ``tabledir + 'runs/<dataset>/'``.

    Parameters
    ----------
    dataset : data-set identifier understood by loadX/loadY.
    a : hyperparameter forwarded to BuildLearnAlgo (exact meaning defined
        there -- TODO confirm, presumably a learning rate).
    batchsize : minibatch size for training.
    epochs : number of training epochs.
    mod_type : model-type identifier forwarded to BuildLearnAlgo.
    h1_sz : hidden-layer size forwarded to BuildLearnAlgo.
    """
    run_dir = tabledir + 'runs/' + str(dataset) + '/'
    ensure_dir(run_dir)

    # BUG FIX: the original concatenated str(mod_type) twice when building
    # the log filename; write it once.
    log_path = run_dir + str(mod_type) + '_' + str(a) + '_' + str(epochs) + '.txt'

    # Context manager ensures the log file is closed even if training raises.
    with open(log_path, 'w') as f:
        f.write('Dataset: ' + str(dataset))
        f.write('\nModel Type: ' + str(mod_type))
        f.write('\nNumber of Epochs: ' + str(epochs))

        Xt = loadX(dataset, 'train')
        Yt = loadY(dataset, 'train')
        Xd = loadX(dataset, 'test')
        Yd = loadY(dataset, 'test')

        n = Xt.shape[1]  # number of input features
        c = Yt.shape[1]  # number of output classes
        x, y, y_, train_step = BuildLearnAlgo(n, c, a, mod_type, h1_sz)

        # The open file handle is passed on so training can append metrics.
        TrainLearnAlng(Xt, Yt, Xd, Yd, x, y, y_, train_step, epochs,
                       batchsize, f)
def FixDates(start_date, end_date, metarchdircorr):
    """Normalize the 'date' field of every archived article.

    For each month between start_date and end_date, loads the archive JSON
    from `metarchdircorr`, repairs malformed date lists (keeping positions
    2, 6 and 10 of over-long entries), fills in a default date of the 25th
    for articles missing one, and writes the file back in place.
    """
    ensure_dir(metarchdircorr)

    for date in DateList(start_date, end_date):
        path = metarchdircorr + '{}_{}.json'.format(date[0], date[1])
        with open(path) as src:
            contents = json.load(src)

        for doc in contents['docs']:
            if 'date' not in doc:
                # No date recorded: default to the 25th of this month.
                doc['date'] = [str(date[0]), str(date[1]), str(25)]
                continue
            if len(doc['date']) > 5:
                # Over-long garbled entry: keep the three meaningful slots.
                doc['date'] = [doc['date'][i] for i in (2, 6, 10)]
            print(doc['date'])

        with open(path, 'w') as dst:
            json.dump(contents, dst)
# Beispiel #3
# 0
def MonthlyStat(start_date, end_date, trainsize):
    """Count stemmed-word frequencies per month over the archive.

    Reads the first `trainsize` articles of each monthly archive file,
    accumulates per-month counts for every acceptable stemmed word, and
    saves the resulting month-by-word matrix via saveMWarr.

    Returns (datelist, word list, month-by-word count matrix).
    """
    datelist = DateList(start_date, end_date)
    datedict = MakeDictLabel(datelist)
    counts = dict()  # stemmed word -> {month key -> occurrence count}
    ensure_dir(tabledir)
    ensure_dir(metarchdir)

    for date in datelist:
        print(date)
        print(len(counts))
        fname = "{}_{}.json".format(date[0], date[1])
        with open(metarchdir + fname) as zfile:
            metacont = json.load(zfile)

            # Month key is invariant within this file; compute it once.
            key = str(date[0]) + str(date[1])
            for i in range(trainsize):
                for word in metacont["docs"][i]["content"]:
                    if not AcceptableString(word):
                        continue
                    stem = ps.stem(word.lower())
                    per_month = counts.setdefault(stem, {})
                    per_month[key] = per_month.get(key, 0) + 1

    print("done reading")
    MWarr = np.zeros((len(datelist), len(counts)), dtype=np.int32)
    wordarray = np.empty(len(counts), dtype='<U20')

    # One column per distinct stemmed word, one row per month.
    for col, (stem, per_month) in enumerate(counts.items()):
        wordarray[col] = stem
        for mkey, n_occ in per_month.items():
            MWarr[datedict[mkey], col] = n_occ

    saveMWarr(
        MWarr, wordarray, datelist, 'MonthWord_' + start_date + '_' +
        end_date + '_' + str(trainsize) + '_2' + '.txt')

    return datelist, wordarray.tolist(), MWarr
# Beispiel #4
# 0
def saveDataSparse(tabledir, AWarr_train, AWarr_dev, AWarr_test, Ylabels_train, Ylabels_dev, Ylabels_test, ftwords, \
 start_date, end_date, count_floor, tend, method, file_path, timerange):
    """Persist one dataset version under ``tabledir/ml_dir/<n>/``.

    Picks the next free integer subdirectory, saves the train/dev/test
    article-word matrices as sparse .npz files, the labels and feature
    words as text, and writes a config.txt describing the run.
    """
    mldir = tabledir + 'ml_dir/'
    ensure_dir(mldir)

    # BUG FIX: the original removed entries from the list it was iterating
    # (dirlist.remove inside `for fn in dirlist`), which skips elements.
    # Collect the numeric version directories into a fresh list instead.
    versions = []
    for fn in os.listdir(mldir):
        try:
            versions.append(int(fn))
        except ValueError:  # was a bare `except`; only non-numeric names matter
            pass

    # Next version is max + 1, or 1 when no numeric directory exists yet.
    next_version = max(versions) + 1 if versions else 1
    outdir = str(next_version) + '/'
    os.makedirs(mldir + outdir)

    outfile = mldir + outdir

    save_npz(outfile + 'AWarr_train.npz', csr_matrix(AWarr_train))
    save_npz(outfile + 'AWarr_dev.npz', csr_matrix(AWarr_dev))
    save_npz(outfile + 'AWarr_test.npz', csr_matrix(AWarr_test))

    np.savetxt(outfile + 'Ylabels_train.txt', Ylabels_train, fmt='%d')
    np.savetxt(outfile + 'Ylabels_dev.txt', Ylabels_dev, fmt='%d')
    np.savetxt(outfile + 'Ylabels_test.txt', Ylabels_test, fmt='%d')

    np.savetxt(outfile + 'Words.txt', ftwords, fmt='%s')

    # Context manager replaces the manual open/close pair.
    with open(outfile + 'config.txt', 'w') as f:
        f.write('Start Date: ' + start_date)
        f.write('\nEnd Date: ' + end_date)
        f.write('\nTraining Set Size = ' + str(np.size(AWarr_train, axis=0)))
        f.write('\nDev Set Size = ' + str(np.size(AWarr_dev, axis=0)))
        f.write('\nTest Set Size = ' + str(np.size(AWarr_test, axis=0)))
        f.write('\nNumber of feature words = ' + str(len(ftwords)))
        f.write('\nFloor on total number of words =' + str(count_floor))
        f.write('\nWord selection method: ' + method)
        f.write('\nSource MonthWord File: ' + file_path)
        f.write('\nComputational Time: ' + str(tend / 60.0) + ' minutes ')
        f.write('\nTime range: ' + timerange)
def RemoveUselessWords(start_date, end_date):
    """Drop words containing characters outside [a-zA-Z0-9.,()$-].

    For each month in the range, loads the archive via readMetacont,
    filters every article's 'content' word list, and writes the cleaned
    archive to './metarchdircorr/'.
    """
    direc = './metarchdircorr/'
    ensure_dir(direc)

    # Hoist the pattern out of the loops; it never changes.
    pattern = re.compile("^[a-zA-Z0-9.,()$-]*$")

    datelist = DateList(start_date, end_date)
    for date in datelist:

        metacont, filename = readMetacont(date)
        for article in metacont['docs']:
            # BUG FIX: the original called list.remove() on the list it was
            # iterating, which skips the element following each removal and
            # leaves some bad words in place. Build a filtered list instead.
            article['content'] = [
                word for word in article['content']
                if pattern.match(
                    word.encode('unicode-escape').decode('ascii')) is not None
            ]

        with open(direc + filename, 'w') as outfile:
            json.dump(metacont, outfile)
# Beispiel #6
# 0
def NYTmetaquery(start_date, end_date):
    """Download NYT archive metadata, one JSON file per month.

    Skips months already present in `archdir`; sleeps 3 seconds between
    API requests to stay polite with the rate limit.
    """
    dates = DateList(start_date, end_date)
    ensure_dir(archdir)

    print('Running NYTmetaquery for date range: ', dates[0], dates[-1])
    for date in dates:
        print(date)

        name = "nyt_{}_{}.json".format(date[0], date[1])

        # Guard clause: already downloaded -> nothing to do for this month.
        if not FileNotinDir(archdir, name):
            continue

        request_string = ("https://api.nytimes.com/svc/archive/v1/" +
                          str(date[0]) + "/" + str(date[1]) +
                          ".json?api-key=" + apikey)
        page = UrlRequest(request_string)

        if page:
            with open(archdir + name, 'w') as outfile:
                json.dump(json.loads(page.decode('utf-8')), outfile)
        else:
            print("page " + name + " was not read")
        time.sleep(3)
# Beispiel #7
# 0
# Script-level configuration for one classification experiment.
count_floor = 100  # floor on total word counts (written into the run config)
method = 'sumvar'  # word-selection method name
MWfile = 'MonthWord_198701_201612_700.txt'  # source month-word count table
timerange = 'yearly'  # label granularity: 'yearly' or 'monthly'
load = 1  # flag: 1 = enabled, 0 = disabled
version = 52
debug = 0
save = 0
plot = 0
ML = 'NB'  # classifier selector; presumably Naive Bayes -- TODO confirm
trainsizelist = [800]  # training-set sizes to sweep over
#num_wordlist = [7000]
thresh = 10
threshlist = [30]
statdir = gdrive_dir + 'statdir/'  # output directory for run statistics
ensure_dir(statdir)

# Accumulators filled per sweep iteration: error rates, predictions (p*)
# and labels (y*) for the train / test / dev splits.
err_train = []
err_test = []
err_dev = []
ptr = []  # predictions, train
pte = []  # predictions, test
pdv = []  # predictions, dev
ytr = []  # labels, train
yte = []  # labels, test
ydv = []  # labels, dev
nullarticles_train = []
nullarticles_dev = []

# NOTE(review): `trainsize` is printed here before the loop below binds it;
# this raises NameError unless it was assigned earlier in the full file.
print(str(trainsize))
for trainsize in trainsizelist:
# Beispiel #8
# 0
from helpfunc import ensure_dir
from config import *

# Make sure every core data directory exists before any processing starts.
for _directory in (metarchdir, archdir, artdir):
    ensure_dir(_directory)
# Beispiel #9
# 0
def MaketableFast(ftword, start_date, end_date, trainsize, devsize, testsize,
                  timerange):
    """Build fixed-size article-word count matrices and date labels.

    Each month contributes exactly `trainsize` / `devsize` / `testsize`
    consecutive articles to the train / dev / test splits. A matrix cell
    [row, col] counts how often feature word `ftword[col]` occurs in that
    article's 'content' list.

    Returns (AWarr_train, AWarr_dev, AWarr_test,
             Ylabels_train, Ylabels_dev, Ylabels_test).
    """
    datelist = DateList(start_date, end_date)
    months = len(datelist)

    AWarr_train = np.zeros((trainsize * months, len(ftword)), dtype=np.int16)
    AWarr_dev = np.zeros((devsize * months, len(ftword)), dtype=np.int16)
    AWarr_test = np.zeros((testsize * months, len(ftword)), dtype=np.int16)

    Ylabels_train = np.zeros(trainsize * months, dtype=np.int16)
    Ylabels_dev = np.zeros(devsize * months, dtype=np.int16)
    Ylabels_test = np.zeros(testsize * months, dtype=np.int16)

    # Label granularity: one class per year or per month.
    if (timerange == "yearly"):
        datedict = MakeDictLabelYearly(datelist)
    elif (timerange == "monthly"):
        datedict = MakeDictLabel(datelist)
    else:
        print('Inexistent timerange')

    wordind = MakeDictIndex(ftword)
    ensure_dir(metarchdir)

    print(set(wordind))
    for j in range(len(datelist)):
        date = datelist[j]
        print(date)
        metacont, filename = readMetacont(date)
        artlist = metacont['docs']

        # The label is identical for every article of this month; the
        # original recomputed it once per row. Compute it once.
        label = datedict[str(date[0]) + str(int(date[1]))]

        # The original triplicated the fill loop for the three splits;
        # factored into one helper (dev starts after train, test after dev).
        _fill_split(artlist, 0, trainsize, trainsize * j,
                    AWarr_train, Ylabels_train, label, wordind)
        _fill_split(artlist, trainsize, devsize, devsize * j,
                    AWarr_dev, Ylabels_dev, label, wordind)
        _fill_split(artlist, trainsize + devsize, testsize, testsize * j,
                    AWarr_test, Ylabels_test, label, wordind)

    return AWarr_train, AWarr_dev, AWarr_test, Ylabels_train, Ylabels_dev, Ylabels_test


def _fill_split(artlist, offset, size, row0, arr, labels, label, wordind):
    """Fill `size` rows of `arr`/`labels` starting at `row0` from the
    articles artlist[offset : offset + size]."""
    for i in range(size):
        labels[row0 + i] = label
        for word in artlist[offset + i]["content"]:
            if word in wordind:
                arr[row0 + i, wordind[word]] += 1
# Beispiel #10
# 0
def Maketable(ftword, start_date, end_date, trainsize, devsize):
    """Build article-word count matrices with variable dev/test sizes.

    For each month the first `trainsize` articles form the training split,
    the next `devsize` the dev split, and everything remaining (up to a cap
    of 1000 articles per month) the test split. Months with fewer than 1000
    articles shrink the dev split proportionally.

    Returns (AWarr_train, AWarr_dev, AWarr_test,
             Ylabels_train, Ylabels_dev, Ylabels_test).
    """
    datelist = DateList(start_date, end_date)
    # NOTE(review): the train matrix has len(ftword) + 1 columns while the
    # dev/test matrices have len(ftword); the extra train column is never
    # written below -- confirm whether it is intentional.
    AWarr_train = np.zeros((trainsize * len(datelist), len(ftword) + 1),
                           dtype=np.int32)
    AWarr_dev = np.array([], dtype=np.int32)
    AWarr_test = np.array([], dtype=np.int32)

    Ylabels_train = np.zeros(trainsize * len(datelist), dtype=np.int32)
    Ylabels_dev = np.array([], dtype=np.int32)
    Ylabels_test = np.array([], dtype=np.int32)

    datedict = MakeDictLabel(datelist)
    wordind = MakeDictIndex(ftword)
    ensure_dir(metarchdir)
    devsize0 = devsize  # remember the requested dev size for rescaling

    print(set(wordind))
    for j in range(len(datelist)):
        date = datelist[j]
        print(date)
        metacont, filename = readMetacont(date)

        # Cap each month at 1000 articles; shorter months shrink the dev
        # split proportionally so the training split keeps its full size.
        if len(metacont['docs']) >= 1000:
            artlist = metacont['docs'][0:1000]
        else:
            artlist = metacont['docs']
            lenart = len(artlist)
            # BUG FIX: removed a leftover `pdb.set_trace()` debugger
            # breakpoint that halted every run hitting a short month.
            # BUG FIX: `np.int` was removed in NumPy 1.20+; use builtin int.
            devsize = int(
                np.round(devsize0 * (lenart - trainsize) / (1000 - trainsize)))

        row0train = trainsize * j
        artnum = len(artlist)
        AWarr_test_partial = np.zeros(
            (artnum - (trainsize + devsize), len(ftword)), dtype=np.int32)
        Ylabels_test_partial = np.zeros(artnum - (trainsize + devsize),
                                        dtype=np.int32)
        AWarr_dev_partial = np.zeros((devsize, len(ftword)), dtype=np.int32)
        Ylabels_dev_partial = np.zeros(devsize, dtype=np.int32)

        # Training split: first `trainsize` articles of this month.
        for i in range(trainsize):
            wlist = artlist[i]["content"]
            Ylabels_train[row0train + i] = datedict[str(date[0]) +
                                                    str(int(date[1]))]
            for word in wlist:
                sword = ps.stem(word.lower())
                if sword in wordind:
                    AWarr_train[i + row0train, wordind[sword]] += 1

        # Dev split: next `devsize` articles.
        for i in range(devsize):
            wlist = artlist[i + trainsize]["content"]
            Ylabels_dev_partial[i] = datedict[str(date[0]) + str(int(date[1]))]
            for word in wlist:
                sword = ps.stem(word.lower())
                if sword in wordind:
                    AWarr_dev_partial[i, wordind[sword]] += 1

        # Test split: everything after train + dev.
        for i in range(artnum - devsize - trainsize):
            wlist = artlist[i + trainsize + devsize]["content"]
            Ylabels_test_partial[i] = datedict[str(date[0]) +
                                               str(int(date[1]))]
            for word in wlist:
                sword = ps.stem(word.lower())
                if sword in wordind:
                    AWarr_test_partial[i, wordind[sword]] += 1

        # Dev/test outputs grow month by month: the first month assigns,
        # later months concatenate onto the accumulated arrays.
        if (np.size(AWarr_test) != 0):
            AWarr_test = np.concatenate((AWarr_test, AWarr_test_partial),
                                        axis=0)
            Ylabels_test = np.concatenate((Ylabels_test, Ylabels_test_partial),
                                          axis=0)
            AWarr_dev = np.concatenate((AWarr_dev, AWarr_dev_partial), axis=0)
            Ylabels_dev = np.concatenate((Ylabels_dev, Ylabels_dev_partial),
                                         axis=0)
        else:
            AWarr_test = AWarr_test_partial
            Ylabels_test = Ylabels_test_partial
            AWarr_dev = AWarr_dev_partial
            Ylabels_dev = Ylabels_dev_partial

    return AWarr_train, AWarr_dev, AWarr_test, Ylabels_train, Ylabels_dev, Ylabels_test