def svmDictionaries():
    """Load the neural-network dictionaries and filter them for the SVM step.

    Reads the Loughran-McDonald dictionary and the benchmark,
    classification and regression NN dictionaries from ``drive`` (in that
    order), then hands them to ``fSVM.filterDicts`` together with the
    constant 0.4.

    Returns:
        Whatever ``fSVM.filterDicts`` returns for the three NN dictionaries.
        NOTE(review): the meaning of the 0.4 argument is defined by
        ``filterDicts`` and is not visible here -- confirm before changing.
    """
    reference = fd.loadFile(drive+'Loughran_McDonald_dict.pckl')
    nn_pickles = (
        'dictionary_benchNN.pckl',
        'dictionary_classificationNN.pckl',
        'dictionary_regressionNN.pckl',
    )
    # Keep the benchmark / classification / regression order: the result is
    # positional.
    candidates = [fd.loadFile(drive+name) for name in nn_pickles]
    return fSVM.filterDicts(reference, candidates, 0.4)
def getScores(filename, dictionaries, dict_names):
    """Build per-dictionary feature rows from one pickled dataset.

    Args:
        filename: Path handed to ``fd.loadFile``; the result is iterated as
            a sequence of indexable items.
        dictionaries: Sequence of dictionaries, positionally matched with
            ``dict_names``.
        dict_names: Names used as keys of the returned mapping. Entries
            named 'Classification' or 'Regression' are scored with
            ``getOmega2``; every other name is scored with ``getOmega``.

    Returns:
        dict mapping each name in ``dict_names`` to a list of 1-D numpy
        arrays laid out as ``[item[0], item[1]] + x1 + x2 + y``.

    NOTE(review): the original indentation was lost. The scoring loop is
    placed under the ``item[7][0][0] != 0`` guard because ``y`` is only
    defined there; items with a zero denominator contribute no rows.
    """
    dataset = fd.loadFile(filename)
    data = {name: [] for name in dict_names}

    for item in dataset:
        # First two columns of every row of item[7], flattened and
        # left-padded with zeros to at least six entries. np.vstack is the
        # non-deprecated spelling of np.row_stack.
        x1 = np.concatenate(np.vstack(item[7])[:, 0:2]).tolist()
        x1 = [0] * (6 - len(x1)) + x1

        base = item[7][0][0]
        if base == 0:
            # Relative change is undefined; skip instead of dividing by zero.
            continue

        vol_change = (item[6][0] - base) / base
        label = 1 if vol_change >= 0 else 0
        y = [item[4][0], item[5], vol_change, label]
        info = [item[0], item[1]]

        for i, d in enumerate(dictionaries):
            d_name = dict_names[i]
            if d_name == 'Classification' or d_name == 'Regression':
                x2 = getOmega2(item[-1], d)
            else:
                x2 = getOmega(item[-1], d)
            # An empty score vector means the dictionary produced nothing
            # for this document; no row is emitted for it.
            if len(x2) > 0:
                data[d_name].append(np.array(info + x1 + x2 + y))

    return data
def getDescriptives():
    """Count filings and words by form type and sign of ``item[4]``.

    For every year in 2000..2018 the pickle ``loc + "<year>10X_final.pckl"``
    is loaded. For each item the file at ``item[2]`` is read to determine
    its form type via ``f10X.getFileType``; ``item[5]`` is passed to
    ``f10X.wordCount``. Items whose type is neither "10-K" nor "10-Q" are
    ignored. An item counts as "positive" when ``item[4] >= 0``.

    Returns:
        A single-column ``pandas.DataFrame`` with eight rows: the four
        document counts followed by the four word totals.
    """
    # Index 0 holds the positive tally, index 1 the negative one.
    doc_counts = {"10-K": [0, 0], "10-Q": [0, 0]}
    word_counts = {"10-K": [0, 0], "10-Q": [0, 0]}

    for year in range(2000, 2019):
        print(year)
        filename = loc+str(year)+"10X_final.pckl"
        dataset = fd.loadFile(filename)
        for item in dataset:
            nWords = f10X.wordCount(item[5])
            # Close each filing promptly; the original left one open handle
            # per document for the garbage collector to reclaim.
            with open(item[2], "r") as handle:
                text = handle.read()
            f_type = f10X.getFileType(text)
            y = item[4]
            if f_type not in doc_counts:
                continue
            slot = 0 if y >= 0 else 1
            doc_counts[f_type][slot] += 1
            word_counts[f_type][slot] += nWords

    descriptives = (doc_counts["10-K"] + doc_counts["10-Q"]
                    + word_counts["10-K"] + word_counts["10-Q"])
    row_names = ['# of positive 10Ks',
                 '# of negative 10Ks',
                 '# of positive 10Qs',
                 '# of negative 10Qs',
                 '# of words in positive 10Ks',
                 '# of words in negative 10Ks',
                 '# of words in positive 10Qs',
                 '# of words in negative 10Qs']
    return pd.DataFrame(descriptives, index = row_names)
def returnDictionary(dictionary, filename):
    """Merge the word counts of one pickled dataset into ``dictionary``.

    Args:
        dictionary: Mapping from word to a per-word record (a dict with the
            keys 'pos', 'neg', 'mp', 'vp', 'mn', 'vn', 'freq', 'ndocs').
            It is updated in place.
        filename: Path handed to ``fd.loadFile``; each loaded item supplies
            an identifier at index 1 and its text at index -1.

    Returns:
        Tuple ``(dictionary, CIKs)`` where ``CIKs`` lists ``item[1]`` for
        every item, in load order.
    """
    yearly_list = fd.loadFile(filename)
    CIKs = []
    for k in yearly_list:
        CIKs.append(k[1])
        count = collections.Counter(cleanText(k[-1]))
        for key, value in count.items():
            if key not in dictionary:
                # Create the whole record in one assignment. Writing
                # dictionary[key]['pos'] directly raises KeyError on a plain
                # dict and only works when a defaultdict is passed in.
                dictionary[key] = {
                    'pos': rd.randint(10, 50)/100,
                    'neg': rd.randint(10, 50)/100,
                    'mp': 0,
                    'vp': 0,
                    'mn': 0,
                    'vn': 0,
                    'freq': value,
                    'ndocs': 1,
                }
            else:
                entry = dictionary[key]
                entry['freq'] = entry['freq'] + value
                # Counter yields each word once per document, so this
                # advances by one per document containing the word.
                entry['ndocs'] += 1
    return dictionary, CIKs

##############################################################################
##############################################################################
import numpy as np import re import pandas as pd import functions10X as f10X import functionsData as fd import functionsNN as fNN import functionsSVM as fSVM import random as rd import collections import time drive = "/Volumes/LaCie/Data/" search = [] wc = fd.loadFile(drive + 'length.pckl') pos10K = 0 neg10K = 0 pos10Q = 0 neg10Q = 0 wcPos10K = 0 wcNeg10K = 0 wcPos10Q = 0 wcNeg10Q = 0 for year in range(2000, 2015): print(year) f1 = drive + str(year) + "10X_final.pckl" dataset = fd.loadFile(f1) for item in dataset: wc_cik = wc[(wc[:, 2] == item[1])] wc_i = wc_cik[(wc_cik[:, 0] == item[0])] count = sum(wc_i[:, 1].astype(int)) if item[5] >= 0: if '10-K' in item[2]:
#N = (batch_mat.sum(1)).mean() #batch_mat1 = batch_mat/N batch_dictDF = pd.DataFrame(batch_dict) m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], m_coef] v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], v_coef] y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, W) loss.append(fNN.crossEntropyLoss(y, y_hat)) batch_dictDF, W, m_coef, v_coef = backPropagation( batch_dictDF, batch_mat, y, y_hat, W, X, m, v, N) d = batch_dictDF.to_dict() dictionary.update(d) end2 = time.time() return loss, W #dictionary = fd.loadFile(drive+'dictionary_final.pckl') #dictionary = fNN.initializeX(dictionary) dictionary = fd.loadFile(drive + 'dictionary_benchNN.pckl') W, m_coef, v_coef = initializeCoefficients() batch_size, epochs = setHyperparameters() loss = [] for year in range(2013, 2015): start = time.time() dataset = fd.loadFile(drive + str(year) + '10X_final.pckl') rd.shuffle(dataset) loss, W = runNeuralNetwork(dataset, W, m_coef, v_coef) end = time.time() print(end - start) fd.saveFile(dictionary, drive + 'dictionary_benchNN.pckl')
import time from sklearn import svm from sklearn import metrics drive = '/Volumes/LaCie/Data/' def svmDictionaries(): loughranDict = fd.loadFile(drive+'Loughran_McDonald_dict.pckl') benchNNDict = fd.loadFile(drive+'dictionary_benchNN.pckl') classNNDict = fd.loadFile(drive+'dictionary_classificationNN.pckl') regresNNDict = fd.loadFile(drive+'dictionary_regressionNN.pckl') dictionaries = [benchNNDict, classNNDict, regresNNDict] dictionaries = fSVM.filterDicts(loughranDict, dictionaries, 0.4) return dictionaries dictionaries = fd.loadFile(drive+'SVM_dictionaries.pckl') dict_names = ['Loughran', 'Benchmark', 'Classification', 'Regression'] def SVMDataset(dictionaries, dict_names): train, test = dict(),dict() for d in dict_names: train[d] = [] test[d] = [] for year in range(2000,2015): print(year) filename = drive+str(year)+'10X_final.pckl' X = fSVM.getScores(filename, dictionaries, dict_names) for d in dict_names: train[d].extend(X[d]) for year in range(2015,2019): print(year)
N = 0 #N = (batch_mat.sum(1)).mean() #batch_mat1 = batch_mat/N batch_mat1 = fNN.tfidf2(batch_mat) batch_dictDF = pd.DataFrame(batch_dict) m = [batch_dictDF.loc['mp'],batch_dictDF.loc['mn'], Ms] v = [batch_dictDF.loc['vp'],batch_dictDF.loc['vn'], Vs] y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, coefficients) loss.append(fNN.MSELoss(y, y_hat)) batch_dictDF, coefficients, Ms, Vs = backPropagation(batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N) d = batch_dictDF.to_dict() dictionary.update(d) end2 = time.time() return loss, coefficients time.sleep(15000) dictionary = fd.loadFile(drive+'dictionary_filtered.pckl') dictionary = fNN.initializeX(dictionary) #dictionary = fd.loadFile(drive+'dictionary_regressionNN.pckl') n_docs = 276880 coefficients, Ms, Vs = initializeCoefficients() batch_size, epochs = setHyperparameters() loss = [] for year in range(2013,2015): start = time.time() dataset = fd.loadFile(drive+str(year)+'10X_final.pckl') rd.shuffle(dataset) loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs) end = time.time() print(end-start) fd.saveFile(dictionary, drive+'dictionary_regressionNN.pckl')
m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], Ms] v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], Vs] y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, coefficients) loss.append(fNN.crossEntropyLoss(y, y_hat)) batch_dictDF, coefficients, Ms, Vs = backPropagation( batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N) d = batch_dictDF.to_dict() dictionary.update(d) end2 = time.time() return loss, coefficients #dictionary = fd.loadFile(drive+'dictionary_filtered.pckl') #dictionary = fNN.initializeX(dictionary) dictionary = fd.loadFile(drive + 'dictionary_classificationNN.pckl') n_docs = 276880 coefficients, Ms, Vs = initializeCoefficients() batch_size, epochs = setHyperparameters() loss = [] for year in range(2007, 2015): start = time.time() dataset = fd.loadFile(drive + str(year) + '10X_final.pckl') rd.shuffle(dataset) loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs) end = time.time() print(end - start) fd.saveFile(dictionary, drive + 'dictionary_classificationNN.pckl') fd.saveFile(coefficients, drive + 'coefficients_classificationNN.pckl') fd.saveFile(Ms, drive + 'Ms_classificationNN.pckl')
import functionsData as fd
import pandas as pd

# Script: write the descriptives returned by fd.ciksDescriptives for a fixed
# list of CIKs to one Excel workbook, one worksheet per key of the result.

descr_ciks = ['0000072971', '0001403161', '0000875320', '0001318605',
              '0000078003', '0001021860', '0000879101', '0000019617',
              '0000886982', '0000037996', '0000034088', '0000712515',
              '0000732717', '0000320193', '0000789019', '0000106640',
              '0001418091', '0001283699', '0000092380', '0001039684']

drive = '/Volumes/LaCie/Data/'
# NOTE(review): fullCIKs is loaded but not used below; kept so the script's
# module-level names and file accesses stay as they were.
fullCIKs = fd.loadFile(drive+'CIKs_final.pckl')
desc = fd.ciksDescriptives(descr_ciks)

name_xlsx = drive+'descriptivesCIK.xlsx'

# The context manager saves and closes the workbook on exit. It replaces
# writer.save(), which was deprecated in pandas 1.5 and removed in 2.0.
with pd.ExcelWriter(name_xlsx, engine='xlsxwriter') as writer:
    workbook = writer.book
    for key in desc.keys():
        worksheet = workbook.add_worksheet(key)
        writer.sheets[key] = worksheet

        tables = desc[key]
        # Row where the 'Quantiles' table starts: below the title row, the
        # 'Descriptives' table and a gap.
        quantiles_row = tables['Descriptives'].shape[0] + 5
        # Row where the 'Periods' table starts, by the same layout rule.
        periods_row = quantiles_row + tables['Quantiles'].shape[0] + 5

        worksheet.write_string(0, 0, 'General Descriptives')
        tables['Descriptives'].to_excel(writer, sheet_name=key,
                                        startrow=1, startcol=0)

        # Each section title sits on the row directly above its table.
        worksheet.write_string(quantiles_row - 1, 0, 'Quantiles')
        tables['Quantiles'].to_excel(writer, sheet_name=key,
                                     startrow=quantiles_row, startcol=0)

        worksheet.write_string(periods_row - 1, 0, 'Periods')
        tables['Periods'].to_excel(writer, sheet_name=key,
                                   startrow=periods_row, startcol=0)