コード例 #1
0
ファイル: SVMs.py プロジェクト: tgarutti/Fin-SentiLex
def svmDictionaries():
    """Load the pickled dictionaries and pass them through ``fSVM.filterDicts``.

    The Loughran-McDonald dictionary is loaded first, followed by the
    benchmark, classification and regression NN dictionaries (all from
    ``drive``).  The three NN dictionaries are handed to
    ``fSVM.filterDicts`` together with the Loughran-McDonald dictionary
    and the constant 0.4 as third argument.

    Returns:
        The value returned by ``fSVM.filterDicts``.
    """
    def _load(pickle_name):
        # All pickles live directly under the module-level ``drive`` prefix.
        return fd.loadFile(drive+pickle_name)

    reference = _load('Loughran_McDonald_dict.pckl')
    candidates = [
        _load('dictionary_benchNN.pckl'),
        _load('dictionary_classificationNN.pckl'),
        _load('dictionary_regressionNN.pckl'),
    ]
    return fSVM.filterDicts(reference, candidates, 0.4)
コード例 #2
0
ファイル: functionsSVM.py プロジェクト: tgarutti/Fin-SentiLex
def getScores(filename, dictionaries, dict_names):
    """Build per-dictionary feature/label rows from one pickled dataset.

    For each item of the dataset loaded from ``filename`` a row is built
    for every dictionary whose score list is non-empty.  Items whose
    ``item[7][0][0]`` equals zero are skipped, because that value is the
    denominator of the relative change ``vol_change``.

    Args:
        filename: Path passed to ``fd.loadFile``.
        dictionaries: Sequence of dictionaries; entry ``i`` is scored
            under the name ``dict_names[i]``.
        dict_names: Names used as keys of the result.  The names
            'Classification' and 'Regression' are scored with
            ``getOmega2``; every other name is scored with ``getOmega``.

    Returns:
        dict mapping each name in ``dict_names`` to a list of 1-D numpy
        arrays laid out as
        ``[item[0], item[1], *x1, *x2, item[4][0], item[5], vol_change, label]``
        where ``label`` is 1 when ``vol_change >= 0`` and 0 otherwise.
    """
    dataset = fd.loadFile(filename)
    data = {name: [] for name in dict_names}
    for item in dataset:
        # np.vstack is the supported spelling of the np.row_stack alias,
        # which is deprecated as of NumPy 2.0.
        x1 = np.concatenate(np.vstack(item[7])[:, 0:2]).tolist()
        # Left-pad with zeros so x1 holds at least six values.
        x1 = [0] * (6 - len(x1)) + x1

        base = item[7][0][0]
        if base == 0:
            # The relative change below would divide by zero.
            continue
        vol_change = (item[6][0] - base) / base
        label = 1 if vol_change >= 0 else 0
        y = [item[4][0], item[5], vol_change, label]
        info = [item[0], item[1]]

        for i, d in enumerate(dictionaries):
            d_name = dict_names[i]
            if d_name in ('Classification', 'Regression'):
                x2 = getOmega2(item[-1], d)
            else:
                x2 = getOmega(item[-1], d)
            if len(x2) > 0:
                data[d_name].append(np.array(info + x1 + x2 + y))
    return data
コード例 #3
0
def getDescriptives():
    """Count filings and words per filing type and sign for 2000-2018.

    For every year the dataset ``loc + str(year) + "10X_final.pckl"`` is
    loaded.  For each item, ``item[5]`` is passed to ``f10X.wordCount``,
    the file at path ``item[2]`` is read and classified with
    ``f10X.getFileType``, and the item is tallied as positive when
    ``item[4] >= 0`` and negative otherwise.  Items whose type is neither
    "10-K" nor "10-Q" are ignored.

    Returns:
        pandas.DataFrame with one column and eight labelled rows: the
        four filing counts followed by the four word totals.
    """
    # (filing type, is_positive) in the order the rows are reported.
    order = [("10-K", True), ("10-K", False), ("10-Q", True), ("10-Q", False)]
    n_filings = dict.fromkeys(order, 0)
    n_words = dict.fromkeys(order, 0)

    for year in range(2000, 2019):
        print(year)
        filename = loc+str(year)+"10X_final.pckl"
        dataset = fd.loadFile(filename)
        for item in dataset:
            nWords = f10X.wordCount(item[5])
            # Close the handle deterministically; this loop opens one file
            # per filing and would otherwise leak descriptors.
            with open(item[2], "r") as handle:
                text = handle.read()
            f_type = f10X.getFileType(text)
            if f_type not in ("10-K", "10-Q"):
                continue
            key = (f_type, bool(item[4] >= 0))
            n_filings[key] += 1
            n_words[key] += nWords

    descriptives = [n_filings[k] for k in order] + [n_words[k] for k in order]
    row_names = ['# of positive 10Ks','# of negative 10Ks','# of positive 10Qs','# of negative 10Qs','# of words in positive 10Ks','# of words in negative 10Ks','# of words in positive 10Qs','# of words in negative 10Qs']
    return pd.DataFrame(descriptives, index = row_names)
コード例 #4
0
def returnDictionary(dictionary, filename):
    """Merge the word counts of one pickled dataset into ``dictionary``.

    Each item of the dataset loaded from ``filename`` contributes its
    ``k[1]`` value to the returned list and the token counts of
    ``cleanText(k[-1])`` to ``dictionary``.  A word seen for the first
    time gets a fresh entry with random 'pos'/'neg' values in
    [0.10, 0.50], zeroed 'mp'/'vp'/'mn'/'vn', its count as 'freq' and
    'ndocs' set to 1.  A known word has 'freq' increased by its count and
    'ndocs' increased by one.

    Args:
        dictionary: Mapping from word to its entry dict; mutated in place.
        filename: Path passed to ``fd.loadFile``.

    Returns:
        Tuple ``(dictionary, CIKs)`` where ``CIKs`` lists ``k[1]`` for
        every item, in dataset order.
    """
    yearly_list = fd.loadFile(filename)
    CIKs = []
    for k in yearly_list:
        CIKs.append(k[1])
        text = cleanText(k[-1])
        count = collections.Counter(text)
        for key, value in count.items():
            if key not in dictionary:
                # Assign the whole entry at once: writing
                # dictionary[key]['pos'] first raises KeyError on a plain
                # dict and only works when the mapping is a defaultdict.
                dictionary[key] = {
                    'pos': rd.randint(10, 50)/100,
                    'neg': rd.randint(10, 50)/100,
                    'mp': 0,
                    'vp': 0,
                    'mn': 0,
                    'vn': 0,
                    'freq': value,
                    'ndocs': 1,
                }
            else:
                entry = dictionary[key]
                entry['freq'] += value
                entry['ndocs'] += 1
    return dictionary, CIKs
##############################################################################
##############################################################################
コード例 #5
0
ファイル: searchFiles.py プロジェクト: tgarutti/master_thesis
import numpy as np
import re
import pandas as pd
import functions10X as f10X
import functionsData as fd
import functionsNN as fNN
import functionsSVM as fSVM
import random as rd
import collections
import time
# Directory prefix for every pickle path built in this script.
drive = "/Volumes/LaCie/Data/"
search = []
# Table loaded from 'length.pckl'; the loop below filters it by columns
# 2 and 0 and sums column 1 as integers.
wc = fd.loadFile(drive + 'length.pckl')
# Filing counters, all starting at zero.
pos10K = neg10K = pos10Q = neg10Q = 0
# Word-count accumulators, all starting at zero.
wcPos10K = wcNeg10K = wcPos10Q = wcNeg10Q = 0
for year in range(2000, 2015):
    print(year)
    f1 = drive + str(year) + "10X_final.pckl"
    dataset = fd.loadFile(f1)
    for item in dataset:
        wc_cik = wc[(wc[:, 2] == item[1])]
        wc_i = wc_cik[(wc_cik[:, 0] == item[0])]
        count = sum(wc_i[:, 1].astype(int))
        if item[5] >= 0:
            if '10-K' in item[2]:
コード例 #6
0
            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], m_coef]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], v_coef]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, W)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, W, m_coef, v_coef = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, W, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, W


# Disabled alternative start: load 'dictionary_final.pckl' and pass it
# through fNN.initializeX instead of loading the benchmark pickle.
#dictionary = fd.loadFile(drive+'dictionary_final.pckl')
#dictionary = fNN.initializeX(dictionary)
# Load the benchmark-NN dictionary; the same path is overwritten by the
# saveFile call at the end of this script.
dictionary = fd.loadFile(drive + 'dictionary_benchNN.pckl')

W, m_coef, v_coef = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# Run the network once per year for 2013 and 2014, timing each run.
for year in range(2013, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    rd.shuffle(dataset)  # shuffle the items in place before training
    # NOTE(review): only `loss` and `W` are rebound from the return value;
    # `m_coef` and `v_coef` are passed unchanged to the next year unless
    # runNeuralNetwork mutates them in place - confirm this is intended.
    loss, W = runNeuralNetwork(dataset, W, m_coef, v_coef)
    end = time.time()
    print(end - start)  # elapsed seconds for this year
# Persist `dictionary`; presumably it was updated in place during
# training (runNeuralNetwork calls dictionary.update) - confirm.
fd.saveFile(dictionary, drive + 'dictionary_benchNN.pckl')
コード例 #7
0
ファイル: SVMs.py プロジェクト: tgarutti/Fin-SentiLex
import time
from sklearn import svm
from sklearn import metrics

# Directory prefix prepended to every pickle filename loaded in this script.
drive = '/Volumes/LaCie/Data/'
def svmDictionaries():
    """Return the NN dictionaries after filtering via ``fSVM.filterDicts``.

    Loads, in this order, the Loughran-McDonald, benchmark-NN,
    classification-NN and regression-NN dictionary pickles from
    ``drive``.  The Loughran-McDonald dictionary is passed as the first
    argument of ``fSVM.filterDicts``, the other three as a list in the
    second, and 0.4 as the third.

    Returns:
        The value returned by ``fSVM.filterDicts``.
    """
    pickle_names = (
        'Loughran_McDonald_dict.pckl',
        'dictionary_benchNN.pckl',
        'dictionary_classificationNN.pckl',
        'dictionary_regressionNN.pckl',
    )
    loaded = [fd.loadFile(drive+name) for name in pickle_names]
    # First pickle is the reference dictionary; the rest are filtered.
    loughran, nn_dicts = loaded[0], loaded[1:]
    return fSVM.filterDicts(loughran, nn_dicts, 0.4)

# Load the dictionaries from a pickle; svmDictionaries() is not called here.
dictionaries = fd.loadFile(drive+'SVM_dictionaries.pckl')
# Names used as result keys by the functions below; presumably aligned
# position-by-position with `dictionaries` - confirm against the pickle.
dict_names = ['Loughran', 'Benchmark', 'Classification', 'Regression']

def SVMDataset(dictionaries, dict_names):
    train, test = dict(),dict()
    for d in dict_names:
        train[d] = []
        test[d] = []
    for year in range(2000,2015):
        print(year)
        filename = drive+str(year)+'10X_final.pckl'
        X = fSVM.getScores(filename, dictionaries, dict_names)
        for d in dict_names:
            train[d].extend(X[d])
    for year in range(2015,2019):
        print(year)
コード例 #8
0
            N = 0
            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_mat1 = fNN.tfidf2(batch_mat)
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'],batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'],batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, coefficients)
            loss.append(fNN.MSELoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients
# Block for 15000 seconds (about 4 h 10 min) before doing anything else.
# NOTE(review): presumably a delay to wait for another job to finish -
# confirm it is still needed, since it stalls every run of this script.
time.sleep(15000)
# Start from the filtered dictionary and pass it through fNN.initializeX.
dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
dictionary = fNN.initializeX(dictionary)
# Disabled alternative start: load the regression-NN dictionary pickle.
#dictionary = fd.loadFile(drive+'dictionary_regressionNN.pckl')
# Not referenced in the visible part of this script; presumably read as a
# global elsewhere - confirm.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# Run the network once per year for 2013 and 2014, timing each run.
for year in range(2013,2015):
    start = time.time()
    dataset = fd.loadFile(drive+str(year)+'10X_final.pckl')
    rd.shuffle(dataset)  # shuffle the items in place before training
    # NOTE(review): only `loss` and `coefficients` are rebound; `Ms` and
    # `Vs` are passed unchanged to the next year unless mutated in place.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    print(end-start)  # elapsed seconds for this year
# Persist `dictionary` under the regression-NN name.
fd.saveFile(dictionary, drive+'dictionary_regressionNN.pckl')
コード例 #9
0
            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1,
                                             coefficients)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients


# Disabled alternative start: load 'dictionary_filtered.pckl' and pass it
# through fNN.initializeX instead of loading the classification pickle.
#dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
#dictionary = fNN.initializeX(dictionary)
# Load the classification-NN dictionary; the same path is overwritten by
# the first saveFile call at the end of this script.
dictionary = fd.loadFile(drive + 'dictionary_classificationNN.pckl')
# Not referenced in the visible part of this script; presumably read as a
# global elsewhere - confirm.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# Run the network once per year for 2007 through 2014, timing each run.
for year in range(2007, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    rd.shuffle(dataset)  # shuffle the items in place before training
    # NOTE(review): only `loss` and `coefficients` are rebound; `Ms` and
    # `Vs` are passed unchanged to the next year unless mutated in place.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    print(end - start)  # elapsed seconds for this year
# Persist the dictionary, the coefficients and Ms.
fd.saveFile(dictionary, drive + 'dictionary_classificationNN.pckl')
fd.saveFile(coefficients, drive + 'coefficients_classificationNN.pckl')
# NOTE(review): `Ms` is never rebound after initializeCoefficients(), so
# this writes the initial object unless it was mutated in place - confirm.
fd.saveFile(Ms, drive + 'Ms_classificationNN.pckl')
コード例 #10
0
ファイル: teststuff.py プロジェクト: tgarutti/master_thesis
import functionsData as fd
import pandas as pd
# CIKs for which descriptives are requested from fd.ciksDescriptives.
descr_ciks = ['0000072971', '0001403161', '0000875320', '0001318605',
              '0000078003', '0001021860', '0000879101', '0000019617',
              '0000886982', '0000037996', '0000034088', '0000712515',
              '0000732717', '0000320193', '0000789019', '0000106640',
              '0001418091', '0001283699', '0000092380', '0001039684']
drive = '/Volumes/LaCie/Data/'
fullCIKs = fd.loadFile(drive+'CIKs_final.pckl')
desc = fd.ciksDescriptives(descr_ciks)

# Write one worksheet per key of `desc`, each holding three titled tables
# stacked vertically: 'Descriptives', 'Quantiles' and 'Periods'.
name_xlsx = drive+'descriptivesCIK.xlsx'
# The context manager saves and closes the workbook on exit; the former
# writer.save() call no longer exists in pandas >= 2.0.
with pd.ExcelWriter(name_xlsx, engine='xlsxwriter') as writer:
    workbook = writer.book
    for key, tables in desc.items():
        worksheet = workbook.add_worksheet(key)
        # Register the sheet so to_excel writes into it rather than
        # creating a new one (needed by older pandas versions).
        writer.sheets[key] = worksheet

        # Layout: each title sits one row above its table, and each table
        # is followed by a gap before the next title.
        quantiles_title_row = tables['Descriptives'].shape[0] + 4
        periods_title_row = (quantiles_title_row + 1
                             + tables['Quantiles'].shape[0] + 4)

        worksheet.write_string(0, 0, 'General Descriptives')
        tables['Descriptives'].to_excel(writer, sheet_name=key,
                                        startrow=1, startcol=0)

        worksheet.write_string(quantiles_title_row, 0, 'Quantiles')
        tables['Quantiles'].to_excel(writer, sheet_name=key,
                                     startrow=quantiles_title_row + 1,
                                     startcol=0)

        worksheet.write_string(periods_title_row, 0, 'Periods')
        tables['Periods'].to_excel(writer, sheet_name=key,
                                   startrow=periods_title_row + 1,
                                   startcol=0)