Python loadFileの例、functionsData.loadFile Pythonの例

コード例 #1

0

ファイルを表示

ファイル: SVMs.py プロジェクト: tgarutti/Fin-SentiLex

def svmDictionaries():
    """Load the reference and learned dictionaries and filter the latter.

    The Loughran-McDonald dictionary is loaded first, followed by the
    benchmark, classification and regression NN dictionaries (in that
    order).  The three NN dictionaries are then passed, together with the
    reference dictionary and the constant 0.4, to ``fSVM.filterDicts``.

    Returns:
        Whatever ``fSVM.filterDicts`` returns for those arguments.
    """
    reference = fd.loadFile(drive+'Loughran_McDonald_dict.pckl')
    learned = [
        fd.loadFile(drive+pickle_name)
        for pickle_name in (
            'dictionary_benchNN.pckl',
            'dictionary_classificationNN.pckl',
            'dictionary_regressionNN.pckl',
        )
    ]
    return fSVM.filterDicts(reference, learned, 0.4)

コード例 #2

0

ファイルを表示

ファイル: functionsSVM.py プロジェクト: tgarutti/Fin-SentiLex

def getScores(filename, dictionaries, dict_names):
    """Build per-dictionary feature rows from one pickled dataset.

    Args:
        filename: Path handed to ``fd.loadFile``; the loaded object is
            iterated item by item.
        dictionaries: Sequence of dictionaries to score with.
        dict_names: Names matching ``dictionaries`` position by position;
            they become the keys of the returned mapping.  Dictionaries
            named 'Classification' or 'Regression' are scored with
            ``getOmega2``, all others with ``getOmega``.

    Returns:
        dict mapping each name in ``dict_names`` to a list of 1-D NumPy
        arrays.  Each array is the concatenation of
        ``[item[0], item[1]]``, the zero-padded values taken from
        ``item[7]``, the score list, and
        ``[item[4][0], item[5], vol_change, label]`` where ``label`` is 1
        when ``vol_change >= 0`` and 0 otherwise.  Items whose
        ``item[7][0][0]`` equals 0 and dictionaries that yield an empty
        score list contribute no row.
    """
    dataset = fd.loadFile(filename)
    data = {name: [] for name in dict_names}
    for item in dataset:
        # np.vstack is the function the deprecated np.row_stack aliases.
        x1 = np.concatenate(np.vstack(item[7])[:, 0:2]).tolist()
        # Left-pad with zeros up to six values; a non-positive pad length
        # produces an empty prefix, so longer lists are left untouched.
        x1 = [0] * (6 - len(x1)) + x1

        base = item[7][0][0]
        if base == 0:
            # The relative change below would divide by zero.
            continue

        # Relative change of item[6][0] with respect to item[7][0][0].
        vol_change = (item[6][0] - base) / base
        label = 1 if vol_change >= 0 else 0
        y = [item[4][0], item[5], vol_change, label]
        info = [item[0], item[1]]

        for i, d in enumerate(dictionaries):
            d_name = dict_names[i]
            if d_name == 'Classification' or d_name == 'Regression':
                x2 = getOmega2(item[-1], d)
            else:
                x2 = getOmega(item[-1], d)
            if len(x2) > 0:
                data[d_name].append(np.array(info + x1 + x2 + y))
    return data

コード例 #3

0

ファイルを表示

def getDescriptives():
    """Count filings and their words by form type and sign of ``item[4]``.

    For every year from 2000 through 2018 the file
    ``loc + str(year) + "10X_final.pckl"`` is loaded with ``fd.loadFile``.
    For each item, the file at path ``item[2]`` is read and classified with
    ``f10X.getFileType``; items classified as "10-K" or "10-Q" are tallied
    as positive when ``item[4] >= 0`` and negative otherwise, and
    ``f10X.wordCount(item[5])`` is added to the matching word total.
    Items of any other type are ignored.

    Returns:
        pandas.DataFrame with a single column and eight labelled rows:
        the four filing counts followed by the four word totals.
    """
    # (form type, is_positive) in the order the rows are reported.
    buckets = [("10-K", True), ("10-K", False), ("10-Q", True), ("10-Q", False)]
    n_filings = dict.fromkeys(buckets, 0)
    n_words = dict.fromkeys(buckets, 0)

    for year in range(2000,2019):
        print(year)
        filename = loc+str(year)+"10X_final.pckl"
        dataset = fd.loadFile(filename)
        for item in dataset:
            nWords = f10X.wordCount(item[5])
            # Close the handle deterministically; one file is opened per
            # item, so leaving this to the garbage collector can exhaust
            # file descriptors.
            with open(item[2],"r") as handle:
                text = handle.read()
            f_type = f10X.getFileType(text)
            if f_type not in ("10-K", "10-Q"):
                continue
            bucket = (f_type, bool(item[4] >= 0))
            n_filings[bucket] += 1
            n_words[bucket] += nWords

    descriptives = [n_filings[b] for b in buckets] + [n_words[b] for b in buckets]
    row_names = ['# of positive 10Ks','# of negative 10Ks','# of positive 10Qs','# of negative 10Qs','# of words in positive 10Ks','# of words in negative 10Ks','# of words in positive 10Qs','# of words in negative 10Qs']
    return pd.DataFrame(descriptives, index = row_names)

コード例 #4

0

ファイルを表示

ファイル: functions10X.py プロジェクト: tgarutti/master_thesis

def returnDictionary(dictionary, filename):
    """Fold the token counts of one pickled list of documents into ``dictionary``.

    Args:
        dictionary: Mapping from token to a dict of per-token fields
            ('pos', 'neg', 'mp', 'vp', 'mn', 'vn', 'freq', 'ndocs').  It is
            updated in place.  Both a plain ``dict`` and a ``defaultdict``
            are accepted.
        filename: Path handed to ``fd.loadFile``.  Each element ``k`` of the
            loaded object supplies ``k[1]`` (collected into the second
            return value) and ``k[-1]`` (passed to ``cleanText``).

    Returns:
        Tuple ``(dictionary, CIKs)`` where ``CIKs`` is the list of ``k[1]``
        values in load order.
    """
    yearly_list = fd.loadFile(filename)
    CIKs = []
    for k in yearly_list:
        CIKs.append(k[1])
        text = cleanText(k[-1])
        # One Counter per document, so 'ndocs' grows by one per document
        # that contains the token while 'freq' grows by its occurrences.
        count = collections.Counter(text)
        for key, value in count.items():
            if key in dictionary:
                entry = dictionary[key]
                entry['freq'] = entry['freq'] + value
                entry['ndocs'] += 1
                continue

            # New token.  A defaultdict builds the entry through its own
            # factory; a plain dict would raise KeyError on the nested
            # assignment, so create the entry explicitly in that case.
            try:
                entry = dictionary[key]
            except KeyError:
                entry = dictionary[key] = {}
            # Initial 'pos'/'neg' values are drawn as rd.randint(10, 50)/100
            # (rd is presumably the `random` module - confirm at the imports).
            entry['pos'] = rd.randint(10, 50)/100
            entry['neg'] = rd.randint(10, 50)/100
            entry['mp'] = 0
            entry['vp'] = 0
            entry['mn'] = 0
            entry['vn'] = 0
            entry['freq'] = value
            entry['ndocs'] = 1
    return dictionary, CIKs
##############################################################################
##############################################################################

コード例 #5

0

ファイルを表示

ファイル: searchFiles.py プロジェクト: tgarutti/master_thesis

import numpy as np
import re
import pandas as pd
import functions10X as f10X
import functionsData as fd
import functionsNN as fNN
import functionsSVM as fSVM
import random as rd
import collections
import time
# Base path prepended to every data file name used below.
drive = "/Volumes/LaCie/Data/"
search = []
# Table loaded from length.pckl; the loop below filters it on columns 2 and 0
# and sums column 1 (cast to int).
wc = fd.loadFile(drive + 'length.pckl')
# Accumulators, all starting at zero.  Presumably filing counts (pos*/neg*)
# and word-count totals (wc*) per form type - the code that updates them is
# not visible in this excerpt, so confirm against the full script.
pos10K = 0
neg10K = 0
pos10Q = 0
neg10Q = 0
wcPos10K = 0
wcNeg10K = 0
wcPos10Q = 0
wcNeg10Q = 0
for year in range(2000, 2015):
    print(year)
    f1 = drive + str(year) + "10X_final.pckl"
    dataset = fd.loadFile(f1)
    for item in dataset:
        wc_cik = wc[(wc[:, 2] == item[1])]
        wc_i = wc_cik[(wc_cik[:, 0] == item[0])]
        count = sum(wc_i[:, 1].astype(int))
        if item[5] >= 0:
            if '10-K' in item[2]:

コード例 #6

0

ファイルを表示

            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], m_coef]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], v_coef]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, W)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, W, m_coef, v_coef = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, W, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, W


# Alternative start-up kept for reference: load dictionary_final.pckl and pass
# it through fNN.initializeX instead of loading the saved benchNN dictionary.
#dictionary = fd.loadFile(drive+'dictionary_final.pckl')
#dictionary = fNN.initializeX(dictionary)
# Load the dictionary from the same file it is written back to at the end.
dictionary = fd.loadFile(drive + 'dictionary_benchNN.pckl')

W, m_coef, v_coef = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# range(2013, 2015) covers the years 2013 and 2014.
for year in range(2013, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    # Shuffles the loaded dataset in place before training on it.
    rd.shuffle(dataset)
    # `loss` and `W` are rebound to the returned values on every year;
    # m_coef and v_coef are passed in unchanged each time.
    loss, W = runNeuralNetwork(dataset, W, m_coef, v_coef)
    end = time.time()
    # Wall-clock seconds spent on this year.
    print(end - start)
# Overwrites the file the dictionary was loaded from.  `dictionary` is not
# passed to runNeuralNetwork, so it is presumably updated there as a global -
# confirm against the function body.
fd.saveFile(dictionary, drive + 'dictionary_benchNN.pckl')

コード例 #7

0

ファイルを表示

ファイル: SVMs.py プロジェクト: tgarutti/Fin-SentiLex

import time
from sklearn import svm
from sklearn import metrics

drive = '/Volumes/LaCie/Data/'
def svmDictionaries():
    """Load the reference and learned dictionaries and filter the latter.

    The Loughran-McDonald dictionary is loaded first, followed by the
    benchmark, classification and regression NN dictionaries (in that
    order).  The three NN dictionaries are then passed, together with the
    reference dictionary and the constant 0.4, to ``fSVM.filterDicts``.

    Returns:
        Whatever ``fSVM.filterDicts`` returns for those arguments.
    """
    reference = fd.loadFile(drive+'Loughran_McDonald_dict.pckl')
    learned = [
        fd.loadFile(drive+pickle_name)
        for pickle_name in (
            'dictionary_benchNN.pckl',
            'dictionary_classificationNN.pckl',
            'dictionary_regressionNN.pckl',
        )
    ]
    return fSVM.filterDicts(reference, learned, 0.4)

# Dictionaries are read from a saved pickle here; presumably this file holds
# the output of svmDictionaries() - TODO confirm how it was produced.
dictionaries = fd.loadFile(drive+'SVM_dictionaries.pckl')
# Names used as keys of the per-dictionary train/test lists in SVMDataset and
# passed to fSVM.getScores alongside `dictionaries`.
dict_names = ['Loughran', 'Benchmark', 'Classification', 'Regression']

def SVMDataset(dictionaries, dict_names):
    train, test = dict(),dict()
    for d in dict_names:
        train[d] = []
        test[d] = []
    for year in range(2000,2015):
        print(year)
        filename = drive+str(year)+'10X_final.pckl'
        X = fSVM.getScores(filename, dictionaries, dict_names)
        for d in dict_names:
            train[d].extend(X[d])
    for year in range(2015,2019):
        print(year)

コード例 #8

0

ファイルを表示

            N = 0
            #N = (batch_mat.sum(1)).mean()
            #batch_mat1 = batch_mat/N
            batch_mat1 = fNN.tfidf2(batch_mat)
            batch_dictDF = pd.DataFrame(batch_dict)
            m = [batch_dictDF.loc['mp'],batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'],batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1, coefficients)
            loss.append(fNN.MSELoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients
# NOTE(review): blocks for 15000 seconds (about 4 h 10 min) before anything
# below runs - presumably a deliberate delay to wait for another job; confirm
# it is still wanted.
time.sleep(15000)
# Start from the filtered dictionary, passed through fNN.initializeX.
dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
dictionary = fNN.initializeX(dictionary)
# Alternative start kept for reference: reload the dictionary that this
# script saves at the end.
#dictionary = fd.loadFile(drive+'dictionary_regressionNN.pckl')
# Hard-coded constant; presumably the total number of documents - it is not
# used in the visible part of the script, so confirm where it is read.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# range(2013,2015) covers the years 2013 and 2014.
for year in range(2013,2015):
    start = time.time()
    dataset = fd.loadFile(drive+str(year)+'10X_final.pckl')
    # Shuffles the loaded dataset in place before training on it.
    rd.shuffle(dataset)
    # Only `loss` and `coefficients` are rebound here; Ms and Vs are passed in
    # unchanged on every year.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    # Wall-clock seconds spent on this year.
    print(end-start)
# `dictionary` is not passed to runNeuralNetwork, so it is presumably updated
# there as a global - confirm against the function body.
fd.saveFile(dictionary, drive+'dictionary_regressionNN.pckl')

コード例 #9

0

ファイルを表示

ファイル: classificationNN.py プロジェクト: tgarutti/Fin-SentiLex

            m = [batch_dictDF.loc['mp'], batch_dictDF.loc['mn'], Ms]
            v = [batch_dictDF.loc['vp'], batch_dictDF.loc['vn'], Vs]
            y, y_hat, X = forwardPropagation(batch, batch_dict, batch_mat1,
                                             coefficients)
            loss.append(fNN.crossEntropyLoss(y, y_hat))
            batch_dictDF, coefficients, Ms, Vs = backPropagation(
                batch_dictDF, batch_mat, y, y_hat, coefficients, X, m, v, N)
            d = batch_dictDF.to_dict()
            dictionary.update(d)
            end2 = time.time()
    return loss, coefficients


# Alternative start-up kept for reference: load the filtered dictionary and
# pass it through fNN.initializeX instead of loading the saved one.
#dictionary = fd.loadFile(drive+'dictionary_filtered.pckl')
#dictionary = fNN.initializeX(dictionary)
# Load the dictionary from the same file it is written back to at the end.
dictionary = fd.loadFile(drive + 'dictionary_classificationNN.pckl')
# Hard-coded constant; presumably the total number of documents - it is not
# used in the visible part of the script, so confirm where it is read.
n_docs = 276880

coefficients, Ms, Vs = initializeCoefficients()
batch_size, epochs = setHyperparameters()
loss = []
# range(2007, 2015) covers the years 2007 through 2014.
for year in range(2007, 2015):
    start = time.time()
    dataset = fd.loadFile(drive + str(year) + '10X_final.pckl')
    # Shuffles the loaded dataset in place before training on it.
    rd.shuffle(dataset)
    # Only `loss` and `coefficients` are rebound here; Ms and Vs are passed in
    # unchanged on every year.
    loss, coefficients = runNeuralNetwork(dataset, coefficients, Ms, Vs)
    end = time.time()
    # Wall-clock seconds spent on this year.
    print(end - start)
# Overwrites the file the dictionary was loaded from.
fd.saveFile(dictionary, drive + 'dictionary_classificationNN.pckl')
fd.saveFile(coefficients, drive + 'coefficients_classificationNN.pckl')
# NOTE(review): Ms is never rebound after initializeCoefficients(), so unless
# runNeuralNetwork mutates it in place this saves the initial value; Vs is
# not saved at all - confirm both are intended.
fd.saveFile(Ms, drive + 'Ms_classificationNN.pckl')

コード例 #10

0

ファイルを表示

ファイル: teststuff.py プロジェクト: tgarutti/master_thesis

import functionsData as fd
import pandas as pd
# Ten-digit, zero-padded CIK strings passed to fd.ciksDescriptives.
descr_ciks = ['0000072971', '0001403161', '0000875320', '0001318605',
              '0000078003', '0001021860', '0000879101', '0000019617',
              '0000886982', '0000037996', '0000034088', '0000712515',
              '0000732717', '0000320193', '0000789019', '0000106640',
              '0001418091', '0001283699', '0000092380', '0001039684']
drive = '/Volumes/LaCie/Data/'
# Loaded but not used in the visible part of this script.
fullCIKs = fd.loadFile(drive+'CIKs_final.pckl')
# Mapping from sheet name to a mapping with the tables 'Descriptives',
# 'Quantiles' and 'Periods' (each is written with .to_excel below).
desc = fd.ciksDescriptives(descr_ciks)

name_xlsx = drive+'descriptivesCIK.xlsx'
# The context manager saves and closes the workbook on exit, also when an
# error occurs part-way; ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter(name_xlsx,engine='xlsxwriter') as writer:
    workbook=writer.book
    for key in desc.keys():
        worksheet=workbook.add_worksheet(key)
        # Register the sheet so to_excel writes into it rather than creating
        # a second sheet with the same name (needed by older pandas).
        writer.sheets[key] = worksheet

        tables = desc[key]
        n_descr_rows = tables['Descriptives'].shape[0]
        n_quant_rows = tables['Quantiles'].shape[0]
        # Each table is preceded by a title row; the offsets reproduce the
        # original layout exactly.
        quantiles_title_row = n_descr_rows + 4
        periods_title_row = n_descr_rows + 5 + n_quant_rows + 4

        worksheet.write_string(0, 0, 'General Descriptives')
        tables['Descriptives'].to_excel(writer,sheet_name=key,startrow=1 , startcol=0)

        worksheet.write_string(quantiles_title_row, 0, 'Quantiles')
        tables['Quantiles'].to_excel(writer,sheet_name=key,startrow=quantiles_title_row + 1, startcol=0)

        worksheet.write_string(periods_title_row, 0, 'Periods')
        tables['Periods'].to_excel(writer,sheet_name=key,startrow=periods_title_row + 1, startcol=0)