# store starting time for calculating total processing time
time_start = datetime.datetime.now()

# set path to folder Datasets/
# (the trailing '/' is part of the value, so file names below are appended
# without a leading slash)
path = 'D:/DataSets/'

# the tag to be used for input data
tag_org = 'NetIncomeLoss'

# first and last year (both inclusive) taking part in X and Y creation
first = 2011
last = 2018

# load important lists
index_cik = ds.list_from_file(path + 'reindexed/index_cik.txt')
index_adsh = ds.list_from_file(path + 'reindexed/index_adsh.txt')
index_tag = ds.list_from_file(path + 'reindexed/index_tag.txt')

# get reindexed version of the tag: its position inside index_tag
# (raises ValueError if tag_org is not present in the list)
tag_reindexed_int = index_tag.index(tag_org)

# create supporting arrays: one row per entry of index_cik,
# one column per year in first..last
XY_bool = np.zeros((len(index_cik), last - first + 1), dtype=bool)
XY_float = np.zeros(XY_bool.shape, dtype=float)

# create empty array which will hold cik and FY year for each adsh
# (one row per entry of index_adsh, two int columns)
adsh_cik_year = np.zeros((len(index_adsh), 2), dtype=int)
# for each adsh get cik and Full Report Year
# Example #2
# gets statistics on how often different tags are used in pre-filtered reports

import datetime
import csv
import h5py as h5
import datasets_lib as ds

# store starting time for calculating total processing time
time_start = datetime.datetime.now()

# set path to folder Datasets/
# (the trailing '/' is part of the value, so file names below are appended
# without a leading slash)
path = 'D:/DataSets/'

# initialize tag_count list for counting tags:
# one [count, tag_index] pair per entry of index_tag, with the count first
# so that sorting the pairs orders them by count
index_tag = ds.list_from_file(path + 'reindexed/index_tag.txt')
tag_count = [[0, i] for i in range(len(index_tag))]

# NOTE(review): an adsh_count for counting eligible adsh was announced here,
# but nothing is created for it in this part of the script

# collect tag's statistics: column 1 of every row of the tab-separated file
# is read as an integer index into tag_count
# (newline='' is what the csv module asks for when a file is passed to csv.reader)
with open(path + 'filter_1/filter_1_num.txt', newline='') as f:
    f_object = csv.reader(f, delimiter='\t')
    for row in f_object:
        tag = row[1]
        tag_count[int(tag)][0] += 1

# highest count first; pairs with equal counts are ordered by descending tag index
tag_count.sort(reverse=True)
# set path to folder Datasets/
path = 'D:/DataSets/'

# The following part prepares the report_bool and report_adsh matrices,
# which share the same dimensions:
# - the row number corresponds to the reindexed cik (one row per entry of list_cik)
# - the column number corresponds to the year of the report
#   (the first eligible year corresponds to column 0)
# report_bool is meant to contain True where a full year report exists
# for the given cik and the given year
# report_adsh is meant to contain the adsh values (int) of those reports

# first and last years to take into consideration (both inclusive)
start, end = 2008, 2019

# load cik and adsh lists
list_cik = ds.list_from_file(path + 'reindexed/index_cik.txt')
list_adsh = ds.list_from_file(path + 'reindexed/index_adsh.txt')

# initialize matrices (all False / all 0)
report_bool = np.zeros((len(list_cik), end - start + 1), dtype=bool)
report_adsh = np.zeros(report_bool.shape, dtype=int)

# this list will remember the year for a given adsh
# (one entry per adsh, initially an empty string)
adsh_year = ['' for _ in list_adsh]

# fills in both matrixes
with open(path + '/reindexed/reindexed_sub.txt') as f:
    f_object = csv.reader(f, delimiter='\t')
    for row in f_object:
        
        adsh = row[0]