# Builds per-company (cik) x per-year matrices for a single XBRL tag:
#   XY_bool[cik_idx, year - first]  -> placeholder for "value present"
#   XY_float[cik_idx, year - first] -> placeholder for the tag's value
# plus an (adsh -> cik, fiscal year) lookup array.
#
# NOTE(review): this chunk was a whitespace-mangled one-line paste; it has been
# re-flowed into conventional formatting. The original fragment used
# `datetime`, `np` and `ds` without visible imports, so they are added here.

import datetime

import numpy as np

import datasets_lib as ds

# store starting time for calculating total processing time
time_start = datetime.datetime.now()

# set path to folder Datasets/
path = 'D:/DataSets/'

# the tag to be used for input data
tag_org = 'NetIncomeLoss'

# define years taking part in X and Y creation (inclusive range)
first = 2011
last = 2018

# load important lists (reindexed identifier tables produced earlier
# in the pipeline; row position == reindexed integer id)
index_cik = ds.list_from_file(path + '/reindexed/index_cik.txt')
index_adsh = ds.list_from_file(path + '/reindexed/index_adsh.txt')
index_tag = ds.list_from_file(path + '/reindexed/index_tag.txt')

# get reindexed (integer) version of the tag
tag_reindexed_int = index_tag.index(tag_org)

# create supporting arrays, one row per cik, one column per year
XY_bool = np.zeros((len(index_cik), last - first + 1), dtype=bool)
XY_float = np.zeros((len(index_cik), last - first + 1), dtype=float)

# create empty array which will hold cik and FY year for each adsh
adsh_cik_year = np.zeros((len(index_adsh), 2), dtype=int)

# for each adsh get cik and Full Report Year
# NOTE(review): the loop that fills adsh_cik_year is not present in this
# chunk of the file — do not assume these arrays are populated here.
# Gets statistics on how often different tags are used in pre-filtered reports.
#
# NOTE(review): this chunk was a whitespace-mangled one-line paste; it has
# been re-flowed into conventional formatting (logic unchanged except that the
# manual append loop building tag_count is now a comprehension).

import csv
import datetime

import h5py as h5
import datasets_lib as ds

# store starting time for calculating total processing time
time_start = datetime.datetime.now()

# set path to folder Datasets/
path = 'D:/DataSets/'

# initialize tag_count list for counting tags: one [count, reindexed_tag_id]
# pair per tag, so the list can later be sorted by count while remembering
# which tag each count belongs to
index_tag = ds.list_from_file(path + '/reindexed/index_tag.txt')
tag_count = [[0, i] for i in range(len(index_tag))]

# create adsh_count for counting eligible adsh
# NOTE(review): no adsh_count code is present in this chunk — the comment
# above announces a step that is either missing here or lives elsewhere.

# collect tag's statistics: column 1 of each tab-separated row holds the
# reindexed (integer) tag id; count one occurrence per row
with open(path + 'filter_1/filter_1_num.txt') as f:
    f_object = csv.reader(f, delimiter='\t')
    for row in f_object:
        tag = row[1]
        tag_count[int(tag)][0] += 1

# most frequent tags first (compares [count, tag_id] pairs lexicographically,
# so ties on count are broken by tag id, descending)
tag_count.sort(reverse=True)
# NOTE(review): the line below is a whitespace-mangled one-line paste of a
# script section that builds report_bool / report_adsh matrices
# (rows = reindexed cik, columns = report year offset from `start`) from
# reindexed_sub.txt. Its trailing `for` loop is truncated mid-body
# (`adsh = row[0]` is the last visible statement), so the remainder of the
# loop lives outside this chunk. The line is left byte-identical rather than
# re-flowed, because reconstructing it would require guessing the missing
# loop body. Also note it references `ds`, `np` and `csv` with no imports
# visible here — presumably imported in an earlier part of the file; verify.
# set path to folder Datasets/ path = 'D:/DataSets/' # the folowing part constructs report_bool adn report_adsh matrixes with similar dimensions: # - column number corresponds to reindexed cik # - row number correponds to the year of the report (first eligible year correspondsm to row 0) # report_bool contains True if for the given cik and given year full year report exists # report_adsh contains adsh values (int) of those reports # first and last years to take into consideration start = 2008 end = 2019 # load cik and adsh lists list_cik = ds.list_from_file(path + 'reindexed/index_cik.txt') list_adsh = ds.list_from_file(path + 'reindexed/index_adsh.txt') # initialize matrixes report_bool = np.zeros((len(list_cik),end-start+1), dtype=bool) report_adsh = np.zeros((len(list_cik),end-start+1), dtype=int) # this list will remember the year for a given adsh adsh_year = [''] * len(list_adsh) # fills in both matrixes with open(path + '/reindexed/reindexed_sub.txt') as f: f_object = csv.reader(f, delimiter='\t') for row in f_object: adsh = row[0]