def preprocess(self):
    """Link each new document under Data/Unprocessed into a per-CIK folder.

    Every file not already in ``self.processed_docs`` is parsed for its
    header and filers.  For each CIK found, a link named
    ``<CIK>_<FilingDate>.txt`` is created (via ``safelink``) under
    ``Preprocessed/<CIK>/``.  A document that raises ``ParseError`` is
    recorded in ``self.exception_docs``, logged as a warning, and linked
    into ``Exceptions/`` instead.

    Grouping by CIK up front lets the later processing step consider one
    CIK at a time, for memory efficiency.

    Side effects: changes the working directory to ``Unprocessed/``,
    updates ``self.processed_docs``, ``self.valid_docs`` and
    ``self.exception_docs``, and prints the elapsed time.
    """
    preprocessed_root = self.DataDir + 'Preprocessed/'
    exceptions_root = self.DataDir + 'Exceptions/'

    # Running tallies; nothing below reads them, they are only useful
    # when inspecting the method in a debugger.
    seen_count = 0
    linked_count = 0
    failed_count = 0

    t_begin = time.time()
    os.chdir(self.DataDir + 'Unprocessed/')

    # recursive_file_gen yields (path, filename) tuples for all files in
    # the directory and its subdirectories that don't begin with '.' or '_'
    for docpath, docname in recursive_file_gen('.'):
        # Document names are assumed to be unique, so the name alone is
        # used to recognise documents handled earlier.
        if docname in self.processed_docs:
            continue
        self.processed_docs.add(docname)
        seen_count += 1

        try:
            # The parser also returns the raw text; it is ignored here.
            header, cik2filers, _ = rb_parser.parse_quarterly_filing(docpath)
            filing_date = header['FilingDate']
            # Not used below; the lookup is kept so that a header lacking
            # 'DocType' still fails at this point.
            doctype = header['DocType']

            for cik in cik2filers:
                link_name = cik + '_' + filing_date + '.txt'
                cik_dir = preprocessed_root + cik
                ensure(cik_dir)
                safelink(docpath, cik_dir + '/' + link_name)
                # A name seen before is reported, but the document is
                # still linked and counted.
                if link_name in self.valid_docs:
                    print("Repeated doc: %s" % link_name)
                self.valid_docs.add(link_name)
                linked_count += 1
        except ParseError as err:
            self.exception_docs.add(docname)
            failed_count += 1
            logging.warning(docname + ": " + str(err))
            safelink(docpath, exceptions_root + basename(docpath))

    elapsed = time.time() - t_begin
    print("Time elapsed in preprocessing: %.1f" % elapsed)
def process(self):
    """Build company data from the per-CIK folders under Preprocessed/.

    Each directory in ``Preprocessed`` whose name does not start with
    '.' is treated as a CIK:

    * CIK in ``self.good_CIKs``: the company is loaded, every filing in
      the folder is parsed and passed to the company object, its filing
      date is appended to ``self.CIK2date[CIK]``, and the file is moved
      to ``Active/<CIK>/``.  Afterwards the company's word set is built
      and appended to ``self.company_word_sets``, the company is saved,
      and the CIK is registered under its SIC in ``self.industries``.
    * any other CIK: its filings are moved to ``Inactive/<CIK>/``.

    The source folder is removed once its filings have been moved out.

    Side effects: changes the working directory to ``Preprocessed``,
    moves files on disk, updates the bookkeeping sets and dicts on
    ``self``, and prints the elapsed time.
    """
    t_begin = time.time()
    active_root = self.DataDir + 'Active/'
    inactive_root = self.DataDir + 'Inactive/'
    os.chdir(self.DataDir + 'Preprocessed')

    # Iterate through all the preprocessed CIKs
    for cik in os.listdir('.'):
        # Skip hidden entries and anything that is not a directory.
        if cik.startswith('.') or not os.path.isdir(cik):
            continue

        if cik in self.good_CIKs:
            self.active_CIKs.add(cik)
            company = self.load_company(cik)
            ensure(active_root + cik)
            self.CIK2date.setdefault(cik, [])

            for filing in os.listdir(cik):
                filingpath = cik + '/' + filing
                header, filers, rawtext = rb_parser.parse_quarterly_filing(filingpath)
                # Update company properties with info taken from the
                # 'filers' part of the document
                company.properties(filers)
                filing_date = header['FilingDate']
                # Creates a word dictionary and wordcount from the raw
                # text returned by the parser
                company.add_document(filing_date, rawtext)
                self.CIK2date[cik].append(filing_date)
                self.active_docs.add(filing)
                # Move the filing to the 'Active' directory - note this
                # means that, for now, all parsed data is stored in the
                # directory structure
                os.rename(filingpath, active_root + filingpath)

            company.build_wordset()
            self.company_word_sets.append(company.wordset)
            self.save_company(company)

            # Register the CIK under its SIC, without duplicates.
            sic_members = self.industries.setdefault(company.SIC, [])
            if cik not in sic_members:
                sic_members.append(cik)

            del company  # Get it out of memory. Probably unnecessary
        else:
            # CIK is not in self.good_CIKs
            self.inactive_CIKs.add(cik)
            inactive_dir = inactive_root + cik
            ensure(inactive_dir)
            for filing in os.listdir(cik):
                self.inactive_docs.add(filing)
                os.rename(cik + '/' + filing, inactive_dir + '/' + filing)

        # Both branches move every filing out, so drop the source folder.
        os.removedirs(cik)

    elapsed = time.time() - t_begin
    print("Time elapsed in processing: %.1f" % elapsed)