#and renames them so they resolve correctly as html files. import sec_edgar_downloader, os from sec_edgar_downloader import Downloader basepath = 'C:\\Users\\Dell\\OneDrive - George Mason University\\MBA 797\\Stock Data\\' SMIF_tickers = [ "GOOG", "AMZN", "BBT", "BA", "BMY", "CBRE", "CSCO", "C", "STZ", "CVA", "D", "XLE", "ESS", "FTNT", "GS", "HCP", "XLV", "HON", "JPM", "KSU", "LEN", "MSFT", "NEE", "PYPL", "PFE", "PNC", "RTN", "SYF", "TJX", "UNH", "VZ", "WMT", "DIS", "WDC" ] for i in range(0, len(SMIF_tickers)): dl = Downloader(basepath + SMIF_tickers[i]) dl.get_10k_filings(SMIF_tickers[i], 5) dl.get_10q_filings(SMIF_tickers[i], 4) for z in range(0, len(SMIF_tickers)): filelistK = os.listdir(basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" + SMIF_tickers[z] + "\\10-K\\") filelistQ = os.listdir(basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" + SMIF_tickers[z] + "\\10-Q\\") for i in range(0, len(filelistK)): os.rename( basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" + SMIF_tickers[z] + "\\10-K\\" + filelistK[i], basepath + SMIF_tickers[z] + "\\sec_edgar_filings\\" + SMIF_tickers[z] + "\\10-K\\" + filelistK[i][:-3] + "html") for i in range(0, len(filelistQ)):
def _parse_period_of_report(lowered_line):
    """Extract the filing date from a lower-cased SEC header line.

    Returns the date formatted as ``YYYY-MM-DD`` when the line contains
    ``conformed period of report`` followed by ``report: YYYYMMDD``;
    returns ``None`` for every other line (including a header line whose
    separator is not exactly ``"report: "``).
    """
    if "conformed period of report" not in lowered_line:
        return None
    _, separator, tail = lowered_line.partition("report: ")
    if not separator:
        return None
    raw = tail[:8]
    return raw[0:4] + "-" + raw[4:6] + "-" + raw[-2:]


def _extract_mda(cleaned_path, mda_path, start_markers, end_markers,
                 punct_limit):
    """Copy the MD&A section of a cleaned filing into ``mda_path``.

    Scans ``cleaned_path`` line by line. Copying starts on a line that
    contains one of ``start_markers`` and stops after a line that contains
    one of ``end_markers`` (both compared against the lower-cased line).
    Only lines longer than 20 characters, containing a space, and with
    fewer than ``punct_limit`` punctuation marks (per ``count_punct``,
    defined elsewhere in this file) are written and scored.

    Returns:
        ``(filing_date, sentiment)`` where ``filing_date`` is the
        ``YYYY-MM-DD`` period of report found anywhere in the file (or
        ``None`` if absent) and ``sentiment`` is the list of ``"compound"``
        scores returned by ``sid.polarity_scores`` for each copied line.
    """
    filing_date = None
    sentiment = []
    in_mda = False
    with open(cleaned_path, "r") as cleaned, open(mda_path, "w") as mda:
        for line in cleaned:
            lowered = line.lower()
            if in_mda or any(marker in lowered for marker in start_markers):
                # Skip short or punctuation-heavy lines (tables, rules,
                # table-of-contents entries). A heading that fails this
                # check does not open the section.
                if (len(line) > 20 and count_punct(line) < punct_limit
                        and " " in line):
                    mda.write(line)
                    in_mda = True
                    sentiment.append(sid.polarity_scores(line)["compound"])
                if any(marker in lowered for marker in end_markers):
                    in_mda = False
            # The period-of-report line sits in the filing header, outside
            # the MD&A section, so it is checked for every line.
            parsed_date = _parse_period_of_report(lowered)
            if parsed_date is not None:
                filing_date = parsed_date
    return filing_date, sentiment


def _collect_filings(ticker, directory, filing_type, start_markers,
                     end_markers, punct_limit, net_income,
                     tag_with_sequence=False, echo_path=False):
    """Clean every ``.txt`` filing in ``directory`` and build result rows.

    For each filing this writes ``<directory>-cleaned/<ticker>-<year>-<tag>.txt``
    (the html2text rendering) and ``<directory>-MDA/...-<tag>-MDA.txt`` (the
    extracted MD&A lines), where ``<tag>`` is ``filing_type`` without the dash.

    Args:
        ticker: Ticker symbol used in output file names.
        directory: Folder holding the downloaded filings of one type.
        filing_type: Value stored in the "Filing Type" column, e.g. "10-K".
        start_markers: Lower-case phrases that open the MD&A section.
        end_markers: Lower-case phrases that close the MD&A section.
        punct_limit: Exclusive upper bound on ``count_punct(line)``.
        net_income: Value stored in the "Net Income" column.
        tag_with_sequence: When true, ``filename[14:20]`` is added to the
            MD&A file name so several filings from one year do not collide.
        echo_path: When true, print the cleaned-file path before writing.

    Returns:
        A list of row dicts, one per filing that was processed and had a
        period-of-report date.
    """
    rows = []
    # A ticker may have no filings of this type; treat that as "nothing
    # to do" rather than failing before the CSV is written.
    if not os.path.isdir(directory):
        return rows

    tag = filing_type.replace("-", "")
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        # File names look like <cik>-<yy>-<sequence>.txt; the middle field
        # is used as the filing year.
        year_match = re.search(r'-(.*)-', filename)
        if year_match is None:
            continue
        year = year_match.group(1)

        with open(directory + "/" + filename) as raw_file:
            raw_html = raw_file.read()

        stem = ticker + "-" + year + "-"
        # NOTE(review): the cleaned name has no sequence number, so several
        # filings from the same year overwrite each other's cleaned file.
        cleaned_path = directory + "-cleaned" + "/" + stem + tag + ".txt"
        if tag_with_sequence:
            mda_path = (directory + "-MDA" + "/" + stem + filename[14:20]
                        + "-" + tag + "-MDA.txt")
        else:
            mda_path = directory + "-MDA" + "/" + stem + tag + "-MDA.txt"

        if echo_path:
            print(cleaned_path)
        os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)

        # If a filing cannot be converted or written, move on to the next.
        try:
            with open(cleaned_path, "w") as cleaned:
                cleaned.write(html2text.html2text(raw_html))
            os.makedirs(os.path.dirname(mda_path), exist_ok=True)
            filing_date, sentiment = _extract_mda(
                cleaned_path, mda_path, start_markers, end_markers,
                punct_limit)
        except (NotImplementedError, UnicodeEncodeError):
            print("not implemented error for " + year)
            continue

        # Without a period of report there is no date to record; skip the
        # row instead of reusing the date of a previously processed filing.
        if filing_date is None:
            continue

        rows.append({
            "Filing Type": filing_type,
            "Filing Year": year,
            "Filing Date": filing_date,
            "Net Income": net_income,
            "MDA Sentiment Analysis": sentiment,
        })
    return rows


def getFilings(ticker):
    """Download a ticker's 10-K/10-Q filings and score their MD&A sections.

    Filings are downloaded below ``<cwd>/sec_edgar_filings/<ticker>/``.
    Each ``.txt`` filing is converted with html2text, its Management's
    Discussion and Analysis section is extracted, and every extracted line
    is scored with ``sid.polarity_scores`` (``sid`` and ``count_punct`` are
    defined elsewhere in this file). One row per filing is written to
    ``sec_processed_filings/<ticker>-SEC-Information.csv``.

    Args:
        ticker: Ticker symbol passed to the downloader and used in all
            output file names.
    """
    working_dir = os.getcwd()

    # Get all 10-K and 10-Q filings for the ticker.
    dl = Downloader(working_dir)
    dl.get_10k_filings(ticker)
    dl.get_10q_filings(ticker)

    filings_root = working_dir + "/sec_edgar_filings/" + ticker

    # This is a placeholder value that was never resolved.
    net_income = True

    rows = _collect_filings(
        ticker,
        filings_root + "/10-K",
        "10-K",
        start_markers=("discussion and analysis of",
                       "management's discussion and analysis"),
        end_markers=("financial statements and supplementary data",
                     "statements and supplementary"),
        punct_limit=4,
        net_income=net_income,
    )
    rows += _collect_filings(
        ticker,
        filings_root + "/10-Q",
        "10-Q",
        start_markers=("s discussion and analysis of",
                       "management's discussion and analysis"),
        end_markers=("controls and procedures", "in witness whereof",
                     "item 4."),
        punct_limit=5,
        net_income=net_income,
        tag_with_sequence=True,
        echo_path=True,
    )

    # Build the frame once from the collected rows (DataFrame.append is
    # gone in pandas 2.x and was quadratic when called per row).
    SECInfo = pd.DataFrame(rows, columns=[
        "Filing Type", "Filing Year", "Filing Date", "Net Income",
        "MDA Sentiment Analysis"
    ])

    # Convert the DataFrame to a CSV for later use.
    os.makedirs("sec_processed_filings", exist_ok=True)
    SECInfo.to_csv("sec_processed_filings/" + ticker + "-SEC-Information.csv")