def getLinksFromHistory() -> list:
    """
    Get file links from the links_history file.

    The history file is looked up one level above the directory returned by
    ``credentials.getCommitteesDirectory()``. Every line is stripped of
    surrounding whitespace and blank lines are skipped.

    Returns
    -------
    list
        A list of links, in the order they appear in the file.

    Raises
    ------
    OSError
        Propagated from ``open`` when the history file cannot be read
        (for example ``FileNotFoundError`` when it does not exist).
    """
    committee_folder = credentials.getCommitteesDirectory()
    # os.path.join only inserts a separator when one is missing, so the path
    # is correct whether or not the configured directory ends with a slash.
    links_history_file = os.path.join(committee_folder, "..", "links_history.txt")

    with open(links_history_file, "r") as history_file:
        stripped_links = (link.strip() for link in history_file)
        return [link for link in stripped_links if link]
def buildCommittees():
    """
    Create a folder for every committee.

    Each folder is created inside the directory returned by
    ``credentials.getCommitteesDirectory()``. Missing parent directories are
    created as well, and folders that already exist are left untouched.
    """
    # NOTE(review): the trailing "#" markers are carried over from the
    # original list; what they flag is not documented in this file.
    committee_names = [
        "Accounting Issues Committee",  #
        "Best Practices Committee",
        "Audit Committee",  #
        "Corporate Governance",  #
        "Finance Committee",
        "Bylaws Committee",  #
        "Communications Committee",  #
        "Coordinating Committee Chairs Committee",  #
        "Core Services Committee",
        "Education Committee",  #
        "Information Systems Committee",  #
        "IT Advisory and Governance",  #
        "Legal Committee",  #
        "Member Committee Advisory Committee",  #
        "NCIGF Services Committee",  #
        "Nominating Committee",
        "Operations Committee",  #
        "Public Policy Committee",  #
        "Site Selection Committee",  #
        "Special Funding Committee",  #
        "NCIGF Board",  #
    ]

    committee_directory = Path(credentials.getCommitteesDirectory())
    for committee_name in committee_names:
        committee_subfolder = committee_directory / committee_name
        # exist_ok must be a real boolean; the string "True" only worked
        # because any non-empty string is truthy.
        committee_subfolder.mkdir(parents=True, exist_ok=True)
def cleanCommitteesFolder():
    """
    Empty the committees directory so the script has a fresh start.

    Files that sit directly inside the committees directory are deleted.
    Sub-folders are kept, but every file and directory inside them is
    removed. Nothing happens when the committees directory does not exist.

    Failures while emptying a sub-folder are logged and skipped so the
    remaining entries are still cleaned.
    """
    committees_directory = credentials.getCommitteesDirectory()
    if not os.path.exists(committees_directory):
        return None

    for committee_folder in os.listdir(committees_directory):
        committee_folder_path = os.path.join(committees_directory, committee_folder)

        # Stray top-level files are removed outright.
        if os.path.isfile(committee_folder_path):
            os.unlink(committee_folder_path)
            continue

        for committee_file_name in os.listdir(committee_folder_path):
            committee_file_path = os.path.join(committee_folder_path, committee_file_name)
            try:
                if os.path.isfile(committee_file_path):
                    os.unlink(committee_file_path)
                elif os.path.isdir(committee_file_path):
                    shutil.rmtree(committee_file_path)
            except OSError:
                # Only filesystem errors are treated as best-effort; a bare
                # except would also swallow KeyboardInterrupt and SystemExit.
                logger.error("Could not clean old directory.")
def deleteHistory():
    """
    Delete the links_history archive file.

    The file is looked up one level above the directory returned by
    ``credentials.getCommitteesDirectory()``. Nothing happens when the file
    does not exist.
    """
    committee_folder = credentials.getCommitteesDirectory()
    # os.path.join only inserts a separator when one is missing, so the path
    # is correct whether or not the configured directory ends with a slash.
    links_history_file = os.path.join(committee_folder, "..", "links_history.txt")

    if os.path.exists(links_history_file):
        os.unlink(links_history_file)
def saveLinksFromTaxonomy(links:list):
    """
    Append links (such as those retrieved from getLinksFromTaxonomy) to the
    links_history file, one link per line.

    The file is looked up one level above the directory returned by
    ``credentials.getCommitteesDirectory()`` and is opened in append mode,
    so existing entries are preserved.

    Parameters
    ----------
    links : list
        A list of links (strings).
    """
    committee_folder = credentials.getCommitteesDirectory()
    # os.path.join only inserts a separator when one is missing, so the path
    # is correct whether or not the configured directory ends with a slash.
    links_history_file = os.path.join(committee_folder, "..", "links_history.txt")

    with open(links_history_file, "a") as history_file:
        # Use a separate generator instead of rebinding the ``links`` argument.
        history_file.writelines(link + "\n" for link in links)
def downloadFile(file_href:str):
    """
    Download the file behind a link into the committees directory.

    Args:
        file_href (str): Link to the file. The text after the last "/" is
            used as the local file name.

    Returns:
        str: Path of the downloaded file, or None when the file name
            contains "draft", the request fails, the file cannot be written,
            or the downloaded file is 1 byte or smaller.
    """
    file_name = file_href.split("/")[-1]

    # Reject drafts before touching the network; the name alone decides this.
    if "draft" in file_name.lower():
        logger.warning("Download failed. Ignoring files with 'draft' in the name.")
        return None

    try:
        file_request = session.get(file_href, allow_redirects=True, stream=True)
    except urllib3.util.ssl_.SSLError:
        # NOTE(review): if ``session`` is a requests session, failures may
        # surface as requests exceptions rather than this urllib3 one -
        # confirm which exception types should be handled here.
        logger.error("Download failed. Something went wrong with requesting the file.")
        # Without this return the code below would use an unbound response.
        return None

    committee_directory = credentials.getCommitteesDirectory()
    local_path = os.path.join(committee_directory, file_name)

    try:
        with open(local_path, mode="wb") as local_file:
            for file_chunk in file_request.iter_content(chunk_size=1024):
                if file_chunk:
                    local_file.write(file_chunk)

        # Measured after the file is closed so buffered data has been flushed.
        if os.path.getsize(local_path) > 1:
            return local_path

        logger.warning("Downloaded file will not be moved due to its tiny size.")
        return None
    except OSError:
        logger.warning("Failed to move file to committee. " + local_path)
        return None
    finally:
        # A streamed response holds its connection until it is fully read or
        # closed; close it explicitly so failures do not leak it.
        file_request.close()
def organizeFile(file_path:str):
    """
    Download a file, extract its text, and store both under the folder of
    the committee the file belongs to.

    The committee is taken from keywords in the file name when possible and
    otherwise from ``determineCommittee`` applied to the extracted text lines.

    Args:
        file_path (str): Link to the file; it is handed to downloadFile.

    Returns:
        str: Path of the written .txt file, or None when the download
            fails, the file has no extension, the extension is rejected,
            the text cannot be extracted, or no committee can be determined.
    """
    local_file_path = downloadFile(file_path)
    if local_file_path is None:
        return None

    local_file_name = os.path.basename(local_file_path)

    # Split on the LAST dot so names such as "minutes.v2.pdf" keep their
    # full stem and do not collide with "minutes.v1.pdf".
    local_file_pieces = local_file_name.rsplit(".", 1)
    if len(local_file_pieces) < 2:
        logger.warning(local_file_name + " Couldn't find file extension")
        return None
    local_file_name_no_extension, local_file_name_extension = local_file_pieces

    # Substring match on the lower-cased extension: "doc" also rejects
    # "docx", and upper-case variants such as "DOC" are rejected too.
    invalid_extensions = ["msg", "doc"]
    lowered_extension = local_file_name_extension.lower()
    if any(extension in lowered_extension for extension in invalid_extensions):
        logger.warning(local_file_name + " [" + local_file_name_extension + "] Not a valid file format")
        return None

    try:
        processed_text = textract.process(local_file_path)
    except Exception:
        logger.warning(local_file_name + " couldn't be processed into text")
        return None

    lowered_file_name = local_file_name.lower()
    if "aic" in lowered_file_name:
        committee_name = "Accounting Issues Committee/"
    elif "bod" in lowered_file_name or "directors" in lowered_file_name:
        committee_name = "NCIGF Board/"
    elif "mac" in lowered_file_name:
        committee_name = "Member Committee Advisory Committee/"
    elif "communication" in lowered_file_name:
        committee_name = "Communications Committee/"
    else:
        committee_name = determineCommittee(processed_text.splitlines())

    if committee_name is None:
        logger.warning(local_file_name + " Could not determine committee")
        return None

    committee_directory = credentials.getCommitteesDirectory()
    # os.path.join copes with committee names and directories with or
    # without a trailing slash.
    committee_path = os.path.join(committee_directory, committee_name)
    new_file_path_txt = os.path.join(committee_path, local_file_name_no_extension + ".txt")
    new_file_path_pdf = os.path.join(committee_path, local_file_name_no_extension + ".pdf")

    with open(new_file_path_txt, "wb") as new_file:
        new_file.write(processed_text)

    # NOTE(review): the downloaded file is stored with a ".pdf" suffix
    # whatever its original extension was - confirm this is intended.
    os.rename(local_file_path, new_file_path_pdf)
    return new_file_path_txt