def check_directory(directory, check_type):
    '''Checks to see if a passed directory path exists.

    Parameters
    ----------
    directory (string / os.path):
        Directory to be checked.
    check_type (string):
        One of the argparse flags; used only in the error message.

    Returns
    -------
    directory (string / os.path):
        Absolute path returned if valid, or exception if invalid.
    '''
    resolved_path = os.path.abspath(directory)
    # Guard clause: hand back the resolved path as soon as it checks out.
    if os.path.isdir(resolved_path):
        return (resolved_path)
    util.critical(
        'Invalid --{0} location. Please ensure directory exists before proceeding:\n\t{1}'
        .format(check_type, directory))
def lane_merged_subfolder(working_directory):
    '''Creates a "lane_merged" subfolder within the working directory.

    Parameters
    ----------
    working_directory (string / os.path):
        Location of input files.

    Returns
    -------
    Boolean (True / False):
        Can user place files within created subfolder?
    subfolder (string / os.path):
        Full path of created subfolder.
    '''
    subfolder = os.path.join(working_directory, 'lane_merged')
    # Bug fix: the log message previously announced "lane_merged_files"
    # although the directory actually created is "lane_merged".
    util.info('Attempting to create "lane_merged" folder within {0}'.format(working_directory))
    try:
        os.mkdir(subfolder)
    except OSError:
        # EAFP: attempt the mkdir, then distinguish "already exists"
        # (usable) from a genuine failure (permissions, missing parent).
        if os.path.isdir(subfolder):
            util.info('Directory already exists: {0}'.format(subfolder))
            return (True, subfolder)
        util.critical('Unable to create directory: {0}. Cannot proceed further'.format(subfolder))
        return (False, subfolder)
    util.info('Subfolder creation successful: {0}'.format(subfolder))
    return (True, subfolder)
def gzip_file_list(working_directory):
    '''Gets list of all .fq.gz files in the working directory (except
    .lostreads.fq.gz files).

    Parameters
    ----------
    working_directory (string / os.path):
        Directory to search for fq.gz files.

    Returns
    -------
    fastq_gz_files (list):
        List object containing .fq.gz filenames (i.e. sample files).
    '''
    list_files = os.listdir(working_directory)
    # Keep sample files (*.fq.gz) but drop lost-reads files, which carry
    # the longer .lostreads.fq.gz suffix.
    fastq_gz_files = [
        filename for filename in list_files
        if filename.endswith('.fq.gz')
        and not filename.endswith('.lostreads.fq.gz')
    ]
    # Bug fix: the original tested len(list_files) == 0, so a non-empty
    # directory containing no *.fq.gz samples slipped through silently.
    # (Also corrects the FILENMAE typo in the message.)
    if not fastq_gz_files:
        util.critical(
            'There are no gzipped fastq (FILENAME.fq.gz) files within specified directory'
        )
    util.info('List of gzipped fastq files read into script')
    return (fastq_gz_files)
def glob_lister(submission_form):
    '''Reads in the CRUKCI_SLX_Submission.xlsx file and returns the SLX
    number and fq.gz file prefixes.

    Parameters
    ----------
    submission_form (string / os.path):
        File location for glob variables. Preferably full path.

    Returns
    -------
    samples_information (pandas dataframe):
        Pandas dataframe containing the file index (prefix).
    slx_id (string):
        SLX ID from CRUKCI.
    '''
    if os.path.isfile(submission_form):
        util.info('Submission form found at {0}'.format(submission_form))
    else:
        # Bug fix: this branch referenced the undefined name
        # working_directory, raising NameError instead of reporting the
        # missing form; derive the expected directory from the given path.
        error_message = 'Submission form not found, please ensure this file is in\n-->\t{0}'.format(
            os.path.dirname(submission_form))
        util.critical(error_message)
    util.info('Reading in index file')
    # First read locates the header rows: the sheet carries free-form
    # header information above the sample table.
    excel_file = pandas.read_excel(submission_form)
    util.info('Index file read')
    # NOTE(review): assumes column 1 holds the row labels ('SLX Identifier',
    # 'Name') and 'Unnamed: 2' holds their values — confirm against the
    # CRUKCI_SLX_Submission.xlsx layout.
    slx_id_raw = excel_file[excel_file[1] == 'SLX Identifier']
    slx_row = slx_id_raw.index[0]
    raw_slx_value = excel_file.iloc[slx_row]['Unnamed: 2']
    # Normalise the identifier so it always carries the SLX- prefix.
    if raw_slx_value.startswith('SLX-'):
        slx_id = raw_slx_value
    else:
        slx_id = 'SLX-{0}'.format(raw_slx_value)
    # The sample table starts one row below the 'Name' header row.
    excel_file_find_start = excel_file[excel_file[1] == 'Name']
    excel_file_start = int(excel_file_find_start.index[0]) + 1
    samples_information = pandas.read_excel(submission_form,
                                            header=excel_file_start,
                                            usecols=['Name', 'Index'])
    # Swap '-' for '_' in the index tags — presumably to match the fq.gz
    # filename tokens; TODO confirm against the downloaded filenames.
    samples_information['Index'] = samples_information['Index'].str.replace(
        '-', '_')
    return (samples_information, slx_id)
def check_rRNA_library(rRNA_genome_path):
    '''Checks to see if a passed rRNA library exists.

    Parameters
    ----------
    rRNA_genome_path (string / os.path):
        Filepath to be checked.

    Returns
    -------
    rRNA_genome_path (string / os.path):
        Path returned if valid or error if invalid - terminating this
        script.
    '''
    library_directory, _library_file = os.path.split(rRNA_genome_path)
    directory_checked = check_directory(library_directory, 'rRNA_library')
    if not os.path.isfile(rRNA_genome_path):
        util.critical(
            'Unable to locate rRNA library (.fa file) with specified rRNA_library path'
        )
    # The bowtie2 index files share the library's basename (extension
    # stripped) and carry a .bt2 suffix; at least one must be present in
    # the same directory as the .fa file.
    library_stem = os.path.splitext(rRNA_genome_path)[0]
    index_prefix = os.path.split(library_stem)[1]
    index_present = any(
        entry.startswith(index_prefix) and entry.endswith('.bt2')
        for entry in os.listdir(directory_checked))
    if index_present:
        util.info('rRNA library path appears valid')
        return (rRNA_genome_path)
    util.critical(
        'Unable to locate rRNA library (bt2 files) within specified rRNA_library path: {0}'
        .format(library_directory))
    return ()
def lane_merger(working_directory, files_to_merge, lane_tags):
    '''Performs the merging of the input files. zcat to read in, pigz to
    create the merged file - multi-threaded alternative to gzip for faster
    compression.

    Parameters
    ----------
    working_directory (string / os.path):
        Location of input files.
    files_to_merge (Dictionary):
        Keys (string): The glob patterns input (same as with the "globber"
        function"). Values: (list): Files to be merged.
    lane_tags (List):
        Tags that identify samples' lane e.g. "s_1 s_2".

    Returns
    -------
    No return
    '''
    # Create (or reuse) the lane_merged subfolder; bail out if unusable.
    subfolder_check, subfolder = lane_merged_subfolder(working_directory)
    if subfolder_check == False:
        util.critical('Terminating script early')
    util.info('Beginning lane merger for files')
    for index_files in files_to_merge:
        util.info('Merging {0}'.format(index_files))
        input_files = files_to_merge[index_files]
        # merged_filename is defined elsewhere in this module; presumably it
        # strips the lane tags to build the merged output name — TODO confirm.
        output_file_name_pre = merged_filename(input_files, lane_tags, subfolder)
        output_file_name = os.path.join(subfolder, output_file_name_pre)
        # Shell-style pipeline: zcat decompresses and concatenates the
        # inputs, pigz recompresses the stream into the merged output file.
        with open(output_file_name, 'wb') as outfile:
            zcat_files = util.run(['zcat'] + input_files, stdout = subprocess.PIPE)
            pigz_output = util.run(['pigz', '-c'], stdin = zcat_files.stdout, stdout = outfile)
            # Wait for zcat; NOTE(review): pigz_output is never waited on,
            # so pigz may still be flushing when this returns — verify
            # util.run's semantics before relying on the file being complete.
            zcat_files.wait()
        util.info('Output file {0} created'.format(output_file_name))
    util.info('All lane files merged')
def glob_lister(submission_form):
    '''Reads in the CRUKCI_SLX_Submission.xlsx file and returns a list of
    the glob variables to be used by the "globber" function (The "Index"
    column) in the file.

    NOTE(review): this is the second definition of glob_lister in this
    module and shadows the earlier one, whose (dataframe, slx_id) return
    signature the main block expects — one of the two appears to be legacy
    code; confirm which is intended and remove the other.

    Parameters
    ----------
    submission_form (string / os.path):
        File location for glob variables. Preferably full path.

    Returns
    -------
    index_list (List):
        Containing glob variables.
    '''
    if os.path.isfile(submission_form):
        util.info('Submission form found at {0}'.format(submission_form))
    else:
        # Bug fix: this branch referenced the undefined name
        # working_directory, raising NameError instead of reporting the
        # missing form; derive the expected directory from the given path.
        error_message = 'Submission form not found, please ensure this file is in\n-->\t{0}'.format(
            os.path.dirname(submission_form))
        util.critical(error_message)
    util.info('Reading in index file')
    # First read locates the header row to skip past the header information.
    excel_file = pandas.read_excel(submission_form)
    util.info('Index file read')
    # The sample table starts one row below the 'Name' header row.
    # NOTE(review): this searches column 0 where the sibling definition
    # searches column 1 — confirm against the actual sheet layout.
    excel_file_find_start = excel_file[excel_file[0] == 'Name']
    excel_file_start = int(excel_file_find_start.index[0]) + 1
    samples_information = pandas.read_excel(submission_form,
                                            header=excel_file_start,
                                            usecols=['Index'])
    # Swap '-' for '_' in the index tags to match the fq.gz filename tokens.
    samples_information['Index'] = samples_information['Index'].str.replace('-', '_')
    index_list = list(samples_information['Index'])
    return (index_list)
) ftp_username = input('Enter FTP server\'s username: '******'Enter FTP server\'s password: '******'working_directory') samples_information, slx_id = glob_lister( os.path.abspath(args.submission_form)) ftp_server = ftp_server_connection(ftp_username, ftp_password) downloaded_files = ftp_download_files(ftp_server, slx_id) ftp_server.quit() retries = 0 md5_check = False while md5_check == False: if retries >= 3: util.critical( '{0} retries at downloading files have failed. Please try again later' .format(retries)) md5_check = file_md5_check(downloaded_files) retries += 1 samples_csv_writer(working_directory, slx_id, samples_information) util.info('Run complete')