def check_directory(directory, check_type):
    '''Check that a passed directory path exists.

    Parameters
    ----------
    directory (string / os.path):
        Directory to be checked.

    check_type (string):
        One of the argparse flags; only used to name the offending
        option in the error message.

    Returns
    -------
    directory (string / os.path):
        Absolute path of ``directory`` when it is an existing directory.
        Otherwise ``util.critical`` is called with an explanatory message
        (presumably terminating the script - confirm against ``util``);
        should that call return, the result is ``None``.
    '''
    resolved_path = os.path.abspath(directory)

    if not os.path.isdir(resolved_path):
        # The message shows the path exactly as the user supplied it.
        message = (
            'Invalid --{0} location. Please ensure directory exists before proceeding:\n\t{1}'
            .format(check_type, directory))
        util.critical(message)
        return None

    return resolved_path
def lane_merged_subfolder(working_directory):
    '''Create a "lane_merged" subfolder within the working directory.

    Parameters
    ----------
    working_directory (string / os.path):
        Location of input files.

    Returns
    -------
    Boolean (True / False):
        True when the subfolder was created or already exists as a
        directory, False when it could not be created.
    subfolder (string / os.path):
        Path of the subfolder (``working_directory`` joined with
        "lane_merged").
    '''
    subfolder = os.path.join(working_directory, 'lane_merged')
    # Log the name of the folder that is actually created ("lane_merged").
    util.info('Attempting to create "lane_merged" folder within {0}'.format(working_directory))

    try:
        os.mkdir(subfolder)
    except OSError:
        # mkdir also fails when the folder is already there; that case is
        # fine, anything else means the folder cannot be used.
        if os.path.isdir(subfolder):
            util.info('Directory already exists: {0}'.format(subfolder))
            return (True, subfolder)
        util.critical('Unable to create directory: {0}. Cannot proceed further'.format(subfolder))
        return (False, subfolder)

    util.info('Subfolder creation successful: {0}'.format(subfolder))
    return (True, subfolder)
def gzip_file_list(working_directory):
    '''Get list of all .fq.gz files in the working directory (except .lostreads.fq.gz files).

    Parameters
    ----------
    working_directory (string / os.path):
        Directory to search for fq.gz files.

    Returns
    -------
    fastq_gz_files (list):
        Filenames (not full paths) ending in ".fq.gz", in the order
        returned by ``os.listdir``. ``util.critical`` is called when no
        such file is found.
    '''
    fastq_gz_files = [
        filename for filename in os.listdir(working_directory)
        if filename.endswith('.fq.gz')
        and not filename.endswith('.lostreads.fq.gz')
    ]

    # Test the filtered list, not the raw directory listing: a directory
    # that holds only other files has no usable input either.
    if not fastq_gz_files:
        util.critical(
            'There are no gzipped fastq (FILENAME.fq.gz) files within specified directory'
        )

    util.info('List of gzipped fastq files read into script')

    return fastq_gz_files
def glob_lister(submission_form):
    '''Read the CRUKCI_SLX_Submission.xlsx file and return the sample table and the SLX number.

    Parameters
    ----------
    submission_form (string / os.path):
        Location of the submission form. Preferably full path.

    Returns
    -------
    samples_information (pandas dataframe):
        The "Name" and "Index" columns of the sample table, with every
        "-" in "Index" replaced by "_" (the fq.gz file prefixes).

    slx_id (string):
        SLX ID from CRUKCI, always starting with "SLX-".
    '''
    if os.path.isfile(submission_form):
        util.info('Submission form found at {0}'.format(submission_form))
    else:
        # Point the user at the folder the form was expected in; this
        # function has no working-directory argument to report instead.
        expected_location = os.path.dirname(os.path.abspath(submission_form))
        error_message = 'Submission form not found, please ensure this file is in\n-->\t{0}'.format(
            expected_location)
        util.critical(error_message)

    util.info('Reading in index file')
    # First pass over the whole sheet, used only to locate the SLX
    # identifier and the row where the sample table starts.
    excel_file = pandas.read_excel(submission_form)
    util.info('Index file read')

    # NOTE(review): assumes the sheet yields a column labelled 1 and one
    # labelled 'Unnamed: 2' - confirm against a real submission form.
    slx_id_raw = excel_file[excel_file[1] == 'SLX Identifier']
    slx_row = slx_id_raw.index[0]
    slx_value = excel_file.iloc[slx_row]['Unnamed: 2']
    if slx_value.startswith('SLX-'):
        slx_id = slx_value
    else:
        slx_id = 'SLX-{0}'.format(slx_value)

    # The row holding 'Name' is the header of the sample table; +1 because
    # the first read already consumed one sheet row as its own header.
    excel_file_find_start = excel_file[excel_file[1] == 'Name']
    excel_file_start = int(excel_file_find_start.index[0]) + 1

    samples_information = pandas.read_excel(submission_form,
                                            header=excel_file_start,
                                            usecols=['Name', 'Index'])
    samples_information['Index'] = samples_information['Index'].str.replace(
        '-', '_')

    return (samples_information, slx_id)
def check_rRNA_library(rRNA_genome_path):
    '''Check that a passed rRNA library exists.

    The folder part of the path is validated with ``check_directory``,
    the path itself must be a file, and the folder must hold at least one
    entry that starts with the file's extension-less name and ends in
    ".bt2".

    Parameters
    ----------
    rRNA_genome_path (string / os.path):
        Filepath to be checked.

    Returns
    -------
    rRNA_genome_path (string / os.path):
        The path, unchanged, when a matching ".bt2" entry is found.
        Otherwise ``util.critical`` is called (presumably terminating the
        script - confirm against ``util``); should that call return, an
        empty tuple is returned.
    '''
    library_folder = os.path.split(rRNA_genome_path)[0]
    validated_folder = check_directory(library_folder, 'rRNA_library')

    if not os.path.isfile(rRNA_genome_path):
        util.critical(
            'Unable to locate rRNA library (.fa file) with specified rRNA_library path'
        )

    folder_entries = os.listdir(validated_folder)

    # Name of the library file with its folder and extension removed.
    path_without_extension = os.path.splitext(rRNA_genome_path)[0]
    index_prefix = os.path.split(path_without_extension)[1]

    index_found = any(
        entry.startswith(index_prefix) and entry.endswith('.bt2')
        for entry in folder_entries)

    if index_found:
        util.info('rRNA library path appears valid')
        return rRNA_genome_path

    util.critical(
        'Unable to locate rRNA library (bt2 files) within specified rRNA_library path: {0}'
        .format(library_folder))

    return ()
def lane_merger(working_directory, files_to_merge, lane_tags):
    '''Perform the merging of the input files. zcat to read in, pigz to create
    the merged file - multi-threaded alternative to gzip for faster compression.

    Parameters
    ----------
    working_directory (string / os.path):
        Location of input files.
    files_to_merge (Dictionary):
        Keys (string):
            The glob patterns input (same as with the "globber" function).
        Values (list):
            Files to be merged.
    lane_tags (List):
        Tags that identify samples' lane e.g. "s_1 s_2".

    Returns
    -------
    No return. One merged file per key is written into the "lane_merged"
    subfolder of ``working_directory``.
    '''
    subfolder_check, subfolder = lane_merged_subfolder(working_directory)

    if not subfolder_check:
        util.critical('Terminating script early')

    util.info('Beginning lane merger for files')

    for index_files, input_files in files_to_merge.items():
        util.info('Merging {0}'.format(index_files))
        output_file_name_pre = merged_filename(input_files, lane_tags, subfolder)
        output_file_name = os.path.join(subfolder, output_file_name_pre)
        with open(output_file_name, 'wb') as outfile:
            # NOTE(review): assumes util.run returns a subprocess.Popen-like
            # object (it is used via .stdout and .wait()) - confirm.
            zcat_files = util.run(['zcat'] + input_files, stdout=subprocess.PIPE)
            pigz_output = util.run(['pigz', '-c'], stdin=zcat_files.stdout, stdout=outfile)
            # Drop this process's copy of the pipe so zcat is signalled
            # if pigz exits early instead of blocking forever.
            zcat_files.stdout.close()
            # Wait for the compressor too: the output file is only
            # complete once pigz has finished writing it.
            pigz_output.wait()
            zcat_files.wait()
        util.info('Output file {0} created'.format(output_file_name))
    util.info('All lane files merged')
def glob_lister(submission_form):
    '''Read the CRUKCI_SLX_Submission.xlsx file and return a list of the
    glob variables to be used by the "globber" function (the "Index" column
    in the file).

    Parameters
    ----------
    submission_form (string / os.path):
        Location of the submission form. Preferably full path.

    Returns
    -------
    index_list (List):
        Values of the "Index" column with every "-" replaced by "_".
    '''
    if os.path.isfile(submission_form):
        util.info('Submission form found at {0}'.format(submission_form))
    else:
        # Point the user at the folder the form was expected in; this
        # function has no working-directory argument to report instead.
        expected_location = os.path.dirname(os.path.abspath(submission_form))
        error_message = 'Submission form not found, please ensure this file is in\n-->\t{0}'.format(expected_location)
        util.critical(error_message)

    util.info('Reading in index file')
    # First pass over the whole sheet, used only to find the row where the
    # sample table starts so the header information can be skipped.
    excel_file = pandas.read_excel(submission_form)
    util.info('Index file read')

    # NOTE(review): assumes the sheet yields a column labelled 0 - confirm
    # against a real submission form.
    excel_file_find_start = excel_file[excel_file[0] == 'Name']
    # +1 because the first read already consumed one sheet row as its header.
    excel_file_start = int(excel_file_find_start.index[0]) + 1

    samples_information = pandas.read_excel(submission_form, header=excel_file_start, usecols=['Index'])
    samples_information['Index'] = samples_information['Index'].str.replace('-', '_')

    index_list = list(samples_information['Index'])

    return index_list
    )

    ftp_username = input('Enter FTP server\'s username: '******'Enter FTP server\'s password: '******'working_directory')

    samples_information, slx_id = glob_lister(
        os.path.abspath(args.submission_form))
    ftp_server = ftp_server_connection(ftp_username, ftp_password)
    downloaded_files = ftp_download_files(ftp_server, slx_id)
    ftp_server.quit()

    retries = 0
    md5_check = False
    while md5_check == False:
        if retries >= 3:
            util.critical(
                '{0} retries at downloading files have failed. Please try again later'
                .format(retries))

        md5_check = file_md5_check(downloaded_files)
        retries += 1

    samples_csv_writer(working_directory, slx_id, samples_information)

    util.info('Run complete')