def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Replace every run of whitespace in the header lines of a FASTA file
        with a single underscore. The fixed FASTA file is saved with the
        suffix '_h.fasta'.

        Header lines are the lines starting with '>'. Trailing whitespace is
        stripped from every line and every output line ends with a newline.

        Arguments:
            fasta_file_name: path of the FASTA file to repair.
            qc_set_func: set of QC repairs that still have to be run.
            checked_qc_set_func: set of QC checks that still have to be run.
            out_dir: optional output directory; when None the path returned
                by general.parse_filename() is used.

        Returns the tuple (path of the fixed file, qc_set_func without
        'header_whitespace', checked_qc_set_func without 'header_whitespace').
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    whitespace = re.compile(r'\s+')  # raw string: '\s' is not a valid str escape
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close both handles even if reading or writing fails.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove the qc step because it is corrected in the final FASTA file.
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(remove_set)  # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)
def fix_new_line(file, header_whitespace=False, out_dir=None):
    """
        Strips trailing whitespace (including any '\\n' or '\\r') from each
        line in file and ends each line (including the last line) with a
        new line character ('\\n').

        Arguments:
            file: path of the FASTA file to repair.
            header_whitespace: when True, runs of whitespace in header lines
                (lines starting with '>') are also replaced with '_' and the
                output suffix becomes '_ended_h.fasta' instead of
                '_ended.fasta'. When False the headers are left untouched.
            out_dir: optional output directory; when None the path returned
                by general.parse_filename() is used.

        Returns the path of the repaired file.
    """
    # make suffix match QC steps taken
    suffix = "_ended_h.fasta" if header_whitespace else "_ended.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + "/" + out_basename + suffix
    whitespace = re.compile(r"\s+")
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                # Only touch the headers when that repair was requested, so
                # the output matches the suffix chosen above.
                if header_whitespace and line.startswith(">"):
                    line = whitespace.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            # Close both handles even if reading or writing fails.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_new_line
def check_wrap(fasta_file_name):
    '''
        Check whether the sequence lines of a FASTA file look wrapped.

        Returns False as soon as a sequence line longer than 80 characters
        is found, or as soon as compare_lengths() rejects the line lengths
        of a record that has more than two sequence lines. Returns True
        otherwise, including for files in which no record has more than two
        sequence lines.

        This function only reads the file; it does not write anything.
    '''
    infile = general.open_file(fasta_file_name)
    try:
        lengths = []
        for line in infile:
            line = line.rstrip()
            if line.startswith('>'):
                # A new record starts: judge the record that just ended.
                if len(lengths) > 2 and not compare_lengths(lengths):
                    return False
                lengths = []
            else:
                if len(line) > 80:  # exit when you hit a sequence line > 80
                    return False
                lengths.append(len(line))
        # One last record to evaluate after falling off the end of the loop.
        if len(lengths) > 2:
            return compare_lengths(lengths)
        # Every earlier record passed and no line was too long.
        return True
    finally:
        infile.close()
# Example no. 4
# 0
def check_wrap(fasta_file_name):
    '''
        Returns True if none of the sequence lines in a FASTA file exceed
        80 characters and compare_lengths() accepts the line lengths of
        every record that has more than two sequence lines. Returns False
        as soon as either condition is violated.

        Records with one or two sequence lines are only checked against the
        80 character limit. No file is written by this check.
    '''
    header_pattern = re.compile('^>.*')
    infile = general.open_file(fasta_file_name)
    try:
        lengths = []
        lengths_OK = True  # nothing has failed until a check says so
        for line in infile:
            line = line.rstrip()
            if not header_pattern.match(line):
                if len(line) > 80:  # exit when you hit a sequence line > 80
                    return False
                lengths.append(len(line))
                continue
            # Header line: evaluate the record that just finished.
            if len(lengths) > 2:  # multiple lines to compare
                lengths_OK = compare_lengths(lengths)
                if not lengths_OK:
                    return False
            lengths = []

        # One last set to evaluate after you fall off the end of the loop
        if len(lengths) > 2:
            lengths_OK = compare_lengths(lengths)
        return lengths_OK
    finally:
        infile.close()  # runs on every return path above
# Example no. 5
# 0
def fix_headers(fasta_file_name,
                qc_set_func,
                checked_qc_set_func,
                out_dir=None):
    '''
        Remove white spaces from the headers of a FASTA file by replacing
        each run of whitespace with '_'. Fixed FASTA file is saved with the
        suffix '_h.fasta'.

        Every line is stripped of trailing whitespace and written back with
        a single trailing newline; only lines starting with '>' are
        otherwise modified.

        Arguments:
            fasta_file_name: path of the FASTA file to repair.
            qc_set_func: set of pending QC repairs.
            checked_qc_set_func: set of pending QC checks.
            out_dir: optional directory for the output file.

        Returns (output path, qc_set_func, checked_qc_set_func) where
        'header_whitespace' has been removed from both sets.
    '''
    out_path, out_basename, _out_ext = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    header_pattern = re.compile('^>.*')
    whitespace_run = re.compile(r'\s+')
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                cleaned = line.rstrip()
                if header_pattern.match(cleaned):
                    cleaned = whitespace_run.sub('_', cleaned)
                fixed_fasta.write(cleaned + '\n')
        finally:
            fixed_fasta.close()
    finally:
        # The input is released even when opening or writing the output fails.
        broken_fasta.close()
    # This qc step is finished: it is corrected in the final FASTA file.
    remove_set = set(['header_whitespace'])
    return (file_with_header,
            qc_set_func.difference(remove_set),  # skip finished repairs
            checked_qc_set_func.difference(remove_set))  # skip finished checks
def parse_file(read_list_file, single):
    '''
        Parse input read_list file. Check that it has the correct number of
        whitespace delimited columns and get list of forwards and/or
        reverses.

        Arguments:
            read_list_file: path of the plain text read list.
            single: True for single end reads. Each non-blank line is then
                split on commas into the forward list. Otherwise each
                non-blank line must hold exactly two columns: the comma
                separated forward files, then the comma separated reverse
                files.

        Returns (forwards, False) for single end reads and
        (forwards, reverses) for paired reads. Only the last non-blank line
        of the file contributes to the returned lists.

        For paired reads an error is logged and sys.exit(0) is called when a
        line does not have exactly two columns or when the forward and
        reverse lists differ in length.
    '''
    forwards = []
    reverses = []
    read_list = general.open_file(read_list_file)
    try:
        if single:
            for line in read_list:
                # Strip the line ending so it does not end up in a file name.
                line = line.rstrip()
                if line:
                    forwards = line.split(',')
            return (forwards, False)
        try:
            for line in read_list:
                line = line.rstrip()
                if line:
                    # Raises ValueError unless there are exactly two columns.
                    (forward, reverse) = re.split(r'\s+', line)
                    forwards = forward.split(',')
                    reverses = reverse.split(',')
        except ValueError as e:
            log.error('"%(e)s... Use the -s --single flag to indicate single end Illumina reads or make sure to separate you comma separated list of foward and reverse reads with a single tab in your plain text read_list file."' % {'e': e})
            # NOTE(review): exit status 0 signals success to the shell; kept
            # for backward compatibility, confirm whether it should be non-zero.
            sys.exit(0)
        if len(forwards) != len(reverses):
            log.error('Exiting because the number of forward read FASTQ files does not equal the number of reverse read FASTQ files. This may indicate that your read list is not properly formatted. It could indicate that you should use the -s --single flag to for single end Illumina reads or make sure to separate you comma separated list of forward and reverse reads with a single tab in your plain text read_list file.')
            sys.exit(0)
        return (forwards, reverses)
    finally:
        # Runs on return and on sys.exit(), so the file is always closed.
        read_list.close()
def check_headers(fasta_file_name):
    '''
        Check if FASTA headers contain white spaces that break Trimmomatic
        and some other bioinfo tools. Returns False as soon as a header line
        (a line starting with '>') contains whitespace once its trailing
        whitespace has been stripped. Returns True if no header contains
        whitespace.
    '''
    whitespace = re.compile(r'\s')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            line = line.rstrip()
            if line.startswith('>') and whitespace.search(line):
                return False
        return True
    finally:
        infile.close()  # also runs on the early return
def check_header_pattern(file):
    '''
        Report whether a FASTA file begins with a header line. Returns True
        if the first line starts with '>'. Returns False if the first line
        starts with any other character or is empty.
    '''
    infile = general.open_file(file)
    first_line = infile.readline()
    infile.close()
    # A match object is truthy and a failed match is None.
    return bool(re.compile('^>.*').match(first_line))
def check_header_pattern(file):
    """
        Check if FASTA file begins with a '>'. Only the first line is read.
        Returns True if that line begins with a '>' and False if it starts
        with any other character.
    """
    handle = general.open_file(file)
    opening_line = handle.readline()
    handle.close()
    starts_with_header = re.match("^>.*", opening_line) is not None
    return starts_with_header
# Example no. 10
# 0
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when 'header_whitespace' is in qc_set_func; in
        that case runs of whitespace in the headers are also replaced
        with '_'.

        Arguments:
            fasta_file_name: path of the FASTA file to wrap.
            qc_set_func: set of QC repairs that still have to be run.
            checked_qc_set_func: set of QC checks that still have to be run.
            out_dir: optional output directory; when None the path returned
                by general.parse_filename() is used.

        Returns the tuple (path of the wrapped file, qc_set_func without
        'wrap', 'new_line' and 'header_whitespace', checked_qc_set_func
        without 'wrap').
    '''
    width = 60
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap_h.fasta' if fix_whitespace else '_wrap.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence read but not yet written (always < width)
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    if dna:
                        # print remaining sequence before the header
                        fixed_fasta.write(dna + '\n')
                    if fix_whitespace:
                        line = whitespace.sub('_', line)
                    fixed_fasta.write(line + '\n')
                    dna = ''  # Reset DNA
                else:
                    dna += line
                    # Emit every complete chunk by index. Re-slicing the
                    # buffer after each chunk would copy the remainder every
                    # time, which is quadratic for a long unwrapped line.
                    complete = len(dna) - len(dna) % width
                    for start in range(0, complete, width):
                        fixed_fasta.write(dna[start:start + width] + '\n')
                    dna = dna[complete:]
            # Catch the tail of the last record.
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    # Remove all three qc steps because all are corrected in the final file.
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
# Example no. 11
# 0
def check_headers(file):
    """
        Check if FASTA headers contain white spaces that break Trimmomatic
        and some other bioinfo tools. Returns True if no header line has
        whitespace in it. Returns False if at least one header line does.

        Trailing whitespace is stripped from each line before the test, so
        it does not count.
    """
    header_pattern = re.compile("^>.*")
    has_whitespace = re.compile(r"\s")
    infile = general.open_file(file)
    try:
        # any() stops at the first offending header, like an early return.
        found = any(
            header_pattern.match(stripped) and has_whitespace.search(stripped)
            for stripped in (line.rstrip() for line in infile))
    finally:
        infile.close()
    return not found
# Example no. 12
# 0
def check_header_pattern(fasta_file_name):
    '''
        Test the first line of a FASTA file for a leading '>'. Returns True
        when the first line begins with '>' and False when it begins with
        any other character.
    '''
    infile = general.open_file(fasta_file_name)
    first_line = infile.readline()
    infile.close()
    # Guard clause: anything that is not a header line fails the check.
    if not re.compile('^>.*').match(first_line):
        return False
    return True
# Example no. 13
# 0
def check_headers(fasta_file_name):
    '''
        Check if FASTA headers contain white spaces that break Trimmomatic
        and some other bioinfo tools. Returns False if any header has
        whitespace inside it (trailing whitespace is ignored). Returns True
        if no header has whitespace.
    '''
    # '>' followed by non-whitespace and then a whitespace character: a
    # header line with whitespace somewhere inside it.
    spaced_header = re.compile(r'^>\S*\s')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            if spaced_header.match(line.rstrip()):
                return False
    finally:
        infile.close()
    return True
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta'.

        When 'header_whitespace' is in qc_set_func the suffix is
        '_wrap_h.fasta' and each run of whitespace in a header line is
        replaced with '_'. Header lines are written as soon as they are
        read; sequence is written in 60 character lines as it accumulates.

        Arguments:
            fasta_file_name: path of the FASTA file to wrap.
            qc_set_func: set of pending QC repairs.
            checked_qc_set_func: set of pending QC checks.
            out_dir: optional directory for the output file.

        Returns (output path, qc_set_func, checked_qc_set_func) with 'wrap',
        'new_line' and 'header_whitespace' removed from the repairs and
        'wrap' removed from the checks.
    '''
    repair_headers = 'header_whitespace' in qc_set_func
    suffix = '_wrap.fasta'
    if repair_headers:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    header_pattern = re.compile('^>.*')
    whitespace_run = re.compile(r'\s+')
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''
            for line in infile:
                line = line.rstrip()
                if header_pattern.match(line):
                    header = line
                    if repair_headers:
                        header = whitespace_run.sub('_', header)
                    if dna:
                        fixed_fasta.write(dna + '\n')  # finish previous record
                    fixed_fasta.write(header + '\n')
                    dna = ''  # Reset DNA
                    continue
                dna = dna + line
                # Walk an offset through the buffer and cut it once at the
                # end; slicing the buffer per chunk is quadratic on long lines.
                offset = 0
                while len(dna) - offset >= 60:
                    fixed_fasta.write(dna[offset:offset + 60] + '\n')
                    offset += 60
                dna = dna[offset:]
            # Catch the last record.
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        # Both handles are closed even if wrapping fails part way through.
        fixed_fasta.close()
    # Remove all three qc steps because all are corrected in the final file.
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    checked_remove_set = set(['wrap'])
    return (file_with_wrapping,
            qc_set_func.difference(remove_set),  # skip finished repairs
            checked_qc_set_func.difference(checked_remove_set))  # skip finished checks
# Example no. 15
# 0
def check_new_line(file):
    """
        Test whether the last line of a FASTA file ends in the standard new
        line character ('\\n'). Returns True if it does and False if it does
        not, which includes a last line ending in the less common '\\r'
        character and a file without any lines.
    """
    infile = general.open_file(file)
    final_line = ""
    for final_line in infile:
        pass  # only the last line matters
    infile.close()
    # Slicing keeps the empty-file case safe: "" has no last character.
    return final_line[-1:] == "\n"
def check_iupac(fasta_file_name):
    '''
        Check if FASTA file contains non-IUPAC characters in sequence lines.
        Returns False if a character outside the accepted set is found
        (after logging the first such character) and True if none are found.

        The accepted set is the letters a-y without j, in both cases, plus
        '-' and '*'. Lines starting with '>' are skipped.
    '''
    iupac_set = set('abcdefghiklmnopqrstuvwxy'
                    'ABCDEFGHIKLMNOPQRSTUVWXY'
                    '-*')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            if line.startswith('>'):
                continue
            for char in line.rstrip():
                if char not in iupac_set:  # check each character against IUPAC set
                    log.error('\tError: %(char)s in sequence line' % {'char': char})
                    return False
        return True
    finally:
        infile.close()  # previously the file was never closed
def fix_wrap(file, header_whitespace=False, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when header_whitespace is True.

        The sequence lines of each record are joined and cut into fixed 60
        character lines. Hyphens and other characters never influence where
        a line is broken.

        Arguments:
            file: path of the FASTA file to wrap.
            header_whitespace: when True, each run of whitespace in a header
                line is replaced with '_'.
            out_dir: optional output directory; when None the path returned
                by general.parse_filename() is used.

        A record is only written when it has sequence; a header that is
        followed directly by another header is dropped. Sequence after the
        last header is only written if a header was seen.

        Returns the path of the wrapped file.
    '''
    width = 60
    suffix = '_wrap_h.fasta' if header_whitespace else '_wrap.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')

    def write_record(handle, header, dna):
        # Write one header line followed by the sequence in fixed chunks.
        handle.write(header + '\n')
        for start in range(0, len(dna), width):
            handle.write(dna[start:start + width] + '\n')

    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(file)
        try:
            header = ''
            pieces = []  # sequence lines of the current record
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    dna = ''.join(pieces)
                    if dna:
                        # Write each record as soon as it is complete rather
                        # than holding the whole file in memory.
                        write_record(fixed_fasta, header, dna)
                    pieces = []
                    header = line
                    if header_whitespace:
                        header = whitespace.sub('_', header)
                else:
                    pieces.append(line)
            # Catch the last record
            dna = ''.join(pieces)
            if dna and header:
                write_record(fixed_fasta, header, dna)
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    return file_with_wrapping
# Example no. 18
# 0
def fix_wrap(file, header_whitespace=False, out_dir=None):
    """
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when header_whitespace is True.

        Arguments:
            file: path of the FASTA file to wrap.
            header_whitespace: when True, each run of whitespace in a header
                line is replaced with '_'.
            out_dir: optional output directory; when None the path returned
                by general.parse_filename() is used.

        Every record is written as its header followed by the sequence cut
        into fixed 60 character lines. A header without sequence is written
        on its own, and an empty input produces an empty output. Sequence
        found before the first header is written after an empty header line.

        Returns the path of the wrapped file.
    """
    width = 60
    suffix = "_wrap_h.fasta" if header_whitespace else "_wrap.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + "/" + out_basename + suffix
    whitespace = re.compile(r"\s+")

    def write_record(handle, header, dna):
        # Nothing has been read yet: no header and no sequence to emit.
        if header is None and not dna:
            return
        handle.write((header or "") + "\n")
        # Fixed-width slices; the contents never move the break points.
        for start in range(0, len(dna), width):
            handle.write(dna[start:start + width] + "\n")

    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(file)
        try:
            header = None  # None until the first header line is seen
            pieces = []  # sequence lines of the current record
            for line in infile:
                line = line.rstrip()
                if line.startswith(">"):
                    # No look-ahead with next(): a header at the very end of
                    # the file, or two headers in a row, are handled like
                    # any other record.
                    write_record(fixed_fasta, header, "".join(pieces))
                    header = line
                    if header_whitespace:
                        # Gets rid of whitespace in the headers
                        header = whitespace.sub("_", header)
                    pieces = []
                else:
                    pieces.append(line)
            # For end of file: flush the last record.
            write_record(fixed_fasta, header, "".join(pieces))
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    return file_with_wrapping
# Example no. 19
# 0
def check_wrap(file):
    """
        Check whether the sequence lines of a FASTA file are wrapped
        consistently.

        Returns False if any sequence line exceeds 80 characters. For every
        record with more than two sequence lines, all lines except the last
        must have the same length as the first sequence line of the first
        such record; otherwise False is returned. Returns True if no
        violation is found.

        This function only reads the file; it does not write anything.
    """
    header_pattern = re.compile("^>.*")
    infile = general.open_file(file)
    try:
        lengths = []
        wrap_length = None  # wrapping width, fixed by the first long record
        for line in infile:
            line = line.rstrip()
            if header_pattern.match(line):
                # Check if all but last line are equal length
                if len(lengths) > 2:  # If multiple lines remain to compare
                    if wrap_length is None:
                        wrap_length = lengths[0]  # initialize wrapping length
                    lengths.pop()  # Remove the last sequence line
                    for seq_line in lengths:
                        if seq_line != wrap_length:
                            return False  # mismatched wrapped lines
                lengths = []
            else:
                # Check if all sequence lines are < 80
                if len(line) > 80:  # exit when you hit a sequence line > 80
                    return False
                lengths.append(len(line))
        # End of file: the last record needs the same comparison. It must run
        # whether or not the wrapping length was already known.
        if len(lengths) > 2:
            if wrap_length is None:
                wrap_length = lengths[0]  # initialize wrapping length
            lengths.pop()  # Remove the last sequence line
            for seq_line in lengths:
                if seq_line != wrap_length:
                    return False  # mismatched wrapped lines
        return True
    finally:
        infile.close()
# Example no. 20
# 0
def fix_new_line(fasta_file_name,
                 qc_set_func,
                 checked_qc_set_func,
                 out_dir=None):
    '''
        Strips trailing whitespace (including any '\\n' or '\\r') from each
        line in file and ends each line (including the last line) with a
        new line character ('\\n').

        When 'header_whitespace' is in qc_set_func, each run of whitespace
        in a header line is also replaced with '_' and the output suffix is
        '_ended_h.fasta' instead of '_ended.fasta'.

        Arguments:
            fasta_file_name: path of the FASTA file to repair.
            qc_set_func: set of QC repairs that still have to be run.
            checked_qc_set_func: set of QC checks that still have to be run.
            out_dir: optional output directory; when None the path returned
                by general.parse_filename() is used.

        Returns the tuple (path of the repaired file, qc_set_func without
        'new_line' and 'header_whitespace', checked_qc_set_func without
        'new_line').
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    # make suffix match QC steps taken
    suffix = '_ended_h.fasta' if fix_whitespace else '_ended.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2 only: 'rU' is the universal newlines mode.
        broken_fasta = open(fasta_file_name, 'rU')
    whitespace = re.compile(r'\s+')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if fix_whitespace and line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close both handles even if reading or writing fails.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they are corrected in the final FASTA file.
    remove_set = set(['new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)
def get_count(fasta_file_name):
    '''
        Count the lines and the sequences of a FASTA file.

        Returns the tuple (line_count, header_count). A line is counted
        only when it ends in '\\n'. A sequence is counted for every line
        that starts with '>'.
    '''
    ends_with_newline = re.compile('.*\n')
    is_header = re.compile('^>.*')
    if sys.version_info > (3, 0):
        # Python 3: newline='' keeps the original line endings instead of
        # converting them to '\n'.
        fasta_file = open(fasta_file_name, 'r', newline='')
    else:
        fasta_file = general.open_file(fasta_file_name)
    line_count = header_count = 0
    for raw_line in fasta_file:
        if ends_with_newline.match(raw_line) is not None:
            line_count += 1
        if is_header.match(raw_line.rstrip()) is not None:
            header_count += 1
    fasta_file.close()
    return (line_count, header_count)
def check_unique(fasta_file_name):
    '''
        Check if FASTA headers have unique first words. Returns True if
        every header has a first word and no first word occurs twice.
        Returns False as soon as a repeated first word or a header without
        a first word (a '>' followed directly by whitespace or nothing) is
        found.
    '''
    first_word = re.compile(r'^>(\S+)')  # grab first word in description
    first_word_set = set()
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            line = line.rstrip()
            if not line.startswith('>'):
                continue
            found = first_word.match(line)
            if found is None:
                return False  # Blank headers can't pass a test for uniqueness
            current_word = found.group(1)
            if current_word in first_word_set:
                return False  # you have seen this first word before!
            first_word_set.add(current_word)
        return True
    finally:
        infile.close()  # previously the file was never closed
# Example no. 23
# 0
def check_iupac(fasta_file_name):
    '''
        Check if FASTA file contains non-IUPAC characters in sequence lines.
        Returns False if a character outside iupac_set is found and True if
        none are found. The first offending character is logged before
        returning False. Header lines (starting with '>') are not checked.
    '''
    iupac_set = set([
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o',
        'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'A', 'B', 'C', 'D',
        'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
        'T', 'U', 'V', 'W', 'X', 'Y', '-', '*'
    ])
    header_pattern = re.compile('^>.*')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            if header_pattern.match(line):
                continue
            # First character of the line that is not an accepted code.
            offender = next(
                (char for char in line.rstrip() if char not in iupac_set),
                None)
            if offender is not None:
                log.error('\tError: %(char)s in sequence line' %
                          {'char': offender})
                return False
    finally:
        # Release the file on every path, including the early return.
        infile.close()
    return True
# Example no. 24
# 0
def fix_headers(file, out_dir=None):
    """
        Remove white spaces that break Trimmomatic and some other bioinfo
        tools from the headers of a FASTA file. Each run of whitespace in a
        header line (a line starting with '>') is replaced with '_'. Fixed
        FASTA file is saved with the suffix '_h.fasta'.

        Every line loses its trailing whitespace and is written with a
        single trailing newline.

        Arguments:
            file: path of the FASTA file to repair.
            out_dir: optional output directory; when None the path returned
                by general.parse_filename() is used.

        Returns the path of the fixed file.
    """
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + "/" + out_basename + "_h.fasta"
    whitespace = re.compile(r"\s+")
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith(">"):
                    line = whitespace.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            fixed_fasta.close()
    finally:
        # The input is closed even if the output could not be opened.
        broken_fasta.close()
    return file_with_header
# Example no. 25
# 0
def check_unique(fasta_file_name):
    '''
        Check if FASTA headers have unique first words. Returns True if
        header first words are unique. Returns False if a first word is
        repeated or if a header has no first word at all.

        The first word is the run of non-whitespace characters directly
        after the '>'.
    '''
    header_pattern = re.compile('^>.*')
    word_pattern = re.compile(r'^>(\S+)')
    seen_words = set()
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            line = line.rstrip()
            if header_pattern.match(line):
                word = word_pattern.match(line)  # matched once per header
                if not word:
                    # Blank headers can't pass a test for uniqueness
                    return False
                size_before = len(seen_words)
                seen_words.add(word.group(1))
                if len(seen_words) == size_before:
                    return False  # you have seen this first word before!
    finally:
        infile.close()  # runs for early returns as well
    return True
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Strips trailing whitespace (including any '\\n' or '\\r') from each
        line in file and ends each line (including the last line) with a
        new line character ('\\n'). Output is saved with the suffix
        '_ended.fasta'.

        If 'header_whitespace' is in qc_set_func the suffix is
        '_ended_h.fasta' and each run of whitespace in a header line is
        replaced with '_'.

        Arguments:
            fasta_file_name: path of the FASTA file to repair.
            qc_set_func: set of pending QC repairs.
            checked_qc_set_func: set of pending QC checks.
            out_dir: optional directory for the output file.

        Returns (output path, qc_set_func, checked_qc_set_func) with
        'new_line' and 'header_whitespace' removed from the repairs and
        'new_line' removed from the checks.
    '''
    repair_headers = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if repair_headers:
        suffix = '_ended_h.fasta'  # make suffix match QC steps taken
    out_path, out_basename, _out_ext = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    header_pattern = re.compile('^>.*')
    whitespace_run = re.compile(r'\s+')
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        broken_fasta = open(fasta_file_name, 'rU')  # Python 2 only branch
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                cleaned = line.rstrip()
                if repair_headers and header_pattern.match(cleaned):
                    cleaned = whitespace_run.sub('_', cleaned)
                fixed_fasta.write(cleaned + '\n')
        finally:
            fixed_fasta.close()
    finally:
        # Neither handle is left open when the copy fails part way through.
        broken_fasta.close()
    # Remove both qc steps because they are corrected in the final FASTA file.
    remove_set = set(['new_line', 'header_whitespace'])
    checked_remove_set = set(['new_line'])
    return (file_with_new_line,
            qc_set_func.difference(remove_set),  # skip finished repairs
            checked_qc_set_func.difference(checked_remove_set))  # skip finished checks
def fix_headers(file, out_dir=None):
    '''
        Remove white spaces that break Trimmomatic and some other bioinfo
        tools from the headers of a FASTA file. Fixed FASTA file is saved
        with the suffix '_h.fasta'.

        In header lines (lines starting with '>') each run of whitespace
        becomes a single '_'. Sequence lines are only stripped of trailing
        whitespace. Every output line ends with a newline.

        Arguments:
            file: path of the FASTA file to repair.
            out_dir: optional directory for the output file.

        Returns the path of the fixed file.
    '''
    out_path, out_basename, _out_ext = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    header_pattern = re.compile('^>.*')
    whitespace_run = re.compile(r'\s+')
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                cleaned = line.rstrip()
                if header_pattern.match(cleaned):
                    cleaned = whitespace_run.sub('_', cleaned)
                fixed_fasta.write(cleaned + '\n')
        finally:
            fixed_fasta.close()
    finally:
        # Guarantees the input handle is released on any failure.
        broken_fasta.close()
    return file_with_header
def main():
    '''
        Run full script as opposed to individual script functions.

        Steps performed:
          1. Parse the command-line arguments.
          2. Read the forward (and, unless --single_end, reverse) FASTQ file
             names from the read list, open and close each one as a sanity
             check, and convert each name with general.convert_to_full.
          3. Create the output directory '<out>/<project>' with 'scripts'
             and 'qsubs' sub-directories.
          4. Write one 'scripts/run_trimmomatic_<basename>.sh' per forward
             read file and a 'qsubs/qsub_trimmomatic.sh' holding one qsub
             line per generated script.

        Script files are closed even if an error interrupts the writing.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # NOTE(review): the description text and the -d/--dna option look like
    # leftovers from a template; args.sequence is never read in this
    # function. Left unchanged so the command-line interface stays the same.
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. \
                                     Command-line options that may be omitted \
                                     (i.e. are NOT required) are shown in \
                                     square brackets.')
    parser.add_argument('-v', '--verbose', action='store_true',
                        dest='verbose', help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                        dest='verbose', help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing \
                     output to screen.',action='store_true',dest='colorized')
    parser.add_argument('-r', '--read_list', dest='read_list',
                        help='This is the the full path (path and filename) of \
                        the user provided list of read files. The file should \
                        be tab separated with the first read file, then the \
                        second read file (see example_read_list_PE.tab). If a \
                        sample has multiple fastq files for R1 and R2 separate \
                        these with commas (see example_read_list_PE_multi.tab).\
                        For single end reads each line should be a path \
                        to a fastq file. For single end reads each line should \
                        be a path to a fastq file (see example_read_list_SE.tab\
                        )', required=True)
    parser.add_argument('-p', '--project', dest='project',
                        help='The project id. This will be used to name output \
                        (default=project).', default='project', required=False)
    parser.add_argument('-a', '--adapter', dest='adapter',
                        help='The adapter fasta file. This will be used to \
                        clean reads',default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa', required=False)
    parser.add_argument('-s', '--single_end', action='store_true', dest='single',
                        help='If your reads are single end use this flag. \
                        Without it the script assumes reads are paired end. \
                        Also skip the second column (the reverse fastq files) \
                        when making your read list', required=False,
                        default=False)
    parser.add_argument('-x', '--convert_header', action='store_true',
                        dest='convert_header', help='If the illumina headers \
                        do not end in /1 or /2 use this parameter to indicat \
                        that headers need to be converted. Check your headers \
                        by typing "head FASTA_FULL_PATH" and read more about \
                        illumina headers at \
                        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False, required=False)
    # type=int keeps a user supplied value the same type as the default.
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90, type=int)
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)', required=False,
                        default='~')
    parser.add_argument('-d', '--dna', dest='sequence', help='DNA sequence to \
                        summarize', default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
                        required=False)
    args = parser.parse_args()
    if args.colorized:
        # Imported only for its side effects; presumably it colorizes the
        # log output (TODO confirm against the Colorer module).
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards, reverses) = trimmomatic_template.parse_file(args.read_list,
                                                           args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    for index, forward_fastq in enumerate(forwards):
        # Open and immediately close each file as the sanity check.
        general.open_file(forward_fastq).close()
        forwards[index] = general.convert_to_full(forward_fastq)
        if not args.single:
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, _) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir) # Sanity check directory
    out_dir = out_dir + '/' + args.project # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell pipeline appended after 'cat <fastq>'; it rewrites every first
    # line of each four-line FASTQ record and replaces the third with '+'.
    convert=' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        args.adapter = fasta_o_matic.run_steps(args.adapter,['wrap', 'new_line','header_whitespace'])
        for index, forward_fastq in enumerate(forwards):
            (_, f_basename, _) = general.parse_filename(forward_fastq)
            script_name = (out_dir + '/scripts/run_trimmomatic_' + f_basename
                           + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 '
                              + script_name + '\n')
            if not args.single:
                reverse_fastq = reverses[index]
                (_, r_basename, _) = general.parse_filename(reverse_fastq)
            trim_script = general.open_write_file(script_name)
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
                    trim_script.write('cat ' + forward_fastq + convert
                                      + new_forward_fastq + '\n')
                    # Trimming must read the converted file, not the original.
                    forward_fastq = new_forward_fastq
                    forwards[index] = forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename
                                             + '_h.fastq')
                        trim_script.write('cat ' + reverse_fastq + convert
                                          + new_reverse_fastq + '\n')
                        reverse_fastq = new_reverse_fastq
                        reverses[index] = reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(trimmomatic_template.trim_template(
                        forward_fastq, reverse_fastq, args.adapter, out_dir))
                else:
                    trim_script.write(
                        trimmomatic_template.trim_template_single(forward_fastq))
                    # Section in progress... (Remember to point to a SE adapter fasta file
                    # by default)
            finally:
                # Close the per-sample script even if writing it failed.
                trim_script.close()
    finally:
        qsub_script.close()
# Example no. 29
# 0
def main():
    '''
        Run full script as opposed to individual script functions.

        Steps performed:
          1. Parse the command-line arguments.
          2. Read the forward (and, unless --single_end, reverse) FASTQ file
             names from the read list, open and close each one as a sanity
             check, and convert each name with general.convert_to_full.
          3. Create the output directory '<out>/<project>' with 'scripts'
             and 'qsubs' sub-directories.
          4. Write one 'scripts/run_trimmomatic_<basename>.sh' per forward
             read file and a 'qsubs/qsub_trimmomatic.sh' holding one qsub
             line per generated script.

        Script files are closed even if an error interrupts the writing.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # NOTE(review): the description text and the -d/--dna option look like
    # leftovers from a template; args.sequence is never read in this
    # function. Left unchanged so the command-line interface stays the same.
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. \
                                     Command-line options that may be omitted \
                                     (i.e. are NOT required) are shown in \
                                     square brackets.')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c',
                        '--colorized',
                        help='Colorizes log reports. Use only if printing \
                     output to screen.',
                        action='store_true',
                        dest='colorized')
    parser.add_argument(
        '-r',
        '--read_list',
        dest='read_list',
        help='This is the the full path (path and filename) of \
                        the user provided list of read files. The file should \
                        be tab separated with the first read file, then the \
                        second read file (see example_read_list_PE.tab). If a \
                        sample has multiple fastq files for R1 and R2 separate \
                        these with commas (see example_read_list_PE_multi.tab).\
                        For single end reads each line should be a path \
                        to a fastq file. For single end reads each line should \
                        be a path to a fastq file (see example_read_list_SE.tab\
                        )',
        required=True)
    parser.add_argument(
        '-p',
        '--project',
        dest='project',
        help='The project id. This will be used to name output \
                        (default=project).',
        default='project',
        required=False)
    parser.add_argument(
        '-a',
        '--adapter',
        dest='adapter',
        help='The adapter fasta file. This will be used to \
                        clean reads',
        default=
        '/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
        required=False)
    parser.add_argument('-s',
                        '--single_end',
                        action='store_true',
                        dest='single',
                        help='If your reads are single end use this flag. \
                        Without it the script assumes reads are paired end. \
                        Also skip the second column (the reverse fastq files) \
                        when making your read list',
                        required=False,
                        default=False)
    parser.add_argument('-x',
                        '--convert_header',
                        action='store_true',
                        dest='convert_header',
                        help='If the illumina headers \
                        do not end in /1 or /2 use this parameter to indicat \
                        that headers need to be converted. Check your headers \
                        by typing "head FASTA_FULL_PATH" and read more about \
                        illumina headers at \
                        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False,
                        required=False)
    # type=int keeps a user supplied value the same type as the default.
    parser.add_argument('-m',
                        '--min_read_length',
                        dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False,
                        default=90,
                        type=int)
    parser.add_argument('-o',
                        '--out',
                        dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False,
                        default='~')
    parser.add_argument(
        '-d',
        '--dna',
        dest='sequence',
        help='DNA sequence to \
                        summarize',
        default=
        'TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
        required=False)
    args = parser.parse_args()
    if args.colorized:
        # Imported only for its side effects; presumably it colorizes the
        # log output (TODO confirm against the Colorer module).
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        log.info(
            'Output is verbose. Run with -q, --quiet flag to suppress full output.'
        )
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards,
     reverses) = trimmomatic_template.parse_file(args.read_list, args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    for index, forward_fastq in enumerate(forwards):
        # Open and immediately close each file as the sanity check.
        general.open_file(forward_fastq).close()
        forwards[index] = general.convert_to_full(forward_fastq)
        if not args.single:
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, _) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir)  # Sanity check directory
    out_dir = out_dir + '/' + args.project  # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell pipeline appended after 'cat <fastq>'; it rewrites every first
    # line of each four-line FASTQ record and replaces the third with '+'.
    convert = ' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir +
                                          '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        args.adapter = fasta_o_matic.run_steps(
            args.adapter, ['wrap', 'new_line', 'header_whitespace'])
        for index, forward_fastq in enumerate(forwards):
            (_, f_basename, _) = general.parse_filename(forward_fastq)
            script_name = (out_dir + '/scripts/run_trimmomatic_' +
                           f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' +
                              script_name + '\n')
            if not args.single:
                reverse_fastq = reverses[index]
                (_, r_basename, _) = general.parse_filename(reverse_fastq)
            trim_script = general.open_write_file(script_name)
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = (out_dir + '/' + f_basename +
                                         '_h.fastq')
                    trim_script.write('cat ' + forward_fastq + convert +
                                      new_forward_fastq + '\n')
                    # Trimming must read the converted file, not the original.
                    forward_fastq = new_forward_fastq
                    forwards[index] = forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename +
                                             '_h.fastq')
                        trim_script.write('cat ' + reverse_fastq + convert +
                                          new_reverse_fastq + '\n')
                        reverse_fastq = new_reverse_fastq
                        reverses[index] = reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(
                        trimmomatic_template.trim_template(
                            forward_fastq, reverse_fastq, args.adapter,
                            out_dir))
                else:
                    trim_script.write(
                        trimmomatic_template.trim_template_single(
                            forward_fastq))
                    # Section in progress... (Remember to point to a SE adapter fasta file
                    # by default)
            finally:
                # Close the per-sample script even if writing it failed.
                trim_script.close()
    finally:
        qsub_script.close()