Python open_write_file Beispiele, general.open_write_file Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: fasta_clean.py Projekt: aeron15/read-cleaning-format-conversion

def fix_fasta_file(file, out_dir=None):
    '''
        Passes a file through biopython SeqIO to remove common
        formatting issues like '\\r' characters and unwrapped sequences.

        White space in each record description is replaced with '_' and the
        result becomes the record id and name; the description is emptied.
        The new file is saved with the suffix '_clean.fa' in the directory
        reported by general.parse_filename(), or in out_dir when one is
        given (out_dir is created if it does not exist yet).

        Returns the path of the cleaned file.
        Raises AssertionError when check_header_pattern() rejects the file.
    '''

    # Give up early if the file does not look like fasta. The error is raised
    # explicitly instead of with an assert statement so that the check is
    # still performed when Python runs with -O.
    if not check_header_pattern(file):
        raise AssertionError("Sorry, " + str(file) + " does not look like FASTA to me")

    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # Create the directory without a shell so that names containing
        # spaces or shell metacharacters cannot break (or inject) a command.
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        out_path = out_dir

    fixed_file = out_path + '/' + out_basename + suffix
    out_handle = general.open_write_file(fixed_file)
    try:
        # Iterate through the records to remove white-space
        # from the ID line
        new_records = []
        for record in SeqIO.parse(file, 'fasta'):
            header = re.sub(r'\s+', '_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            new_records.append(record)

        written = SeqIO.write(new_records, out_handle, 'fasta')
    finally:
        # Always close the handle so the output is flushed to disk
        out_handle.close()

    # Parenthesised single argument prints the same text on Python 2 and 3
    print(str(written) + ' sequence records stored in ' + fixed_file)

    return fixed_file

Beispiel #2

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: idworkin/read-cleaning-format-conversion

def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Remove white spaces from the headers of a FASTA file. Fixed FASTA file
        is saved with the suffix '_h.fasta'.

        Every line has its trailing white space stripped and is written back
        terminated by '\\n'. On header lines (lines starting with '>') each
        remaining run of white space is replaced with a single '_'.

        Returns a tuple of the fixed file path, qc_set_func without
        'header_whitespace' and checked_qc_set_func without
        'header_whitespace'. The results come from .difference(), so the
        objects passed in are not modified.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    whitespace = re.compile(r'\s+')
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close both handles even if reading or writing fails part way
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove the qc step because it will be corrected in the final FASTA file
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set) # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(remove_set) # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)

Beispiel #3

0

Datei anzeigen

def fix_headers(fasta_file_name,
                qc_set_func,
                checked_qc_set_func,
                out_dir=None):
    '''
        Remove white spaces from the headers of a FASTA file. Fixed FASTA file
        is saved with the suffix '_h.fasta'.

        All lines are copied with trailing white space stripped and a '\\n'
        appended; lines that start with '>' additionally get every run of
        white space replaced by one '_'.

        Returns (fixed file path, qc_set_func minus 'header_whitespace',
        checked_qc_set_func minus 'header_whitespace'). New objects are
        returned by .difference(); the arguments are left unchanged.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    whitespace = re.compile(r'\s+')
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Release the output handle even when an error interrupts the copy
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove the qc step because it will be corrected in the final FASTA file
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(
        remove_set)  # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)

Beispiel #4

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: kstatebioinfo/stanford_swc

def fix_new_line(file, header_whitespace=False, out_dir=None):
    """
        Strips trailing white space (including any '\\n' or '\\r') from each
        line in file and ends each line (including the last line) with a new
        line character ('\\n').

        When header_whitespace is true, runs of white space inside header
        lines (lines starting with '>') are also replaced with '_' and the
        output suffix is '_ended_h.fasta'; otherwise headers are only
        stripped and the suffix is '_ended.fasta'.

        Returns the path of the fixed file.
    """
    suffix = "_ended.fasta"
    if header_whitespace:
        suffix = "_ended_h.fasta"  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + "/" + out_basename + suffix
    whitespace = re.compile(r"\s+")
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                # Only touch header contents when that repair was requested,
                # so the file contents agree with the suffix chosen above.
                if header_whitespace and line.startswith(">"):
                    line = whitespace.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            # Close both handles even if the copy is interrupted by an error
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_new_line

Beispiel #5

0

Datei anzeigen

def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when 'header_whitespace' is in qc_set_func; in
        that case runs of white space in header lines are also replaced
        with '_'.

        Lines are stripped of trailing white space and every output line
        ends with '\\n'. Sequence is written as soon as a full 60 base line
        is available, so the whole file is never held in memory.

        Returns a tuple of the wrapped file path, qc_set_func without
        'wrap', 'new_line' and 'header_whitespace', and checked_qc_set_func
        without 'wrap'.
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap_h.fasta' if fix_whitespace else '_wrap.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence read but not yet written (always < 60 bases)
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    header = line
                    if fix_whitespace:
                        header = whitespace.sub('_', header)
                    if dna:
                        # print remaining sequence before the new header
                        fixed_fasta.write(dna + '\n')
                    fixed_fasta.write(header + '\n')
                    dna = ''  # Reset DNA
                else:
                    dna += line
                    # Write every complete 60 base line in one pass and slice
                    # the remainder once; re-slicing the buffer after each
                    # line would copy it again and again on a long unwrapped
                    # sequence.
                    full = len(dna) - len(dna) % 60
                    for start in range(0, full, 60):
                        fixed_fasta.write(dna[start:start + 60] + '\n')
                    dna = dna[full:]
            # Catch the tail of the last record
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            # Close both handles even if an error interrupts the rewrite
            infile.close()
    finally:
        fixed_fasta.close()
    # Remove all three qc steps because all will be corrected in the final
    # FASTA file
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)

Beispiel #6

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: idworkin/read-cleaning-format-conversion

def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta'
        ('_wrap_h.fasta' when 'header_whitespace' is in qc_set_func, in which
        case white space runs in header lines are replaced with '_' too).

        Trailing white space is stripped from every input line and every
        output line is terminated with '\\n'. Complete 60 base lines are
        written as soon as they are available.

        Returns (wrapped file path, qc_set_func minus 'wrap', 'new_line' and
        'header_whitespace', checked_qc_set_func minus 'wrap').
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap.fasta'
    if fix_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = '' # pending sequence, shorter than one output line
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'): # Print headers immediately to new file
                    header = line
                    if fix_whitespace:
                        header = whitespace.sub('_', header)
                    if dna:
                        fixed_fasta.write(dna + '\n') # remaining sequence
                        # goes before the header
                    fixed_fasta.write(header + '\n')
                    dna = '' # Reset DNA
                else:
                    dna += line
                    # Emit all complete 60 base lines, then keep the rest.
                    # Slicing the buffer only once per input line keeps a
                    # very long unwrapped sequence from being copied after
                    # every output line.
                    full = len(dna) - len(dna) % 60
                    for start in range(0, full, 60):
                        fixed_fasta.write(dna[start:start + 60] + '\n')
                    dna = dna[full:]
            # Catch the last record
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            # Handles are closed even when the rewrite fails part way
            infile.close()
    finally:
        fixed_fasta.close()
    # Remove all three qc steps because all will be corrected in the final
    # FASTA file
    remove_set = set(['wrap','new_line','header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set) # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(checked_remove_set) # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)

Beispiel #7

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: mckays630/read-cleaning-format-conversion

def fix_wrap(file, header_whitespace=False, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when header_whitespace is true (white space runs
        in header lines are then replaced with '_').

        Records are written one at a time instead of being collected in a
        list first, and sequence is cut into fixed 60 character slices.
        For sequences without white space or '-' this gives the same lines
        as textwrap.fill(dna, 60), but it does not break early at such
        characters.

        A header that is not followed by any sequence is not written, and
        sequence left at the end of a file without any header is dropped.

        Returns the path of the wrapped file.
    '''
    suffix = '_wrap.fasta'
    if header_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')

    def write_record(handle, header, dna):
        # Write one header line followed by its sequence in 60 base lines
        handle.write(header + '\n')
        for start in range(0, len(dna), 60):
            handle.write(dna[start:start + 60] + '\n')

    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(file)
        try:
            header = ''
            dna_parts = [] # sequence lines of the current record
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    dna = ''.join(dna_parts)
                    if dna:
                        write_record(fixed_fasta, header, dna)
                        dna_parts = []
                    header = line
                    if header_whitespace:
                        header = whitespace.sub('_', header)
                else:
                    dna_parts.append(line)

            # Catch the last record
            dna = ''.join(dna_parts)
            if dna and header:
                write_record(fixed_fasta, header, dna)
        finally:
            # Close both handles even if an error interrupts the rewrite
            infile.close()
    finally:
        fixed_fasta.close()

    return file_with_wrapping

Beispiel #8

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: kstatebioinfo/stanford_swc

def fix_wrap(file, header_whitespace=False, out_dir=None):
    """
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when header_whitespace is true (runs of white
        space in header lines are then replaced with '_').

        Every line is read through the same loop, so a header that is the
        last line of the file, or that is directly followed by another
        header, is written on its own instead of raising StopIteration or
        swallowing the next header as sequence. An empty input produces an
        empty output.

        Sequence is cut into fixed 60 character slices. For sequences
        without white space or '-' this gives the same lines as
        textwrap.fill(dna, 60), but it does not break early at such
        characters.

        Returns the path of the wrapped file.
    """
    suffix = "_wrap.fasta"
    if header_whitespace:
        suffix = "_wrap_h.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + "/" + out_basename + suffix
    whitespace = re.compile(r"\s+")

    def write_record(handle, header, dna):
        # Skip the initial state in which nothing has been read yet
        if not header and not dna:
            return
        handle.write(header + "\n")
        # Wrap sequence lines after 60 bases
        for start in range(0, len(dna), 60):
            handle.write(dna[start:start + 60] + "\n")

    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(file)
        try:
            header = ""
            dna_parts = []  # sequence lines of the current record
            for line in infile:
                line = line.rstrip()
                if line.startswith(">"):
                    # A new header ends the previous record
                    write_record(fixed_fasta, header, "".join(dna_parts))
                    header = line
                    if header_whitespace:
                        # Gets rid of whitespace in the headers
                        header = whitespace.sub("_", header)
                    dna_parts = []
                else:
                    dna_parts.append(line)
            # End of file: write the last record
            write_record(fixed_fasta, header, "".join(dna_parts))
        finally:
            # Close both handles even if an error interrupts the rewrite
            infile.close()
    finally:
        fixed_fasta.close()
    return file_with_wrapping

Beispiel #9

0

Datei anzeigen

def fix_new_line(fasta_file_name,
                 qc_set_func,
                 checked_qc_set_func,
                 out_dir=None):
    '''
        Strips trailing white space (including any '\\n' or '\\r') from each
        line in file and ends each line (including the last line) with a new
        line character ('\\n').

        When 'header_whitespace' is in qc_set_func, runs of white space in
        header lines (lines starting with '>') are also replaced with '_'
        and the output suffix is '_ended_h.fasta' instead of '_ended.fasta'.

        Returns a tuple of the fixed file path, qc_set_func without
        'new_line' and 'header_whitespace', and checked_qc_set_func without
        'new_line'.
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if fix_whitespace:
        suffix = '_ended_h.fasta'  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2: 'U' requests universal newline handling when reading
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if fix_whitespace and line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close both handles even if the copy is interrupted by an error
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they will be corrected in the final
    # FASTA file
    remove_set = set(['new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)

Beispiel #10

0

Datei anzeigen

def fix_fasta_file(file, out_dir=None):
    '''
        Passes a file through biopython SeqIO to remove common
        formatting issues like '\\r' characters and unwrapped sequences.

        Each record is renamed to its description with runs of white space
        replaced by '_' (used as both id and name) and its description is
        cleared. The new file is saved with the suffix '_clean.fa', either
        in the directory reported by general.parse_filename() or in out_dir
        when given; out_dir is created when it does not exist.

        Returns the path of the cleaned file.
        Raises AssertionError when check_header_pattern() rejects the file.
    '''

    # Give up early if the file does not look like fasta. Raising explicitly
    # keeps the check active under "python -O", which strips assert
    # statements.
    if not check_header_pattern(file):
        raise AssertionError(
            "Sorry, " + str(file) + " does not look like FASTA to me")

    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # No shell is involved, so directory names with spaces or shell
        # metacharacters are handled safely.
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        out_path = out_dir

    fixed_file = out_path + '/' + out_basename + suffix
    out_handle = general.open_write_file(fixed_file)
    try:
        fasta_in = SeqIO.parse(file, 'fasta')

        # Iterate through the records to remove white-space
        # from the ID line
        new_records = []
        for record in fasta_in:
            header = re.sub(r'\s+', '_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            new_records.append(record)

        written = SeqIO.write(new_records, out_handle, 'fasta')
    finally:
        # Closing flushes the records to disk before the path is returned
        out_handle.close()

    # A single parenthesised argument prints identically on Python 2 and 3
    print(str(written) + ' sequence records stored in ' + fixed_file)

    return fixed_file

Beispiel #11

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: kstatebioinfo/stanford_swc

def fix_headers(file, out_dir=None):
    """
        Remove white spaces that break Trimmomatic and some other bioinfo tools
        from the headers of a FASTA file. Fixed FASTA file is saved with the
        suffix '_h.fasta'.

        Every line is copied with trailing white space stripped and '\\n'
        appended; in lines starting with '>' each remaining run of white
        space is replaced with a single '_'.

        Returns the path of the fixed file.
    """
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + "/" + out_basename + "_h.fasta"
    whitespace = re.compile(r"\s+")
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith(">"):
                    line = whitespace.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            # Close both handles even if the copy is interrupted by an error
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_header

Beispiel #12

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: idworkin/read-cleaning-format-conversion

def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Strips trailing white space (including any '\\n' or '\\r') from each
        line in file and ends each line (including the last line) with a new
        line character ('\\n').

        If 'header_whitespace' is in qc_set_func the output suffix becomes
        '_ended_h.fasta' (default '_ended.fasta') and runs of white space in
        lines starting with '>' are replaced with '_'.

        Returns (fixed file path, qc_set_func minus 'new_line' and
        'header_whitespace', checked_qc_set_func minus 'new_line').
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_ended_h.fasta' if fix_whitespace else '_ended.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2: 'U' requests universal newline handling when reading
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if fix_whitespace and line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Handles are released even when reading or writing fails
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they will be corrected in the final
    # FASTA file
    remove_set = set(['new_line','header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set) # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(checked_remove_set) # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)

Beispiel #13

0

Datei anzeigen

Datei: fasta_o_matic.py Projekt: mckays630/read-cleaning-format-conversion

def fix_headers(file, out_dir=None):
    '''
        Remove white spaces that break Trimmomatic and some other bioinfo tools
        from the headers of a FASTA file. Fixed FASTA file is saved with the
        suffix '_h.fasta'.

        Trailing white space is stripped from every line and each line is
        written back ending in '\\n'. Lines starting with '>' also have
        every remaining run of white space replaced with one '_'.

        Returns the path of the fixed file.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    whitespace = re.compile(r'\s+')
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Release the handles even when an error interrupts the copy
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_header

Beispiel #14

0

Datei anzeigen

Datei: clean_illumina.py Projekt: aeron15/read-cleaning-format-conversion

def main():
    '''
        Run full script as opposed to individual script functions.

        Steps, in order:
          1. Parse the command-line arguments and configure logging.
          2. Read the forward (and, unless --single_end, reverse) read file
             lists, check that every file can be opened and convert each
             entry with general.convert_to_full().
          3. Create <out>/<project> with 'scripts' and 'qsubs'
             sub-directories.
          4. Pass the adapter file through fasta_o_matic.run_steps().
          5. Write one scripts/run_trimmomatic_<basename>.sh per forward
             read file and a qsubs/qsub_trimmomatic.sh containing one qsub
             line per script.

        --min_read_length is parsed as an int so that a user-supplied value
        has the same type as the default.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # NOTE(review): the description text and the -d/--dna option look like
    # left-overs from a script template (nothing in this function reads
    # args.sequence); they are kept unchanged so the command-line interface
    # stays the same -- confirm before removing.
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. \
                                     Command-line options that may be omitted \
                                     (i.e. are NOT required) are shown in \
                                     square brackets.')
    parser.add_argument('-v', '--verbose', action='store_true',
                        dest='verbose', help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                        dest='verbose', help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing \
                     output to screen.',
                        action='store_true', dest='colorized')
    parser.add_argument('-r', '--read_list', dest='read_list',
                        help='This is the the full path (path and filename) of \
                        the user provided list of read files. The file should \
                        be tab separated with the first read file, then the \
                        second read file (see example_read_list_PE.tab). If a \
                        sample has multiple fastq files for R1 and R2 separate \
                        these with commas (see example_read_list_PE_multi.tab).\
                        For single end reads each line should be a path \
                        to a fastq file. For single end reads each line should \
                        be a path to a fastq file (see example_read_list_SE.tab\
                        )', required=True)
    parser.add_argument('-p', '--project', dest='project',
                        help='The project id. This will be used to name output \
                        (default=project).', default='project', required=False)
    parser.add_argument('-a', '--adapter', dest='adapter',
                        help='The adapter fasta file. This will be used to \
                        clean reads',
                        default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
                        required=False)
    parser.add_argument('-s', '--single_end', action='store_true', dest='single',
                        help='If your reads are single end use this flag. \
                        Without it the script assumes reads are paired end. \
                        Also skip the second column (the reverse fastq files) \
                        when making your read list', required=False,
                        default=False)
    parser.add_argument('-x', '--convert_header', action='store_true',
                        dest='convert_header', help='If the illumina headers \
                        do not end in /1 or /2 use this parameter to indicat \
                        that headers need to be converted. Check your headers \
                        by typing "head FASTA_FULL_PATH" and read more about \
                        illumina headers at \
                        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False, required=False)
    # type=int keeps user-supplied values consistent with the int default
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90, type=int)
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)', required=False,
                        default='~')
    parser.add_argument('-d', '--dna', dest='sequence', help='DNA sequence to \
                        summarize', default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
                        required=False)
    args = parser.parse_args()
    if args.colorized:
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards,reverses) = trimmomatic_template.parse_file(args.read_list,
                                                          args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    for index, forward in enumerate(forwards):
        # Opening and closing immediately only proves the file is readable
        general.open_file(forward).close()
        forwards[index] = general.convert_to_full(forward)
        if not args.single:
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path,out_basename,out_ext)=general.parse_filename(args.out)
    out_dir=out_path + '/' + out_basename
    general.path_check(out_dir) # Sanity check directory
    out_dir= out_dir + '/' + args.project # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell snippet appended after 'cat <fastq>'; the awk program rewrites
    # the first line of every four-line FASTQ record and prints '+' for the
    # third line, redirecting the result to the file name appended after it.
    convert=' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        args.adapter = fasta_o_matic.run_steps(args.adapter,['wrap', 'new_line','header_whitespace'])
        for index, forward in enumerate(forwards):
            (f_path,f_basename,f_ext)=general.parse_filename(forward)
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 '+ out_dir
                              + '/scripts/run_trimmomatic_' + f_basename + '.sh\n' )
            if not args.single:
                (r_path,r_basename,r_ext)=general.parse_filename(reverses[index])
            trim_script = general.open_write_file(out_dir
                                                  + '/scripts/run_trimmomatic_'
                                                  + f_basename + '.sh')
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
                    trim_script.write('cat ' + forwards[index] + convert
                                      + new_forward_fastq + '\n')
                    # Later steps must read the converted file, not the input
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = out_dir + '/' + r_basename + '_h.fastq'
                        trim_script.write('cat ' + reverses[index] + convert
                                          + new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(trimmomatic_template.trim_template(
                        forwards[index],
                        reverses[index],
                        args.adapter,
                        out_dir))
                else:
                    trim_script.write(trimmomatic_template.trim_template_single(forwards[index]))
                    # Section in progress... (Remember to point to a SE adapter fasta file
                    # by default)
            finally:
                # Close the per-sample script even if writing it fails
                trim_script.close()
    finally:
        qsub_script.close()

Beispiel #15

0

Datei anzeigen

def main():
    '''
        Run full script as opposed to individual script functions.

        Parses the command-line options, sanity checks the listed FASTQ
        files, creates the '<out>/<project>' directory with 'scripts' and
        'qsubs' sub-directories, and then writes:
          * one 'scripts/run_trimmomatic_<basename>.sh' per forward read
            file (optional header conversion followed by the trimming
            command produced by trimmomatic_template), and
          * 'qsubs/qsub_trimmomatic.sh', holding one qsub line per script.

        The '--dna' option is accepted but not used anywhere in this
        function, and '--min_read_length' is only echoed to stdout.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. \
                                     Command-line options that may be omitted \
                                     (i.e. are NOT required) are shown in \
                                     square brackets.')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c',
                        '--colorized',
                        help='Colorizes log reports. Use only if printing \
                     output to screen.',
                        action='store_true',
                        dest='colorized')
    parser.add_argument(
        '-r',
        '--read_list',
        dest='read_list',
        help='This is the the full path (path and filename) of \
                        the user provided list of read files. The file should \
                        be tab separated with the first read file, then the \
                        second read file (see example_read_list_PE.tab). If a \
                        sample has multiple fastq files for R1 and R2 separate \
                        these with commas (see example_read_list_PE_multi.tab).\
                        For single end reads each line should be a path \
                        to a fastq file. For single end reads each line should \
                        be a path to a fastq file (see example_read_list_SE.tab\
                        )',
        required=True)
    parser.add_argument(
        '-p',
        '--project',
        dest='project',
        help='The project id. This will be used to name output \
                        (default=project).',
        default='project',
        required=False)
    parser.add_argument(
        '-a',
        '--adapter',
        dest='adapter',
        help='The adapter fasta file. This will be used to \
                        clean reads',
        default=
        '/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
        required=False)
    parser.add_argument('-s',
                        '--single_end',
                        action='store_true',
                        dest='single',
                        help='If your reads are single end use this flag. \
                        Without it the script assumes reads are paired end. \
                        Also skip the second column (the reverse fastq files) \
                        when making your read list',
                        required=False,
                        default=False)
    parser.add_argument('-x',
                        '--convert_header',
                        action='store_true',
                        dest='convert_header',
                        help='If the illumina headers \
                        do not end in /1 or /2 use this parameter to indicat \
                        that headers need to be converted. Check your headers \
                        by typing "head FASTA_FULL_PATH" and read more about \
                        illumina headers at \
                        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False,
                        required=False)
    # type=int keeps a user-supplied value the same type as the integer
    # default (argparse would otherwise hand back a string).
    parser.add_argument('-m',
                        '--min_read_length',
                        dest='min_read_length',
                        type=int,
                        help='The minimum read length in bp. (Default = 90).',
                        required=False,
                        default=90)
    # NOTE(review): '~' is passed on unexpanded; presumably
    # general.parse_filename/path_check resolve it -- confirm.
    parser.add_argument('-o',
                        '--out',
                        dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False,
                        default='~')
    parser.add_argument(
        '-d',
        '--dna',
        dest='sequence',
        help='DNA sequence to \
                        summarize',
        default=
        'TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
        required=False)
    args = parser.parse_args()
    if args.colorized:
        # Imported only for its side effects; presumably it patches the
        # logging output with colors -- confirm against the Colorer module.
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        log.info(
            'Output is verbose. Run with -q, --quiet flag to suppress full output.'
        )
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards,
     reverses) = trimmomatic_template.parse_file(args.read_list, args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    for index, forward_fastq in enumerate(forwards):
        # Open and immediately close each file as a sanity check, then
        # replace the list entry with the result of convert_to_full.
        general.open_file(forward_fastq).close()
        forwards[index] = general.convert_to_full(forward_fastq)
        if not args.single:
            # Indexed access: a reverse list shorter than the forward list
            # still fails loudly (IndexError) instead of being truncated.
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, _) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir)  # Sanity check directory
    out_dir = out_dir + '/' + args.project  # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell fragment appended after 'cat <fastq>': an awk program that
    # rewrites every fourth line starting at line 1 (the header line),
    # prints a bare '+' for every fourth line starting at line 3, and
    # passes other lines through; the output is redirected to the file
    # name appended after it.
    convert = ' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir +
                                          '/qsubs/qsub_trimmomatic.sh')
    # try/finally guarantees the handles are closed even when one of the
    # helper calls below raises part-way through.
    try:
        qsub_script.write('#!/bin/bash\n')
        args.adapter = fasta_o_matic.run_steps(
            args.adapter, ['wrap', 'new_line', 'header_whitespace'])
        for index, forward_fastq in enumerate(forwards):
            (_, f_basename, _) = general.parse_filename(forward_fastq)
            script_path = (out_dir + '/scripts/run_trimmomatic_' +
                           f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' +
                              script_path + '\n')
            if not args.single:
                (_, r_basename, _) = general.parse_filename(reverses[index])
            trim_script = general.open_write_file(script_path)
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
                    trim_script.write('cat ' + forward_fastq + convert +
                                      new_forward_fastq + '\n')
                    # From here on the trimming step must read the
                    # converted file rather than the original one.
                    forward_fastq = new_forward_fastq
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename +
                                             '_h.fastq')
                        trim_script.write('cat ' + reverses[index] + convert +
                                          new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(
                        trimmomatic_template.trim_template(forward_fastq,
                                                           reverses[index],
                                                           args.adapter,
                                                           out_dir))
                else:
                    trim_script.write(
                        trimmomatic_template.trim_template_single(
                            forward_fastq))
                    # Section in progress... (Remember to point to a SE
                    # adapter fasta file by default)
            finally:
                trim_script.close()
    finally:
        qsub_script.close()