Ejemplo n.º 1
0
def main(name_space):
    """Re-read a FASTA file through ``File_IO`` and save it under a new name.

    The input is loaded with ``File_IO.read_file`` and
    ``File_IO.read_fasta_multiline`` (using the ``-head`` symbol) and written
    with ``File_IO.write_seqs``. When ``-o`` is omitted the output is named
    ``corrected_<input file name>`` and placed next to the input file.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import os
    import textwrap

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog = 'correct_fasta')
    parser.add_argument("-i", "--input", help="FASTA file need to be fixed")
    parser.add_argument("-o", "--output", help="Name of the output FASTA file")
    parser.add_argument('-head', default='>',help='Specify a head symbol if not >')
    args = parser.parse_args(name_space)

    file_origin = args.input
    if args.output:
        file_corrected = args.output
    elif file_origin:
        # Prefix only the file name, so "data/x.fasta" becomes
        # "data/corrected_x.fasta" rather than "corrected_data/x.fasta".
        folder, base_name = os.path.split(file_origin)
        file_corrected = os.path.join(folder, 'corrected_' + base_name)
    else:
        # Neither name is known: there is nothing to derive the output from.
        parser.error('an input FASTA file (-i) is required when no output name (-o) is given')
    head = args.head

    # Project module is loaded only once the arguments are usable.
    from lib import File_IO

    fasta_corrected = File_IO.read_fasta_multiline(File_IO.read_file(file_origin), head_symbol=head)
    count = File_IO.write_seqs(fasta_corrected, file_corrected, checker=False, overwrite=True)

    # Parenthesised single-argument print works on Python 2 and Python 3.
    print('Checked %d sequences in %s and saved in %s.' % (count, file_origin, file_corrected))
Ejemplo n.º 2
0
def main(name_space):
    """Drop records whose header contains 'unidentified' from a FASTA database.

    Records are read with ``File_IO.read_seqs``; every record whose first
    field does not contain the substring ``'unidentified'`` is kept and the
    kept records are written with ``File_IO.write_seqs``.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import textwrap
    from lib import File_IO

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog = '-filter_database')

    parser.add_argument("-i", "--input", help="Name of the input FASTA file.")
    parser.add_argument("-o", "--output", help="Name of the output FASTA file")
    args = parser.parse_args(name_space)

    # Announce the read before it starts; the original printed this afterwards.
    print("Reading in %s ..." % args.input)
    database = File_IO.read_seqs(args.input)
    count = len(database)
    print("%s contains %i records." % (args.input, count))

    # Keep only records without the 'unidentified' taxonomic marker.
    database_cleaned = [record for record in database
                        if 'unidentified' not in record[0]]
    count_filter = len(database_cleaned)

    print("%i records contain 'unidentified' string." % (count - count_filter))

    count_write = File_IO.write_seqs(database_cleaned, args.output)
    print("Filtered database is saved in %s with %i records." % (args.output, count_write))
Ejemplo n.º 3
0
def main(Namespace):
    """Merge every sequence file found in a folder into one output file.

    Files are listed with ``File_IO.file_list``, sorted by name, read with
    ``File_IO.read_seqs`` and appended to the output via
    ``File_IO.write_seqs(..., overwrite=False)``. An existing output file is
    removed only after the user confirms with ``y``/``Y``.

    Args:
        Namespace: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import sys
    import os
    import time
    import textwrap

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -merge_seqs')
    parser.add_argument('-i', '--input', help='Name of the input folder containing files to be merged')
    parser.add_argument('-o', '--output', help='Name of the merged file')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-fasta', action='store_true', help='Set the file type to FASTA')
    group.add_argument('-fastq', action='store_true', help='Set the file type to FASTQ, this is the default option')
    args = parser.parse_args(Namespace)

    input_folder = args.input
    if not input_folder:
        print('please specified an input folder')
        sys.exit()
    output_file = args.output
    if not output_file:
        print('Please specified an output file.')
        sys.exit()

    if os.path.isfile(output_file):
        # raw_input exists only on Python 2; input is its Python 3 equivalent.
        try:
            ask = raw_input
        except NameError:
            ask = input
        file_size = round(os.path.getsize(output_file)/1024**2, 0)
        exist = ask('%s (%d MB)already exists , do you want to overwrite it? [y/n]' % (output_file, file_size))
        if exist == 'y' or exist == 'Y':
            os.remove(output_file)
        else:
            print('Program stopped.')
            sys.exit()

    file_type = 'fasta' if args.fasta else 'fastq'

    # Project module is loaded only once the arguments are usable.
    from lib import File_IO

    start = time.time()
    f_list = File_IO.file_list(input_folder)
    f_list.sort()
    print('Found %i files in the folder %s' % (len(f_list), input_folder))
    count_total = 0
    for n, seq_file in enumerate(f_list, 1):
        current_file = os.path.join(input_folder, seq_file)
        # overwrite=False so each file is added to what is already written.
        count = File_IO.write_seqs(File_IO.read_seqs(current_file, file_type), output_file, checker=False, overwrite=False)
        print('%d. Merged %d sequences from %s into the new file.' % (n, count, seq_file))
        count_total += count
    end = time.time()
    used_time = round(end-start, 2)
    print('Spent %s sec to merge %d records in %d files into %s' % (str(used_time), count_total, len(f_list), output_file))
Ejemplo n.º 4
0
def main(name_space):
    """Randomly subsample a Read1 file and, optionally, its paired Read2 file.

    One set of random indices from ``rs.generate_random_index`` is applied to
    both files so the same record positions are picked from each. Output goes
    to ``R1.<type>`` / ``R2.<type>`` in the working directory, where the type
    is ``fastq`` when the first Read1 record has ``'+'`` as its third field
    and ``fasta`` otherwise.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    from lib import random_subsample as rs
    from lib import File_IO
    import argparse
    import textwrap
    import sys
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog='fast.py -random_subsample')
    parser.add_argument('-r1', help='Name of the Read1 file.')
    parser.add_argument('-r2', help='Name of the Read2 file if applicable.')
    parser.add_argument('-size', default = 10000, help='Sampling size for each file, default=10,000.')

    args = parser.parse_args(name_space)

    read1 = args.r1
    read2 = args.r2
    sample_size = int(args.size)

    read1_content = File_IO.read_seqs(read1)
    total_size = len(read1_content)

    # Check the record length before indexing: a short (FASTA) record has no
    # third field, and an empty file has no first record at all.
    file_type = "fasta"
    if read1_content and len(read1_content[0]) > 2 and read1_content[0][2] == "+":
        file_type = "fastq"

    if sample_size > total_size:
        print('The specified sampling size is larger than the total number of sequences.')
        sys.exit()
    seq_index = rs.generate_random_index(total_size, sample_size)

    # Get sequences in read1 file
    read1_picked = [read1_content[index] for index in seq_index]

    # Pick from the read2 file if its filename is specified
    if read2:
        read2_content = File_IO.read_seqs(read2)
        read2_picked = [read2_content[index] for index in seq_index]

    # write to new files
    read1_output = "R1."+file_type
    read1_count = File_IO.write_seqs(read1_picked, read1_output, checker=False, overwrite=True)
    print('{0} sequences have been randomly picked from {1}, and saved in {2}.'.format(read1_count, read1, read1_output))
    if read2:
        read2_output = "R2."+file_type
        read2_count = File_IO.write_seqs(read2_picked, read2_output, checker=False, overwrite=True)
        print('{0} sequences have been randomly picked from {1}, and saved in {2}.'.format(read2_count, read2, read2_output))
Ejemplo n.º 5
0
def main(name_space):
    """Split a merged sequence file into one file per sample.

    The sample name of a record is the part of its header (first field)
    before the first ``'_'``. Records are grouped by sample name and each
    group is written to ``<output>/<sample>_<R1|R2>.fastq``.

    Args:
        name_space: List of command-line argument strings for ``argparse``.

    Raises:
        ValueError: If a record header contains no ``'_'``.
    """
    import argparse
    import textwrap
    import os
    import sys

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
        prog='fast.py -add_labels')
    parser.add_argument(
        "-i",
        "--input",
        help="Name of the input file, merging from multiple sample sequences.")
    parser.add_argument("-o", "--output", help="Name of the output folder.")
    parser.add_argument("-r",
                        "--read",
                        choices=['r1', 'read1', 'r2', 'read2'],
                        help="Read direction, read1 or read2.")
    args = parser.parse_args(name_space)

    input_file = args.input
    output_folder = args.output
    read_type = args.read

    if read_type in ('r1', 'read1'):
        read_type = 'R1'
    elif read_type in ('r2', 'read2'):
        read_type = 'R2'
    else:
        print('Please specify the correct read type using the -r option.')
        sys.exit()

    # os.makedirs(None) raises TypeError, so report a missing -o the same
    # way a missing -r is reported.
    if not output_folder:
        print('Please specify an output folder using the -o option.')
        sys.exit()

    # Project module is loaded only once the arguments are usable.
    from lib import File_IO

    os.makedirs(output_folder, exist_ok=True)
    input_seqs = File_IO.read_seqs(input_file)

    # Group records by the header prefix before the first underscore.
    output_records = {}
    for record in input_seqs:
        header = record[0]
        sample_name = header[0:header.index('_')]
        output_records.setdefault(sample_name, []).append(record)

    for key, value in output_records.items():
        output_file = output_folder + '/' + key + '_' + read_type + '.fastq'
        File_IO.write_seqs(value, output_file, checker=False, overwrite=True)
Ejemplo n.º 6
0
def main(name_space):
    """Convert a FASTQ file to FASTA and report the conversion speed.

    The input is read with ``File_IO.read_seqs(..., file_type='fastq',
    output='fasta')`` and written with ``File_IO.write_seqs``.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import textwrap
    import time
    import sys

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''))
    parser.add_argument("-i", "--input", help="Convert a FASTQ file.")
    parser.add_argument("-o", "--output", help="Name of the output FASTA file")
    args = parser.parse_args(name_space)

    fasta_file = args.output
    fastq_file = args.input

    if not fastq_file:
        print("Please specify a FASTQ file.")
        sys.exit()

    # Project module is loaded only once the arguments are usable.
    from lib import File_IO

    start = time.time()
    print("Loading %s ..." % fastq_file)
    fasta_content = File_IO.read_seqs(fastq_file,
                                      file_type='fastq',
                                      output='fasta')
    print('Converting to FASTA ...')
    record_num = File_IO.write_seqs(fasta_content,
                                    fasta_file,
                                    checker=False,
                                    overwrite=True)
    print("Converted %d records in %s ..." % (record_num, fastq_file))

    end = time.time()
    used_time = round(end - start, 2)
    # A small file can finish in under 0.005 s, which rounds to 0.0 and
    # would raise ZeroDivisionError in the rate calculation.
    if used_time > 0:
        rate = str(round(record_num / used_time, 0))
    else:
        rate = 'n/a'
    print(
        "It took %s sec to convert (%s seqs/s).\nFASTA file saved in %s." %
        (str(used_time), rate, fasta_file))
Ejemplo n.º 7
0
def main(name_space):
    """Build a random test dataset from a folder of sequence files.

    ``-file_number`` files are picked at random from the input folder via
    ``rs.generate_random_index``; from each picked file ``-size`` records
    are picked the same way and written under the same file name in the
    output folder.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    from lib import random_subsample as rs
    from lib import File_IO
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog='fast.py -random_subsample')
    parser.add_argument('-i', '--input', help='Name of the input folder with raw data')
    parser.add_argument('-o', '--output', default = 'random_dataset', help='Name of the output folder with raw data')
    parser.add_argument('-file_number', default = 10, help='Number of file to pick.')
    parser.add_argument('-size', default = 10000, help='Sampling size for each file.')

    args = parser.parse_args(name_space)

    input_folder = args.input
    output_folder = args.output

    file_number = int(args.file_number)
    sample_size = int(args.size)

    # Create new folder
    File_IO.mk_dir(output_folder)

    # Randomly pick files to be sampled
    input_file_list = File_IO.file_list(input_folder)
    print('Found {0} files in the folder {1}'.format(len(input_file_list), input_folder))
    file_index = rs.generate_random_index(len(input_file_list), file_number)
    file_list = [input_file_list[index] for index in file_index]

    # Randomly pick sequences from each file
    for raw_file in file_list:
        # end='\r' belongs to print(), not str.format(): the original passed
        # it to format(), where it was silently ignored.
        print('\tRandoming sampling {0} for {1} sequences ...'.format(raw_file, sample_size), end='\r')
        current_content = File_IO.read_seqs(input_folder + '/' + raw_file)
        seq_index = rs.generate_random_index(len(current_content), sample_size)
        sampled_content = [current_content[index] for index in seq_index]

        File_IO.write_seqs(sampled_content, output_folder + '/' + raw_file)

    print('A randomly sampled dataset ({0} files, {1} sequences per file) was generated under the folder {2}'.format(file_number, sample_size, output_folder))
Ejemplo n.º 8
0
def main(name_space):
    """Write one tab-delimited text file per OTU of a FAST map.

    The map is loaded with ``ParseOtuMap.read_fast_output`` and wrapped by
    ``ParseOtuMap.fast_output_parser``. For every unit returned by
    ``get_seqs()`` the rows from ``detail_sample_unit`` are written to
    ``<output>/<unit name>.txt``.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
        prog='fast.py -otu_deconstruct')
    parser.add_argument('-map', help='Name of the FAST-derep map.')
    parser.add_argument('-o',
                        '--output',
                        default='otu_deconstruct',
                        help='Name of the output folder')

    args = parser.parse_args(name_space)

    input_map_file = args.map
    output_folder = args.output

    from lib import ParseOtuMap
    from lib import File_IO

    File_IO.mk_dir(output_folder)

    input_map = ParseOtuMap.read_fast_output(input_map_file)

    input_map = ParseOtuMap.fast_output_parser(input_map)

    input_map_size = input_map.unit_count
    print('{0} contains {1} OTUs.'.format(input_map_file, input_map_size))

    otu_list = input_map.get_seqs()  # get a list of otu with their sequences

    for unit in otu_list:
        output_file = output_folder + '/' + unit[0] + '.txt'
        current_otu = input_map.detail_sample_unit(unit[0])

        # The stray end='\r' keyword was an ignored argument to format().
        print('\tWriting: {0} ...\r'.format(output_file))
        # Text mode: the rows are str, which a binary-mode file rejects on
        # Python 3 with TypeError.
        with open(output_file, 'w') as f:
            for line in current_otu:
                f.write('%s\n' % '\t'.join(str(i) for i in line))

    print('All files wrote to the folder: {0}.'.format(output_folder))
Ejemplo n.º 9
0
def ReLabelFastQ(file_name,
                 label,
                 read_type,
                 input_folder,
                 output_folder='labeled',
                 file_type='fastq',
                 label_type='qiime'):
    """Rename every record of a sequence file and save a labeled copy.

    Each header is replaced by ``ChangeName(label, index, read_type,
    label_type=label_type)``, where ``index`` counts records from 0. The
    result is written to ``<output_folder>/labeled_<file_name>``, one record
    field per line, with the header prefixed by ``'>'`` when records have
    two fields and by ``'@'`` otherwise.

    Args:
        file_name: Name of the file inside ``input_folder``.
        label: Label passed to ``ChangeName``.
        read_type: Read type passed to ``ChangeName``.
        input_folder: Folder containing ``file_name``.
        output_folder: Folder that receives the labeled file.
        file_type: File type forwarded to ``File_IO.read_seqs``.
        label_type: Label style forwarded to ``ChangeName``.

    Returns:
        Number of records relabeled.
    """
    #%% Read in sequence file and change the header
    from lib import File_IO

    file_content = File_IO.read_seqs(input_folder + '/' + file_name,
                                     file_type=file_type)

    # Two-field records get the '>' header symbol. The original used '=='
    # here, a no-op comparison, so the symbol always stayed '@'.
    head_symbol = '@'
    if file_content and len(file_content[0]) == 2:
        head_symbol = '>'

    count = 0
    for record in file_content:  # Loop through header of the records
        record[0] = ChangeName(label, count, read_type, label_type=label_type)
        count += 1

    file_labeled = output_folder + '/labeled_' + file_name
    with open(file_labeled, 'w') as f:
        for record in file_content:
            record[0] = head_symbol + record[0]  # Add head symbol to sequence name
            for line in record:
                f.write('%s\n' % line)
    return count
Ejemplo n.º 10
0
def MainLabelFiles(mapping_file,
                   input_folder,
                   threads=1,
                   output_folder='labeled',
                   file_type='fastq',
                   label_type='both'):
    """Relabel all files named in a mapping file, serially or with workers.

    With ``threads == 1`` each entry from ``ParseMapping`` is handled by
    ``ReLabelFastQ`` in turn. With ``threads > 1`` the mapping is divided by
    ``SplitMapping`` and the workers from ``CreateWorker`` are started and
    then joined. Any other value prints an error and exits.

    Returns:
        Number of files listed in the mapping.
    """
    # Create a new folder for relabeled files
    from lib import File_IO

    File_IO.mk_dir(output_folder)

    single_thread = threads == 1
    if not single_thread and not threads > 1:
        print("The number of threads cannot be negative.")
        import sys

        sys.exit()

    if single_thread:
        print("Relabeling files using %d thread ..." % threads)
        entries = ParseMapping(mapping_file, input_folder)
        file_num = len(entries)
        relabel_options = dict(output_folder=output_folder,
                               file_type=file_type,
                               label_type=label_type)
        for entry in entries:
            relabeled = ReLabelFastQ(entry['file'], entry['label'],
                                     entry['read_type'], entry['input_folder'],
                                     **relabel_options)
            summary = (relabeled, entry['file'], entry['label'],
                       entry['read_type'])
            print("%s sequences in %s relabeled to %s as %s file.\n" % summary)
        return file_num

    print("Relabeling files using %d threads ..." % threads)
    chunks = SplitMapping(mapping_file,
                          input_folder,
                          output_folder=output_folder,
                          file_type=file_type,
                          label_type=label_type,
                          processor=threads)
    file_num = sum([len(chunk) for chunk in chunks])
    workers = CreateWorker(chunks, threads=threads)

    # Start every worker before waiting on any of them
    for job in workers:
        job.start()
    for job in workers:
        job.join()
    return file_num


#%%
Ejemplo n.º 11
0
def main(name_space):
    """Split a merged sequence file into per-sample files.

    Records are grouped by the part of their header (first field) that
    precedes the first ``'_'``; each group is written to
    ``<output>/<sample>_<R1|R2>.fastq``.

    Args:
        name_space: List of command-line argument strings for ``argparse``.

    Raises:
        ValueError: If a record header contains no ``'_'``.
    """
    import argparse
    import textwrap
    import os
    import sys

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog='fast.py -add_labels')
    parser.add_argument("-i", "--input", help="Name of the input file, merging from multiple sample sequences.")
    parser.add_argument("-o", "--output", help="Name of the output folder.")
    parser.add_argument("-r", "--read", choices = ['r1', 'read1', 'r2', 'read2'], help="Read direction, read1 or read2.")
    args = parser.parse_args(name_space)

    input_file = args.input
    output_folder = args.output

    # Map the accepted aliases onto the tag used in output file names.
    read_tags = {'r1': 'R1', 'read1': 'R1', 'r2': 'R2', 'read2': 'R2'}
    read_type = read_tags.get(args.read)
    if read_type is None:
        print('Please specify the correct read type using the -r option.')
        sys.exit()

    # A missing -o would otherwise fail inside os.makedirs with TypeError.
    if not output_folder:
        print('Please specify an output folder using the -o option.')
        sys.exit()

    # Project module is loaded only once the arguments are usable.
    from lib import File_IO

    os.makedirs(output_folder, exist_ok = True)
    input_seqs = File_IO.read_seqs(input_file)

    # Sample name = header text before the first underscore.
    output_records = {}
    for record in input_seqs:
        sample_name = record[0][0:record[0].index('_')]
        output_records.setdefault(sample_name, []).append(record)

    for key, value in output_records.items():
        output_file = output_folder + '/' + key + '_' + read_type + '.fastq'
        File_IO.write_seqs(value, output_file, checker = False, overwrite = True)
Ejemplo n.º 12
0
def read_otu_map(filename):
    """Load a tab-delimited OTU map into a dictionary.

    Every line from ``File_IO.read_file(filename)`` is split on tabs; the
    first field becomes the key and the remaining fields its list value.
    """
    from lib import File_IO

    otu_members = {}
    for raw_line in File_IO.read_file(filename):
        fields = raw_line.strip('\n').split('\t')
        otu_name, members = fields[0], fields[1:]
        otu_members[otu_name] = members
    return otu_members
Ejemplo n.º 13
0
def read_otu_map(filename):
    """Return ``{first field: [other fields]}`` for each tab-split map line.

    Lines come from ``File_IO.read_file(filename)`` and have their newline
    characters stripped from both ends before splitting.
    """
    from lib import File_IO

    rows = (entry.strip('\n').split('\t')
            for entry in File_IO.read_file(filename))
    return {row[0]: row[1:] for row in rows}
Ejemplo n.º 14
0
def main(name_space):
    """Extract selected OTUs from a FAST hybrid map.

    The ``'sample'`` entries of every requested OTU are merged into one
    derep map, written to ``<output>.txt``. A FASTA file ``<output>.fasta``
    is then written with one ``<name>;size=<n>`` record per entry, sorted by
    descending size.

    Args:
        name_space: List of command-line argument strings for ``argparse``.

    Raises:
        KeyError: If a requested OTU name is not in the hybrid map.
    """
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -subset_fast_hybrid')
    parser.add_argument('-i', '--input', help='Input of FAST hybrid map.')
    parser.add_argument('-o', '--output', help='Output prefix for the derep map and sequence')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-otu_list', help='A list of OTU names seperated by ","')
    group.add_argument('-otu_file', help='A file contains a list of OTU names (no header)')
    args = parser.parse_args(name_space)

    # Both output names are built from the prefix; None + str is a TypeError.
    if not args.output:
        parser.error('an output prefix (-o) is required')

    # Project modules are loaded only once the arguments are usable.
    from lib import ParseOtuMap
    from lib import File_IO

    print ('Subtracting a FAST hybrid map with provieded OTU names ...')
    input_file = args.input
    output_derep = args.output + '.txt'
    output_fasta = args.output + '.fasta'

    otu_list = []
    if args.otu_list:
        names = (name.strip() for name in args.otu_list.split(','))
        otu_list = [name for name in names if name]
    elif args.otu_file:
        with open(args.otu_file) as f:
            # Strip the line ending (left in place by the original, so no
            # name matched a map key) and skip blank lines.
            names = (line.strip() for line in f)
            otu_list = [name for name in names if name]
    print ('Found {0} OTU names.'.format(len(otu_list)))

    print ('Reading in the FAST hybrid map: {0} ...'.format(input_file))
    hybrid_map = ParseOtuMap.read_fast_output(input_file)
    fast_derep = {}
    for otu in otu_list:
        fast_derep.update(hybrid_map[otu]['sample'])
    ParseOtuMap.write_fast_output(fast_derep, output_derep)
    print ('A FAST derep map wrote to: {0}.'.format(output_derep))

    derep_seq = []
    for key, value in fast_derep.items():
        derep_size = sum(value['sample'].values())
        seq_label = key + ';size=' + str(derep_size)
        # Size goes first so the sort below orders records by abundance.
        derep_seq.append([derep_size, seq_label, value['seq']])

    derep_seq.sort(reverse=True)
    derep_seq = [i[1:] for i in derep_seq]

    count = File_IO.write_seqs(derep_seq, output_fasta, checker=False)
    print ('A dereplicated FASTA file wrote to {0}, containing {1} sequences with size annotation.'.format(output_fasta, count))
    print ('\n')
Ejemplo n.º 15
0
def main(Namespace):
    """Append ``size=<n>;`` annotations to FASTA headers using an OTU map.

    The size of a sequence is the number of members listed for its header in
    the map from ``ParseOtuMap.read_otu_map``. Annotated records are sorted
    by descending size and written as two-line FASTA. The program exits if a
    header is missing from the map.

    Args:
        Namespace: List of command-line argument strings for ``argparse``.
    """
    from lib import File_IO
    from lib import ParseOtuMap
    import argparse
    import textwrap
    import sys

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -add_seqs_size')
    parser.add_argument('-i', '--input', help='Input FASTA file')
    parser.add_argument('-map', help='Input OTU map file')
    parser.add_argument('-o', '--output', help='Output FASTA file')
    args = parser.parse_args(Namespace)

    input_fasta = args.input
    input_map = args.map
    output_fasta = args.output

    # All print calls take one pre-formatted string so they behave the same
    # on Python 2 and Python 3.
    print('Reading in OTU map ...')
    otu_map = ParseOtuMap.read_otu_map(input_map)
    print('Reaidng in sequence file ...')
    fasta = File_IO.read_seqs(input_fasta)
    print('Found %i OTUs in the map file, found %i sequences in the sequence file.' % (
        len(otu_map), len(fasta)))

    count = 0
    print('Adding size annotation ...')
    for record in fasta:
        header = record[0]
        # Only the map lookup can raise KeyError, so only it is guarded.
        try:
            size = len(otu_map[header])
        except KeyError:
            print("Can not find %s in the OTU map file." % header)
            sys.exit()
        if header[-1] == ';':
            record[0] += ('size=%i;' % size)
        else:
            record[0] += (';size=%i;' % size)
        record.append(size)  # trailing field is the sort key below
        count += 1
        # Progress text without a newline; the backspaces move the cursor
        # back so the next update overwrites it.
        sys.stdout.write('Annotating %i sequence ...' % count + '\b' * 100)
    print('')
    print('Sorting the annotated sequences ...')
    fasta.sort(key=lambda x: x[-1], reverse=True)

    print('Writing to a new FASTA file ...')
    with open(output_fasta, 'w') as f:
        for record in fasta:
            f.write('>%s\n' % record[0])
            f.write('%s\n' % record[1])
    print('Sequences with size annotations saved in %s.' % output_fasta)
Ejemplo n.º 16
0
def main(name_space):
    """Convert a Qiime-style OTU/derep map plus sequences into a FAST map.

    The map is read with ``ParseOtuMap.read_otu_map`` and the sequences with
    ``File_IO.read_seqs``; ``ParseOtuMap.generate_fast_output`` builds the
    FAST map, which ``ParseOtuMap.write_fast_output`` saves. Exactly one of
    ``-derep`` / ``-otu`` must be given to state the source type.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
        prog='fast.py -generate_fast_map')
    parser.add_argument('-map',
                        help='Name of the Qiime style OTU/Derep map file')
    parser.add_argument(
        '-seq',
        help='Name of the sequence file corresponding to the Qiime map.')
    parser.add_argument('-o', '--output', help='Name of the output FAST map.')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-derep',
                       action='store_true',
                       help='Indicate the source is a dereplication map.')
    group.add_argument('-otu',
                       action='store_true',
                       help='Indicate the source is an OTU map.')
    parser.add_argument(
        '-separator',
        default=';',
        help='Set the separator for parsing the sequence label.')

    args = parser.parse_args(name_space)

    input_map_file = args.map
    input_seq_file = args.seq
    output_map_file = args.output
    separator = args.separator

    # With neither flag the original left real_sample unbound and crashed
    # later; report it as a usage error instead.
    if not (args.derep or args.otu):
        parser.error('please indicate the map type with -derep or -otu')
    # The flags are mutually exclusive, so -derep alone decides the value.
    real_sample = args.derep

    from lib import ParseOtuMap
    from lib import File_IO

    input_map = ParseOtuMap.read_otu_map(input_map_file)
    input_seq = File_IO.read_seqs(input_seq_file)

    output_map = ParseOtuMap.generate_fast_output(input_map,
                                                  input_seq,
                                                  real_sample=real_sample,
                                                  separator=separator)

    ParseOtuMap.write_fast_output(output_map, output_map_file)

    print('FAST style map file wrote to %s.' % output_map_file)
Ejemplo n.º 17
0
def main(name_space):
    """Break a FAST map into one tab-delimited text file per OTU.

    After loading the map (``ParseOtuMap.read_fast_output`` then
    ``ParseOtuMap.fast_output_parser``), the rows returned by
    ``detail_sample_unit`` for each unit from ``get_seqs()`` are saved to
    ``<output>/<unit name>.txt``.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog = 'fast.py -otu_deconstruct')
    parser.add_argument('-map', help='Name of the FAST-derep map.')
    parser.add_argument('-o', '--output', default = 'otu_deconstruct', help='Name of the output folder')

    args = parser.parse_args(name_space)

    input_map_file = args.map
    output_folder = args.output

    from lib import ParseOtuMap
    from lib import File_IO

    File_IO.mk_dir(output_folder)

    input_map = ParseOtuMap.read_fast_output(input_map_file)

    input_map = ParseOtuMap.fast_output_parser(input_map)

    input_map_size = input_map.unit_count
    print('{0} contains {1} OTUs.'.format(input_map_file, input_map_size))

    otu_list = input_map.get_seqs() # get a list of otu with their sequences

    for unit in otu_list:
        unit_name = unit[0]
        output_file = output_folder + '/' + unit_name + '.txt'
        current_otu = input_map.detail_sample_unit(unit_name)

        # format() silently ignored the misplaced end='\r' keyword; dropped.
        print('\tWriting: {0} ...\r'.format(output_file))
        # Opened in text mode because str rows cannot be written to a
        # binary-mode file on Python 3.
        with open(output_file, 'w') as f:
            for line in current_otu:
                line = '\t'.join([str(i) for i in line])
                f.write('%s\n' % line)

    print('All files wrote to the folder: {0}.'.format(output_folder))
Ejemplo n.º 18
0
def main(name_space):
    """Convert a FASTQ file into FASTA and print timing information.

    Reads with ``File_IO.read_seqs(..., file_type='fastq', output='fasta')``
    and writes with ``File_IO.write_seqs``; exits with a message when no
    input file is given.

    Args:
        name_space: List of command-line argument strings for ``argparse``.
    """
    import argparse
    import textwrap
    import time
    import sys

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''))
    parser.add_argument("-i", "--input", help="Convert a FASTQ file.")
    parser.add_argument("-o", "--output", help="Name of the output FASTA file")
    args = parser.parse_args(name_space)

    fasta_file = args.output

    if not args.input:
        print("Please specify a FASTQ file.")
        sys.exit()

    # Project module is loaded only once the arguments are usable.
    from lib import File_IO

    fastq_file = args.input
    start = time.time()
    print("Loading %s ..." % fastq_file)
    fasta_content = File_IO.read_seqs(fastq_file, file_type='fastq', output='fasta')
    print('Converting to FASTA ...')
    record_num = File_IO.write_seqs(fasta_content, fasta_file, checker=False, overwrite=True)
    print("Converted %d records in %s ..." % (record_num, fastq_file))

    end = time.time()
    used_time = round(end - start, 2)
    # Rounding can give 0.0 for a very quick run; skip the division then
    # instead of raising ZeroDivisionError.
    speed = str(round(record_num/used_time, 0)) if used_time > 0 else 'n/a'
    print("It took %s sec to convert (%s seqs/s).\nFASTA file saved in %s." % (
        str(used_time), speed, fasta_file))
Ejemplo n.º 19
0
def MainLabelFiles(mapping_file, input_folder, threads=1, output_folder='labeled', file_type='fastq',
                   label_type='both'):
    """Relabel every sequence file listed in a mapping file.

    Args:
        mapping_file: Mapping file handed to ``ParseMapping`` / ``SplitMapping``.
        input_folder: Folder that holds the files to relabel.
        threads: Number of workers. With 1 the files are relabeled in this
            process; with more the mapping is split and handed to the workers
            built by ``CreateWorker``.
        output_folder: Folder created for the relabeled files.
        file_type: Passed through to the relabeling helpers.
        label_type: Passed through to the relabeling helpers.

    Returns:
        The number of entries found in the mapping.

    A thread count below 1 prints a message and leaves through ``sys.exit()``
    before anything is created on disk.
    """
    # Validate first so an invalid call does not leave an empty output folder.
    if threads < 1:
        print("The number of threads must be at least 1.")
        import sys

        sys.exit()

    from lib import File_IO

    # Create a new folder for relabeled files
    File_IO.mk_dir(output_folder)

    if threads == 1:
        print("Relabeling files using %d thread ..." % threads)
        mapping = ParseMapping(mapping_file, input_folder)
        for item in mapping:
            count = ReLabelFastQ(item['file'], item['label'], item['read_type'], item['input_folder'],
                                 output_folder=output_folder, file_type=file_type, label_type=label_type)
            print("%s sequences in %s relabeled to %s as %s file.\n" % (
                count, item['file'], item['label'], item['read_type']))
        return len(mapping)

    print("Relabeling files using %d threads ..." % threads)
    mapping_multithreads = SplitMapping(mapping_file, input_folder, output_folder=output_folder,
                                        file_type=file_type, label_type=label_type, processor=threads)
    file_num = sum(len(chunk) for chunk in mapping_multithreads)
    worker = CreateWorker(mapping_multithreads, threads=threads)

    # Start every worker before waiting on any of them.
    for item in worker:
        item.start()
    for item in worker:
        item.join()
    return file_num

#%%
Ejemplo n.º 20
0
def main(name_space):
    """Count the records of a FASTA or FASTQ file and print the total.

    Args:
        name_space: Command line arguments handed to ``argparse``:
            ``-i/--input`` (sequence file), and optionally ``-a/--fasta`` or
            ``-q/--fastq`` to force the file type.

    Without an explicit type flag the type is detected from the first
    character of the file ('>' for FASTA, '@' for FASTQ); any other character
    prints a message and leaves through ``sys.exit()``.
    """
    import argparse
    import textwrap
    import sys

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog = 'fast.py -count_seqs')
    parser.add_argument("-i", "--input", help="Name of the input sequence file")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-a", "--fasta", action="store_true", help="Set file type to FASTA")
    group.add_argument("-q", "--fastq", action="store_true", help="Set file type to FASTQ")
    args = parser.parse_args(name_space)

    seq_file = args.input

    # The three cases are exclusive: an explicit flag must win over the
    # auto-detection (with a plain ``if`` for FASTQ, --fasta was overridden).
    if args.fasta:
        seq_type = 'fasta'
    elif args.fastq:
        seq_type = 'fastq'
    else:
        # Only the first character is needed, so the default text mode is enough.
        with open(seq_file, 'r') as f:
            header = f.read(1)
        if header == '>':
            seq_type = 'fasta'
            print("File type set as FASTA.")
        elif header == '@':
            seq_type = 'fastq'
            print("File type set as FASTQ.")
        else:
            print('%s is not a valid header for FASTA or FASTQ file.' % header)
            sys.exit()

    # Imported only once the file type is settled.
    from lib import File_IO

    seq_content = File_IO.read_seqs(seq_file, file_type=seq_type)
    seq_count = len(seq_content)
    print("%i records found in %s." % (seq_count, seq_file))
Ejemplo n.º 21
0
def main(name_space):
    """Convert a Qiime style OTU/dereplication map into a FAST style map.

    Args:
        name_space: Command line arguments handed to ``argparse``:
            ``-map`` (Qiime style map), ``-seq`` (matching sequence file),
            ``-o/--output`` (FAST map to write), exactly one of ``-derep`` or
            ``-otu`` (kind of the source map) and ``-separator``.

    The source kind is mandatory; argparse reports a usage error (exit
    status 2) when neither ``-derep`` nor ``-otu`` is given.
    """
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog='fast.py -generate_fast_map')
    parser.add_argument('-map', help='Name of the Qiime style OTU/Derep map file')
    parser.add_argument('-seq', help='Name of the sequence file corresponding to the Qiime map.')
    parser.add_argument('-o', '--output', help='Name of the output FAST map.')
    # required=True: without either flag ``real_sample`` used to stay unbound
    # and the run died with a NameError after the input files had been read.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-derep', action='store_true', help='Indicate the source is a dereplication map.')
    group.add_argument('-otu', action='store_true', help='Indicate the source is an OTU map.')
    parser.add_argument('-separator', default = ';', help='Set the separator for parsing the sequence label.')

    args = parser.parse_args(name_space)

    input_map_file = args.map
    input_seq_file = args.seq
    output_map_file = args.output
    separator = args.separator

    # Exactly one of the two flags is set: -derep means True, -otu means False.
    real_sample = args.derep

    from lib import ParseOtuMap
    from lib import File_IO

    input_map = ParseOtuMap.read_otu_map(input_map_file)
    input_seq = File_IO.read_seqs(input_seq_file)

    output_map = ParseOtuMap.generate_fast_output(input_map, input_seq, real_sample = real_sample, separator=separator)

    ParseOtuMap.write_fast_output(output_map, output_map_file)

    print('FAST style map file wrote to %s.' %output_map_file)
Ejemplo n.º 22
0
def ReLabelFastQ(file_name, label, read_type, input_folder, output_folder='labeled', file_type='fastq',
                 label_type='qiime'):
    """Rewrite the header of every record in a sequence file.

    The file ``input_folder/file_name`` is read with ``File_IO.read_seqs``,
    each record name is replaced by the value of ``ChangeName`` for its
    running index, and the result is written to
    ``output_folder/labeled_<file_name>``.

    Args:
        file_name: Name of the sequence file inside ``input_folder``.
        label: Label passed to ``ChangeName``.
        read_type: Read type passed to ``ChangeName``.
        input_folder: Folder holding the input file.
        output_folder: Folder receiving the relabeled file.
        file_type: File type passed to ``File_IO.read_seqs``.
        label_type: Label style passed to ``ChangeName``.

    Returns:
        The number of relabeled records (0 for an empty file).
    """
    from lib import File_IO

    file_content = File_IO.read_seqs(input_folder + '/' + file_name, file_type=file_type)

    # Records with two fields are written with the FASTA head symbol,
    # everything else with the FASTQ one. The original line used ``==`` here,
    # so the symbol was never switched. The emptiness check avoids an
    # IndexError on a file without records.
    head_symbol = '@'
    if file_content and len(file_content[0]) == 2:
        head_symbol = '>'

    # Replace the names first, so nothing is written if ChangeName fails.
    for index, record in enumerate(file_content):
        record[0] = head_symbol + ChangeName(label, index, read_type, label_type=label_type)

    file_labeled = output_folder + '/labeled_' + file_name
    with open(file_labeled, 'w') as f:
        for record in file_content:
            for line in record:
                f.write('%s\n' % line)
    return len(file_content)
Ejemplo n.º 23
0
def main(name_space):
    """Build a tab-delimited OTU table from a Qiime or FAST style OTU map.

    Args:
        name_space: Command line arguments handed to ``argparse``: exactly
            one of ``-qiime_map`` or ``-fast_map`` (the source map),
            ``-o/--output`` (OTU table to write) and ``-rep`` (file for the
            representative sequences, only used with ``-fast_map``).

    Rows of the table are OTUs ordered by decreasing total abundance, columns
    are the sorted sample names. For a Qiime map the sample of a sequence is
    the part of its name before the first '_'.

    argparse reports a usage error (exit status 2) when no map is given.
    """
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -make_otu_table')
    # required=True: with neither map the code used to fail later on an
    # unbound ``method`` variable.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-qiime_map', help='The Qiime style OTU map.')
    group.add_argument('-fast_map', help='The FAST hybrid OTU map.')
    parser.add_argument('-o', '--output', help='Output OTU table.')
    parser.add_argument(
        '-rep',
        help='Indicate to output a representative sequnce if using FAST method.'
    )
    args = parser.parse_args(name_space)

    # Imported after parsing so usage errors do not depend on the project modules.
    from lib import ParseOtuMap
    from lib import File_IO

    output_file = args.output

    # Parse OTU map into OTU table dictionary
    if args.qiime_map is not None:
        # Use Qiime style
        input_file = args.qiime_map
        print('Reading the Qiime style OTU map: {0} ...'.format(input_file))
        otu_map = ParseOtuMap.read_otu_map(input_file)
        seen_samples = set()
        otu_table_dict = {}
        for key, value in otu_map.items():
            abundance = {}
            for sample in value:
                # split() keeps names without '_' intact; slicing with
                # find() == -1 silently dropped their last character.
                treatment = sample.split('_', 1)[0]
                seen_samples.add(treatment)
                abundance[treatment] = abundance.get(treatment, 0) + 1
            otu_table_dict[key] = abundance
        sample_list = list(seen_samples)
    else:
        # Use FAST style
        input_file = args.fast_map
        print('Reading the FAST hybrid OTU map: {0} ...'.format(input_file))
        otu_map = ParseOtuMap.read_fast_output(input_file)
        otu_map_parser = ParseOtuMap.fast_output_parser(otu_map)
        sample_list, otu_table_dict = otu_map_parser.parse_otu_table()

        if args.rep is not None:
            output_seq_file = args.rep
            # Only the first two fields of every record are written.
            rep_seq = [item[:2] for item in otu_map_parser.get_seqs()]
            rep_seq_count = File_IO.write_seqs(rep_seq,
                                               output_seq_file,
                                               checker=False,
                                               overwrite=True)
            print('{0} OTUs were wrote to {1}.'.format(rep_seq_count,
                                                       output_seq_file))

    # Convert OTU table dictionary to table
    sample_list = sorted(sample_list)
    otu_table = []
    for key, value in otu_table_dict.items():
        current_otu = [key]
        for sample in sample_list:
            try:
                current_otu.append(value[sample])
            except KeyError:
                current_otu.append(0)  # OTU absent from this sample
        otu_table.append(current_otu)
    otu_table.sort(key=lambda x: sum(map(int, x[1:])), reverse=True)
    otu_table = [['OTU_ID'] + sample_list] + otu_table

    # Write OTU table to a new file
    with open(output_file, 'w') as f:
        for line in otu_table:
            f.write('%s\n' % '\t'.join([str(i) for i in line]))

    print('OTU table with {0} samples was saved in {1}.'.format(
        len(sample_list), output_file))
Ejemplo n.º 24
0
def main(name_space):
    """Turn a Qiime or FAST style OTU map into a tab-delimited OTU table.

    Args:
        name_space: Command line arguments handed to ``argparse``: exactly
            one of ``-qiime_map`` or ``-fast_map`` (the source map),
            ``-o/--output`` (OTU table to write) and ``-rep`` (file for the
            representative sequences, only used with ``-fast_map``).

    The table lists one OTU per row, ordered by decreasing total abundance,
    with one column per sample in sorted order. For a Qiime map the sample of
    a sequence is the part of its name before the first '_'.

    argparse reports a usage error (exit status 2) when no map is given.
    """
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -make_otu_table')
    # One of the two maps is mandatory; without it ``method`` was never bound.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-qiime_map', help='The Qiime style OTU map.')
    group.add_argument('-fast_map', help='The FAST hybrid OTU map.')
    parser.add_argument('-o', '--output', help='Output OTU table.')
    parser.add_argument('-rep', help='Indicate to output a representative sequnce if using FAST method.')
    args = parser.parse_args(name_space)

    # Imported after parsing so usage errors do not depend on the project modules.
    from lib import ParseOtuMap
    from lib import File_IO

    output_file = args.output

    # Parse OTU map into OTU table dictionary
    if args.qiime_map is not None:
        # Use Qiime style
        input_file = args.qiime_map
        print('Reading the Qiime style OTU map: {0} ...'.format(input_file))
        otu_map = ParseOtuMap.read_otu_map(input_file)
        sample_names = set()
        otu_table_dict = {}
        for key, value in otu_map.items():
            per_sample = {}
            for sample in value:
                # partition() returns the whole name when there is no '_';
                # sample[:sample.find('_')] cut off the last character then.
                treatment = sample.partition('_')[0]
                sample_names.add(treatment)
                per_sample[treatment] = per_sample.get(treatment, 0) + 1
            otu_table_dict[key] = per_sample
        sample_list = list(sample_names)
    else:
        # Use FAST style
        input_file = args.fast_map
        print('Reading the FAST hybrid OTU map: {0} ...'.format(input_file))
        otu_map = ParseOtuMap.read_fast_output(input_file)
        otu_map_parser = ParseOtuMap.fast_output_parser(otu_map)
        sample_list, otu_table_dict = otu_map_parser.parse_otu_table()

        if args.rep is not None:
            output_seq_file = args.rep
            # Only the first two fields of every record are written.
            rep_seq = [item[:2] for item in otu_map_parser.get_seqs()]
            rep_seq_count = File_IO.write_seqs(rep_seq, output_seq_file, checker=False, overwrite=True)
            print('{0} OTUs were wrote to {1}.'.format(rep_seq_count, output_seq_file))

    # Convert OTU table dictionary to table
    sample_list = sorted(sample_list)
    otu_table = []
    for key, value in otu_table_dict.items():
        current_otu = [key]
        for sample in sample_list:
            try:
                current_otu.append(value[sample])
            except KeyError:
                current_otu.append(0)  # OTU absent from this sample
        otu_table.append(current_otu)
    otu_table.sort(key=lambda x: sum(map(int, x[1:])), reverse=True)
    otu_table = [['OTU_ID'] + sample_list] + otu_table

    # Write OTU table to a new file
    with open(output_file, 'w') as f:
        for line in otu_table:
            f.write('%s\n' % '\t'.join([str(i) for i in line]))

    print('OTU table with {0} samples was saved in {1}.'.format(len(sample_list), output_file))
Ejemplo n.º 25
0
def main(name_space):
    """Pick sequences from a FASTA file by OTU map, name list or at random.

    Args:
        name_space: Command line arguments handed to ``argparse``:
            ``-i/--input`` (FASTA file to pick from), ``-o/--output`` (FASTA
            file to write), exactly one of ``-map``, ``-name_list`` or
            ``-random_pick``, plus the switches ``-sequence`` (pick the member
            sequences of a map instead of its OTU names) and ``-sizeout``
            (append ";size=N" taken from the OTU map).

    argparse reports a usage error (exit status 2) when no picking mode is
    given, or when ``-sizeout`` is combined with a mode that has no OTU sizes
    (``-name_list``, or ``-map`` together with ``-sequence``).
    """
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -pick_seqs')
    parser.add_argument('-i', '--input', help='Input FASTA file to be picked')
    parser.add_argument('-o', '--output', help='Name for output FASTA file.')
    # A picking mode is mandatory; without one the old code iterated over a bool.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-map', help='OTU map file, will pick OTU names in default')
    group.add_argument('-name_list', help='File with names in separated lines')
    group.add_argument('-random_pick', help='Randomly pick the given number of sequences.')
    parser.add_argument('-sequence', action='store_true', help='Indicate to pick by sequence names instead of OTU names')
    parser.add_argument('-sizeout', action='store_true', help='Indicate to output size label.')
    args = parser.parse_args(name_space)

    # The size label is looked up by OTU name in the map; the other name-based
    # modes used to crash on it (unbound otu_map or KeyError).
    if args.sizeout and (args.name_list or (args.map and args.sequence)):
        parser.error('-sizeout can only be used with -map and without -sequence')

    # Imported after parsing so usage errors do not depend on the project modules.
    from lib import random_subsample as rs
    from lib import ParseOtuMap
    from lib import Seq_IO
    from lib import File_IO

    input_fasta = args.input
    output_fasta = args.output

    print('\n')

    # Random picking is selected by the option itself. The old test
    # ``pick_list == []`` never matched, because pick_list started as False.
    if args.random_pick:
        pick_size = int(args.random_pick)
        print('Randomly pick %i sequences.' % (pick_size))
        input_content = File_IO.read_seqs(input_fasta)
        print('Reaing in the original FASTA file: %s ...' % input_fasta)
        print('Randomly sampling %i sequences out of %i ...' %(pick_size, len(input_content)))
        seq_index = rs.generate_random_index(len(input_content), pick_size)
        sampled_content = [input_content[index] for index in seq_index]
        File_IO.write_seqs(sampled_content, output_fasta, checker=False, overwrite=True)
        print('Picked sequences wrote to %s.' % output_fasta)
        return

    pick_list = []
    otu_map = None
    if args.map:
        otu_map = ParseOtuMap.read_otu_map(args.map)
        if args.sequence:
            for value in otu_map.values():
                pick_list += value
        else:
            pick_list = list(otu_map)
        print('Picking sequences from the OTU map: %s.' % (args.map))
        print('Found %i names to be picked.' % len(pick_list))
    else:
        # 'rU' no longer exists in current Python 3; stripping both line-end
        # characters keeps Windows style files working in any mode.
        with open(args.name_list, 'r') as f:
            names = [line.strip('\r\n') for line in f]
        # Blank lines are not names and would only inflate the "not found" count.
        pick_list = [name for name in names if name]
        print('Picking sequences from a OTU list.')
        print('Found %i names to be picked.' % len(pick_list))

    print('Reaing in the original FASTA file: %s ...' % input_fasta)
    input_content = File_IO.read_seqs(input_fasta)
    for record in input_content:
        # Name is cut at the first space, then at the first ";"
        record[0] = record[0].split(' ')[0].split(';')[0]
    print('Indexing the original sequence file ...')
    input_dict = Seq_IO.make_dict(input_content)

    count_picked = 0
    count_missed = 0
    print('Search name list in the sequence file ...')
    picked_content = []

    if args.sizeout:
        print("Output size labels ...")
        # Largest OTUs first
        size_list = sorted(([name, len(otu_map[name])] for name in pick_list),
                           key=lambda x: x[1], reverse=True)
        for name, size in size_list:
            try:
                sequence = input_dict[name][0]
            except KeyError:
                count_missed += 1
                continue
            picked_content.append([name + ';size=' + str(size), sequence])
            count_picked += 1
    else:
        for name in pick_list:
            try:
                sequence = input_dict[name][0]
            except KeyError:
                count_missed += 1
                continue
            picked_content.append([name, sequence])
            count_picked += 1

    print('Finished searching.')
    print('Original sequence=%i' % len(input_content))
    print('Input names=%i' % len(pick_list))
    print('Picked sequences=%i' % count_picked)
    print('Not found sequences=%i' % count_missed)

    print('Writing to a new FASTA file ...')
    File_IO.write_seqs(picked_content, output_fasta, checker=False, overwrite=True)
    print('Picked sequences wrote to %s.' % output_fasta)
Ejemplo n.º 26
0
def main(name_space):
    """Rarefy every sample of an OTU table to a common sampling depth.

    Args:
        name_space: Command line arguments handed to ``argparse``: ``-otu``
            (input table, mandatory), ``-o/--output`` (output table; a name is
            derived with ``File_IO.name_file`` when omitted), ``-d/--depth``
            (sampling depth, default: smallest sample total), ``-iter``
            (rarefactions per sample), ``-thread``, ``-keep_all`` (keep
            samples below the depth) and ``-meta_column``.

    Each sample is rarefied ``-iter`` times; the repeats are sorted by the
    number of OTUs with a non-zero count and the one at position
    ``int(iter / 2)`` is written to the output.

    argparse reports a usage error (exit status 2) when ``-otu`` is missing.
    """
    import argparse
    import textwrap
    import time

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -rarefy_otu_table')
    parser.add_argument('-otu', help='Input OTU table')
    parser.add_argument('-o', '--output', help='Output OTU table')
    parser.add_argument('-d', '--depth', help='Sampling depth for each sample')
    parser.add_argument('-iter',
                        default=1,
                        help='Iteration time for each sample')
    parser.add_argument('-thread', default=1, help='Number of threads')
    parser.add_argument('-keep_all',
                        action='store_true',
                        help='Indicate to keep all samples')
    parser.add_argument('-meta_column',
                        default='taxonomy',
                        help='Name of the first meta data')
    args = parser.parse_args(name_space)

    # Without an input table there is nothing to read or to name the output after.
    if not args.otu:
        parser.error('an input OTU table is required (-otu)')

    # Imported after parsing so usage errors do not depend on the project modules.
    from lib import random_subsample as rs
    from lib import ParseOtuTable
    from lib import File_IO

    input_otu = args.otu
    iter_num = int(args.iter)
    thread = int(args.thread)
    meta_col = args.meta_column
    if args.output:
        output_otu = args.output
    else:
        output_otu = File_IO.name_file(input_otu, '', 'rare')

    otu_table = ParseOtuTable.parser_otu_table(input_otu, meta_col=meta_col)
    input_sample = otu_table.sample_matrix
    start = time.time()

    # Each row is used as [sample name, count, count, ...].
    totals = [sum(row[1:]) for row in input_sample]

    print('Input OTU table: %s' % input_otu)
    if args.depth:
        depth = int(args.depth)
        print('Sampling depth: %i' % depth)
    else:
        # min() of an empty list raises ValueError; a table without samples
        # simply gets depth 0.
        depth = min(totals) if totals else 0
        print('Sampling depth set to the minimum abundance: %i' % depth)
    print('Iteration time for each OTU: %i' % iter_num)
    print('Threads number: %i' % thread)
    print('Reading in the OTU table ...')

    # Report the number of samples found before any of them is dropped.
    found = len(input_sample)
    shallow = sum(1 for total in totals if total < depth)
    print('Found %i samples in the OTU table.' % found)
    if args.keep_all:
        print(
            '%i samples has total abundance less than the sampling depth, but will be kept in the output.'
            % shallow)
    else:
        input_sample = [
            row for row, total in zip(input_sample, totals) if total >= depth
        ]
        print(
            '%i samples has total abundance less than the sampling depth, and will be excluded.'
            % (shallow))

    # One row per sample for now; transposed to one row per OTU further down.
    otu_table_rarefied = [['OTU_ID'] + otu_table.species_id]

    for sample in input_sample:
        print('Rarefying %s ...' % sample[0])
        repeat_sample = rs.repeat_rarefaction_parallel(sample[1:],
                                                       depth,
                                                       iter_num,
                                                       processor=thread)
        # Order the repeats by richness and take the middle one.
        repeat_sample.sort(key=lambda x: sum(i > 0 for i in x))
        otu_table_rarefied.append([sample[0]] +
                                  repeat_sample[int(iter_num / 2)])

    otu_table_rarefied = [list(i) for i in zip(*otu_table_rarefied)]

    # Add meta data
    meta_data = otu_table.meta_dict()
    for line in otu_table_rarefied[1:]:
        for key in otu_table.meta_id:
            line.append(meta_data[key][line[0]])
    for key in otu_table.meta_id:
        otu_table_rarefied[0].append(key)

    with open(output_otu, 'w') as f:
        for line in otu_table_rarefied:
            f.write('%s\n' % '\t'.join([str(i) for i in line]))
    print('Rarefied OTU table saved in %s.' % output_otu)

    used_time = round(float(time.time() - start), 2)
    # All samples may have been excluded; avoid dividing by zero in that case.
    if input_sample:
        time_per_sample = round(used_time / len(input_sample), 2)
    else:
        time_per_sample = 0.0
    print('Total time used: %s seconds (%s seconds per sample)' %
          (str(used_time), str(time_per_sample)))
Ejemplo n.º 27
0
def main(Namespace):
    """Dereplicate a FASTA file and write the unique sequences plus an OTU map.

    Args:
        Namespace: Command line arguments handed to ``argparse``:
            ``-i/--input`` (FASTA file), ``-o/--output`` (base name; ".fasta"
            and ".txt" are appended for the sequence file and the Qiime style
            map), ``-t/--thread`` (number of worker processes), ``-fast``
            (optional FAST style JSON output) and ``-sizeout`` (append
            ";size=N" to the sequence names).

    Dereplicated units are named "derep_<n>" in order of decreasing size.

    argparse reports a usage error (exit status 2) when the input or output
    name is missing or the thread count is below 1.
    """
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -dereplicate')
    parser.add_argument('-i',
                        '--input',
                        help='Input FASTA file to be dereplicated.')
    parser.add_argument('-o',
                        '--output',
                        help='Name for output OTU map and FASTA file.')
    parser.add_argument('-t',
                        '--thread',
                        default=1,
                        help='Number of threads to be used.')
    parser.add_argument('-fast',
                        default="",
                        help="Name of FAST style output file.")
    parser.add_argument(
        '-sizeout',
        action='store_true',
        help='Specify to add a USEARCH style size label: ";size=XXX"')

    args = parser.parse_args(Namespace)

    # Both names are needed below; a missing -o used to end in a TypeError.
    if not args.input or not args.output:
        parser.error('both an input file (-i) and an output name (-o) are required')

    input_file = args.input
    output_name = args.output
    output_map = output_name + '.txt'
    output_fasta = output_name + '.fasta'

    thread = int(args.thread)
    if thread < 1:
        parser.error('the number of threads (-t) must be at least 1')

    import time
    from lib import File_IO
    from multiprocessing import Process, Manager
    import sys

    print('Using %i threads ...' % thread)
    start = time.time()

    print('Loading %s ...' % input_file)
    seqs = File_IO.read_seqs(input_file)
    seqs_num = len(seqs)
    print('Read in %i sequences.' % seqs_num)

    # Disable multiprocess if using single thread
    if thread == 1:
        merged_dict = dereplicate_single_thread(seqs)
        sys.stderr.write('\n')
    else:
        # Separated seqs into pools
        print('Separating raw sequences into %d jobs ...' % thread)
        d = divide_seqs(seqs_num, thread)

        # Create shared list for store dereplicated dict and progress counter
        manager = Manager()
        derep_dict = manager.list([{}] * thread)
        count = manager.list([0] * thread)

        print('Starting dereplicating ...')
        workers = []
        for i in range(thread):
            first, last = d[i][0], d[i][1]
            workers.append(
                Process(target=dereplicate_worker,
                        args=(seqs[first:last], derep_dict, i, count)))
        del seqs  # The workers hold their own slices; free the full list.

        print('Starting %i jobs ...' % thread)
        for number, job in enumerate(workers, 1):
            job.start()
            print('Starting thread No. %i ...' % number)

        # join() blocks until a worker is done, so no polling loop is needed.
        for job in workers:
            job.join()
        print('Finished dereplicating.')

        # Merged dereplicated dictionaries into a single dict
        sys.stderr.write('\n')
        sys.stderr.write('Merging %i dictionaries into one ...' %
                         len(derep_dict))
        merged_dict = {}
        for index in range(thread):
            for key, value in derep_dict[index].items():
                if key in merged_dict:
                    merged_dict[key] += value
                else:
                    merged_dict[key] = value
            # Release the dictionary that was just merged (the original line
            # always cleared index 0).
            derep_dict[index] = {}

    print('')
    print("Sequences dereplicated, clasped from %i into %i sequences." %
          (seqs_num, len(merged_dict)))
    # max()/min() fail on an empty list, so skip the summary for empty input.
    if merged_dict:
        s = [len(names) for names in merged_dict.values()]
        print('Dereplicated OTU size: Max=%i, Min=%i, Average=%i.' %
              (max(s), min(s), round(float(sum(s)) / len(s), 2)))
    end = time.time()
    print("Used time: " + str(end - start) + ' seconds.')
    print('')

    # Name the dereplicated group: [size, sequence, name], largest first
    size_list = sorted([[len(merged_dict[i]), i] for i in merged_dict],
                       reverse=True)
    for index, element in enumerate(size_list):
        element.append('derep_' + str(index))

    # Output dereplicated FASTA file
    print('Writing dereplicated sequence and OTU map ...')
    with open(output_fasta, 'w') as f:
        for size, sequence, derep_name in size_list:
            if args.sizeout:
                output_label = derep_name + ";size=" + str(size)
            else:
                output_label = derep_name
            f.write('>%s\n' % output_label)
            f.write('%s\n' % sequence)

    print('%s contains dereplicated sequences.' % output_fasta)

    # Output Qiime style map
    with open(output_map, 'w') as f:
        for size, sequence, derep_name in size_list:
            f.write('%s\t%s\n' % (derep_name, '\t'.join(merged_dict[sequence])))
    print('%s contains an OTU map for dereplicated sequences.' % output_map)

    # Generate FAST style derep output file (a single file with sample names,
    # counts, and dereplicated sequences)
    if args.fast != "":
        fast_file = args.fast

        fast_dict = {}
        for size, sequence, derep_name in size_list:
            sample_dict = {}  # Sequence count per sample
            for sample in merged_dict[sequence]:
                current_sample = get_treatment(sample)
                sample_dict[current_sample] = sample_dict.get(current_sample, 0) + 1
            fast_dict[derep_name] = {'seq': sequence, 'sample': sample_dict}

        import json
        # json.dump() writes text, so the file must be opened in text mode;
        # the with-block also closes the handle.
        with open(fast_file, 'w') as f:
            json.dump(fast_dict, f)
Ejemplo n.º 28
0
def main(Namespace):
    """Filter sequences by ambiguous-base count and/or homopolymer length.

    Reads the file given by ``-i`` with ``File_IO.read_seqs``, keeps every
    record that is not rejected by the enabled checks and writes the kept
    records to the file given by ``-o`` with ``File_IO.write_seqs``.

    A record is rejected when ``Seq_IO.check_ambiguous(seq, maxN)`` is truthy
    (only if ``-maxN`` was given) or when
    ``Seq_IO.check_homop(seq, maxhomop + 1)`` is truthy (only if ``-maxhomop``
    was given).  When neither option is given every record passes.

    Args:
        Namespace: list of command-line tokens forwarded to
            ``argparse.ArgumentParser.parse_args``.

    Raises:
        SystemExit: with status 1 when the input holds no records or its
            first record has neither 2 fields (FASTA) nor 4 fields (FASTQ).
    """
    from lib import File_IO
    from lib import Seq_IO
    import argparse
    import textwrap
    import sys
    import time

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -filter_seqs')
    parser.add_argument('-i',
                        '--input',
                        help='Name of the input file, can be FASTA or FASTQ')
    parser.add_argument('-o', '--output', help='Name of the output file')
    parser.add_argument('-maxN', help='Number of maximum ambiguous base')
    parser.add_argument('-maxhomop', help='Maximum length of homopolyer')
    args = parser.parse_args(Namespace)

    start = time.time()
    seq_file = args.input
    filtered_file = args.output

    print('Reading in %s ...' % seq_file)
    seqs = File_IO.read_seqs(seq_file)
    count_total = len(seqs)
    print('Found %d sequences.' % count_total)

    # An empty input has no first record to inspect; treat it like a
    # malformed file instead of failing with an IndexError.
    if not seqs or len(seqs[0]) not in (2, 4):
        print(
            'This is not a corerct FASTA or FASTQ file, please check you file.'
        )
        sys.exit(1)

    check_n = bool(args.maxN)
    check_homop = bool(args.maxhomop)
    if check_n:
        maxN = int(args.maxN)
        print('Maximum ambiguous base allowed: %d' % maxN)
    if check_homop:
        maxhomop = int(args.maxhomop)
        print('Maximum length of homopolyer: %d' % maxhomop)

    # One pass for every option combination.  The ambiguous-base check runs
    # first and short-circuits the homopolymer check, as before.
    seqs_filtered = []
    for record in seqs:
        sequence = record[1]
        if check_n and Seq_IO.check_ambiguous(sequence, maxN):
            continue
        if check_homop and Seq_IO.check_homop(sequence, maxhomop + 1):
            continue
        seqs_filtered.append(record)
    count_pass = len(seqs_filtered)

    end = time.time()
    used_time = round(float(end - start), 2)
    print('')

    # Scale to percent before rounding so one decimal place is kept;
    # count_total is non-zero here because empty input exited above.
    percent_pass = round(100.0 * count_pass / count_total, 1)
    print(
        'Filtered %d sequences, %d (%s%%) passed. Used %s seconds.' %
        (count_total, count_pass, str(percent_pass), str(used_time)))

    print('Writing to %s ...' % filtered_file)
    count = File_IO.write_seqs(seqs_filtered,
                               filtered_file,
                               checker=False,
                               overwrite=True)
    print('Filtered sequences (%i seqs) store in %s' % (count, filtered_file))
Ejemplo n.º 29
0
        seqs_divide.append(size)
    seqs_divide[0] += total % thread_num
    return seqs_divide


if __name__ == '__main__':
    import time
    from lib import File_IO
    from multiprocessing import Process, Manager
    import os, sys

    print 'Using %i threads ...' % thread

    input_file = input_file
    print 'Loading %s ...' % input_file
    seqs = File_IO.read_seqs(input_file)
    seqs_num = len(seqs)
    print 'Read in %i sequences.' % seqs_num

    # Separated seqs into pools
    print 'Separating raw sequences into %d jobs ...' % thread
    d = divide_seqs(seqs_num, thread)

    start = time.time()
    # Create shared list for store dereplicated dict and progress counter
    manager = Manager()
    count = manager.list([0] * thread)

    print 'Starting dereplicating ...'
    workers = []
    for i in range(thread):
Ejemplo n.º 30
0
def main(name_space):
    """Randomly subsample one sequence file, or a Read1/Read2 pair.

    Draws ``-size`` record indices with ``rs.generate_random_index`` and
    writes the selected Read1 records to ``R1.<type>``.  When ``-r2`` is given
    the records at the same indices of the Read2 file are written to
    ``R2.<type>``.  ``<type>`` is ``fastq`` when the first Read1 record has
    four fields and ``fasta`` otherwise.

    Args:
        name_space: list of command-line tokens forwarded to
            ``argparse.ArgumentParser.parse_args``.

    Raises:
        SystemExit: with status 1 when Read1 is empty, when the sampling size
            exceeds the number of Read1 records, or when Read2 holds fewer
            records than Read1.
    """
    from lib import random_subsample as rs
    from lib import File_IO
    import argparse
    import textwrap
    import sys
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
        prog='fast.py -random_subsample')
    parser.add_argument('-r1', help='Name of the Read1 file.')
    parser.add_argument('-r2', help='Name of the Read2 file if applicable.')
    parser.add_argument('-size',
                        default=10000,
                        help='Sampling size for each file, default=10,000.')

    args = parser.parse_args(name_space)

    read1 = args.r1
    read2 = args.r2  # None when no Read2 file was given
    sample_size = int(args.size)

    read1_content = File_IO.read_seqs(read1)
    total_size = len(read1_content)

    if total_size == 0:
        print('No sequences were found in {0}.'.format(read1))
        sys.exit(1)

    # Decide the type from the record width (4 fields = FASTQ).  Indexing
    # field 2 directly fails on records that only have a label and a sequence.
    file_type = "fastq" if len(read1_content[0]) == 4 else "fasta"

    if sample_size > total_size:
        print(
            'The specified sampling size is larger than the total number of sequences.'
        )
        sys.exit(1)
    seq_index = rs.generate_random_index(total_size, sample_size)

    # Get sequences in read1 file
    read1_picked = [read1_content[index] for index in seq_index]

    # Pick the same records from the read2 file if its filename is specified
    if read2:
        read2_content = File_IO.read_seqs(read2)
        # The indices were drawn against Read1; a shorter Read2 cannot be
        # indexed safely, so stop before any output is written.
        if len(read2_content) < total_size:
            print('{0} contains fewer sequences than {1}.'.format(
                read2, read1))
            sys.exit(1)
        read2_picked = [read2_content[index] for index in seq_index]

    # write to new files
    read1_output = "R1." + file_type
    read1_count = File_IO.write_seqs(read1_picked,
                                     read1_output,
                                     checker=False,
                                     overwrite=True)
    print(
        '{0} sequences have been randomly picked from {1}, and saved in {2}.'.
        format(read1_count, read1, read1_output))
    if read2:
        read2_output = "R2." + file_type
        read2_count = File_IO.write_seqs(read2_picked,
                                         read2_output,
                                         checker=False,
                                         overwrite=True)
        print(
            '{0} sequences have been randomly picked from {1}, and saved in {2}.'
            .format(read2_count, read2, read2_output))
Ejemplo n.º 31
0
def main(name_space):
    """Truncate or slice every sequence of a FASTA/FASTQ file.

    Two mutually exclusive modes are supported:

    * ``-fixed_length N`` keeps the first ``N`` bases of every sequence that
      is at least ``N`` long; shorter sequences are dropped.
    * ``-slice head,tail`` removes ``head`` bases from the start and ``tail``
      bases from the end of every sequence longer than ``head + tail``;
      shorter sequences are dropped.  With ``-sliced_out`` the removed head
      and tail pieces of all input records are also written, to files named
      ``head.<output>`` and ``tail.<output>`` next to the output file.

    Records with two fields are written as FASTA and records with four fields
    as FASTQ (the quality string is cut with the same bounds as the sequence).

    Args:
        name_space: list of command-line tokens forwarded to
            ``argparse.ArgumentParser.parse_args``.
    """
    import argparse
    import os
    import textwrap
    from lib import File_IO

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
        prog='fast.py -truncate_seqs')

    parser.add_argument("-i", "--input", help="Name of the input FASTA file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-fixed_length',
                       help='A fixed length to cut on all sequences.')
    group.add_argument(
        '-slice',
        help=
        'Slice size to cut from head and tail of each sequence in the format of "head,tail".'
    )
    parser.add_argument('-sliced_out',
                        action='store_true',
                        help='Indicate to output sliced sequences.')
    parser.add_argument("-o", "--output", help="Name of the output file.")
    args = parser.parse_args(name_space)

    def _write_record(handle, record, start, stop):
        """Write record cut to [start:stop] as FASTA (2 fields) or FASTQ (4)."""
        if len(record) == 2:
            handle.write('>%s\n' % record[0])
            handle.write('%s\n' % record[1][start:stop])
        elif len(record) == 4:
            handle.write('@%s\n' % record[0])
            handle.write('%s\n' % record[1][start:stop])
            handle.write('%s\n' % record[2])
            handle.write('%s\n' % record[3][start:stop])

    def _prefixed_path(prefix, path):
        """Prepend prefix to the file name of path, keeping its directory."""
        directory, filename = os.path.split(path)
        return os.path.join(directory, prefix + filename)

    if args.fixed_length:
        truncate_length = int(args.fixed_length)
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("Reading in %s ..." % args.input)
        print("%s contains %i records." % (args.input, count))
        print("Cutting sequences to a fixed length: %i ..." % truncate_length)

        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                if len(record[1]) >= truncate_length:
                    _write_record(f, record, None, truncate_length)
                else:
                    count_fail += 1
        print("%i sequences were cut to %i and save in %s." %
              (count - count_fail, truncate_length, args.output))

    if args.slice:
        slice_window = args.slice.split(',')
        head = int(slice_window[0])
        tail = int(slice_window[1])
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("Reading in %s ..." % args.input)
        print("%s contains %i records." % (args.input, count))
        print("Slicing %i bp from the head and %i bp from the tail ..." %
              (head, tail))

        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                seq_len = len(record[1])
                if seq_len > head + tail:
                    _write_record(f, record, head, seq_len - tail)
                else:
                    count_fail += 1
        print("%i sequences were sliced and save in %s." %
              (count - count_fail, args.output))

        if args.sliced_out:
            if head > 0:
                head_output = _prefixed_path('head.', args.output)
                # Text mode: the records are written as str, not bytes.
                with open(head_output, 'w') as f:
                    for record in sequences:
                        _write_record(f, record, None, head)
                print('The sliced head sequences wrote to %s.' % (head_output))

            if tail > 0:
                tail_output = _prefixed_path('tail.', args.output)
                with open(tail_output, 'w') as f:
                    for record in sequences:
                        # Clamp at 0: for a read shorter than the tail, a
                        # negative start would wrap and select the wrong bases.
                        tail_start = max(len(record[1]) - tail, 0)
                        _write_record(f, record, tail_start, None)
                print('The sliced tail sequences wrote to %s.' % (tail_output))
Ejemplo n.º 32
0
def main(Namespace):
    """Dereplicate a sequence file and write a FASTA file, an OTU map and
    optionally a FAST-style JSON file.

    Sequences are read with ``File_IO.read_seqs`` and grouped into a mapping
    of sequence -> list of labels, either in-process (one thread) or by
    ``thread`` worker processes whose partial dictionaries are merged.  The
    groups are ordered by descending size and named ``derep_0``, ``derep_1``,
    ...  Outputs:

    * ``<output>.fasta``: one record per group, labelled with the group name
      (plus ``;size=N`` when ``-sizeout`` is given).
    * ``<output>.txt``: tab-separated map of group name followed by its
      member labels.
    * the ``-fast`` file (if given): JSON mapping each group name to its
      sequence and to per-sample counts, where the sample of a label is
      ``get_treatment(label)``.

    Relies on ``dereplicate_single_thread``, ``dereplicate_worker``,
    ``divide_seqs`` and ``get_treatment`` being defined elsewhere in this
    module.

    Args:
        Namespace: list of command-line tokens forwarded to
            ``argparse.ArgumentParser.parse_args``.

    Raises:
        SystemExit: via ``parser.error`` when the thread count is below 1.
    """
    import argparse
    import textwrap

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -dereplicate')
    parser.add_argument('-i', '--input', help='Input FASTA file to be dereplicated.')
    parser.add_argument('-o', '--output', help='Name for output OTU map and FASTA file.')
    parser.add_argument('-t', '--thread', default = 1, help='Number of threads to be used.')
    parser.add_argument('-fast', default = "", help="Name of FAST style output file.")
    parser.add_argument('-sizeout', action = 'store_true', help='Specify to add a USEARCH style size label: ";size=XXX"')

    args = parser.parse_args(Namespace)

    input_file = args.input
    output_name = args.output
    output_map = output_name + '.txt'
    output_fasta = output_name + '.fasta'

    thread = int(args.thread)
    if thread < 1:
        # Zero or negative workers would leave nothing to merge.
        parser.error('-t/--thread must be at least 1.')

    import json
    import time
    from lib import File_IO
    from multiprocessing import Process, Manager
    import sys

    print('Using %i threads ...' % thread)
    start = time.time()

    print('Loading %s ...' % input_file)
    seqs = File_IO.read_seqs(input_file)
    seqs_num = len(seqs)
    print('Read in %i sequences.' % seqs_num)

    # Disable multiprocess if using single thread
    if thread == 1:
        derep_dict = dereplicate_single_thread(seqs)
    else:
        # Separated seqs into pools
        print('Separating raw sequences into %d jobs ...' % thread)
        d = divide_seqs(seqs_num, thread)

        # Create shared list for store dereplicated dict and progress counter
        manager = Manager()
        derep_dict = manager.list([{}] * thread)
        count = manager.list([0] * thread)

        print('Starting dereplicating ...')
        workers = []
        for i in range(thread):
            current_range = d[i]
            workers.append(Process(target=dereplicate_worker,
                                   args=(seqs[current_range[0]:current_range[1]], derep_dict, i, count)))
        del seqs  # The workers hold their own slices; free the full list.

        print('Starting %i jobs ...' % thread)
        for worker_number, job in enumerate(workers, 1):
            job.start()
            print('Starting thread No. %i ...' % worker_number)

        # join() blocks until each worker has finished; no polling needed.
        for job in workers:
            job.join()
        print('Finished dereplicating.')

    # Merged dereplicated dictionaries into a single dict
    sys.stderr.write('\n')

    if thread > 1:
        sys.stderr.write('Merging %i dictionaries into one ...' % len(derep_dict))
        merged_dict = {}
        for index in range(len(derep_dict)):
            for key, value in derep_dict[index].items():
                if key in merged_dict:
                    merged_dict[key] += value
                else:
                    merged_dict[key] = value
            # Release the dictionary that was just merged (not always slot 0).
            derep_dict[index] = ''
    else:
        merged_dict = derep_dict
    print('')
    print("Sequences dereplicated, clasped from %i into %i sequences." % (seqs_num, len(merged_dict)))
    s = [len(merged_dict[i]) for i in merged_dict]
    if s:
        # max()/min() fail on an empty list, so skip the statistics when the
        # input held no sequences.  The average is a true float division.
        print('Dereplicated OTU size: Max=%i, Min=%i, Average=%.2f.' % (max(s), min(s), float(sum(s)) / len(s)))
    end = time.time()
    print("Used time: " + str(end - start) + ' seconds.')
    print('')

    # Name the dereplicated groups in order of descending size
    size_list = sorted([[len(merged_dict[i]), i] for i in merged_dict], reverse=True)
    for index, element in enumerate(size_list):
        element.append('derep_' + str(index))

    # Output dereplicated FASTA file
    print('Writing dereplicated sequence and OTU map ...')
    with open(output_fasta, 'w') as f:
        for size, sequence, derep_name in size_list:
            output_label = derep_name
            if args.sizeout:
                output_label += ";size=" + str(size)
            f.write('>%s\n' % output_label)
            f.write('%s\n' % sequence)

    print('%s contains dereplicated sequences.' % output_fasta)

    # Output Qiime style map
    with open(output_map, 'w') as f:
        for size, sequence, derep_name in size_list:
            name_list = merged_dict[sequence]
            f.write('%s\t%s\n' % (derep_name, '\t'.join(name_list)))
    print('%s contains an OTU map for dereplicated sequences.' % output_map)

    # Generate FAST style derep output file (a single file with sample names, counts, and dereplicated sequences)
    if args.fast != "":
        fast_dict = {}
        for size, sequence, derep_name in size_list:
            sample_dict = {}  # Sequence count per sample for this group
            for sample in merged_dict[sequence]:
                current_sample = get_treatment(sample)
                sample_dict[current_sample] = sample_dict.get(current_sample, 0) + 1
            fast_dict[derep_name] = {'seq': sequence, 'sample': sample_dict}

        # json.dump emits str, so the file must be opened in text mode; the
        # with-block also guarantees the handle is flushed and closed.
        with open(args.fast, 'w') as f:
            json.dump(fast_dict, f)
Ejemplo n.º 33
0
def main(Namespace):
    """Filter sequences by ambiguous-base count and/or homopolymer length.

    Reads the file given by ``-i`` with ``File_IO.read_seqs``, keeps every
    record that is not rejected by the enabled checks and writes the kept
    records to the file given by ``-o`` with ``File_IO.write_seqs``.

    A record is rejected when ``Seq_IO.check_ambiguous(seq, maxN)`` is truthy
    (only if ``-maxN`` was given) or when
    ``Seq_IO.check_homop(seq, maxhomop + 1)`` is truthy (only if ``-maxhomop``
    was given).  When neither option is given every record passes.

    Args:
        Namespace: list of command-line tokens forwarded to
            ``argparse.ArgumentParser.parse_args``.

    Raises:
        SystemExit: with status 1 when the input holds no records or its
            first record has neither 2 fields (FASTA) nor 4 fields (FASTQ).
    """
    from lib import File_IO
    from lib import Seq_IO
    import argparse
    import textwrap
    import sys
    import time

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -filter_seqs')
    parser.add_argument('-i', '--input', help='Name of the input file, can be FASTA or FASTQ')
    parser.add_argument('-o', '--output', help='Name of the output file')
    parser.add_argument('-maxN', help='Number of maximum ambiguous base')
    parser.add_argument('-maxhomop', help='Maximum length of homopolyer')
    args = parser.parse_args(Namespace)

    start = time.time()
    seq_file = args.input
    filtered_file = args.output

    print('Reading in %s ...' % seq_file)
    seqs = File_IO.read_seqs(seq_file)
    count_total = len(seqs)
    print('Found %d sequences.' % count_total)

    # An empty input has no first record to inspect; treat it like a
    # malformed file instead of failing with an IndexError.
    if not seqs or len(seqs[0]) not in (2, 4):
        print('This is not a corerct FASTA or FASTQ file, please check you file.')
        sys.exit(1)

    use_max_n = bool(args.maxN)
    use_max_homop = bool(args.maxhomop)
    if use_max_n:
        maxN = int(args.maxN)
        print('Maximum ambiguous base allowed: %d' % maxN)
    if use_max_homop:
        maxhomop = int(args.maxhomop)
        print('Maximum length of homopolyer: %d' % maxhomop)

    # One pass for every option combination.  The ambiguous-base check runs
    # first and short-circuits the homopolymer check, as before.
    seqs_filtered = []
    for record in seqs:
        current_record = record[1]
        if use_max_n and Seq_IO.check_ambiguous(current_record, maxN):
            continue
        if use_max_homop and Seq_IO.check_homop(current_record, maxhomop + 1):
            continue
        seqs_filtered.append(record)
    count_pass = len(seqs_filtered)

    end = time.time()
    used_time = round(float(end - start), 2)
    print('')

    # Scale to percent before rounding so one decimal place is kept;
    # count_total is non-zero here because empty input exited above.
    percent_pass = round(100.0 * count_pass / count_total, 1)
    print('Filtered %d sequences, %d (%s%%) passed. Used %s seconds.' % (
        count_total, count_pass, str(percent_pass), str(used_time)))

    print('Writing to %s ...' % filtered_file)
    count = File_IO.write_seqs(seqs_filtered, filtered_file, checker=False, overwrite=True)
    print('Filtered sequences (%i seqs) store in %s' % (count, filtered_file))
Ejemplo n.º 34
0
def main(name_space):
    """Truncate or slice every sequence of a FASTA/FASTQ file.

    Two mutually exclusive modes are supported:

    * ``-fixed_length N`` keeps the first ``N`` bases of every sequence that
      is at least ``N`` long; shorter sequences are dropped.
    * ``-slice head,tail`` removes ``head`` bases from the start and ``tail``
      bases from the end of every sequence longer than ``head + tail``;
      shorter sequences are dropped.  With ``-sliced_out`` the removed head
      and tail pieces of all input records are also written, to files named
      ``head.<output>`` and ``tail.<output>`` next to the output file.

    Records with two fields are written as FASTA and records with four fields
    as FASTQ (the quality string is cut with the same bounds as the sequence).

    Args:
        name_space: list of command-line tokens forwarded to
            ``argparse.ArgumentParser.parse_args``.
    """
    import argparse
    import os
    import textwrap
    from lib import File_IO

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog = 'fast.py -truncate_seqs')

    parser.add_argument("-i", "--input", help="Name of the input FASTA file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-fixed_length', help='A fixed length to cut on all sequences.')
    group.add_argument('-slice', help='Slice size to cut from head and tail of each sequence in the format of "head,tail".')
    parser.add_argument('-sliced_out', action='store_true', help='Indicate to output sliced sequences.')
    parser.add_argument("-o", "--output", help="Name of the output file.")
    args = parser.parse_args(name_space)

    def _emit(handle, record, start, stop):
        """Write record cut to [start:stop] as FASTA (2 fields) or FASTQ (4)."""
        if len(record) == 2:
            handle.write('>%s\n' % record[0])
            handle.write('%s\n' % record[1][start:stop])
        elif len(record) == 4:
            handle.write('@%s\n' % record[0])
            handle.write('%s\n' % record[1][start:stop])
            handle.write('%s\n' % record[2])
            handle.write('%s\n' % record[3][start:stop])

    def _sibling_path(prefix, path):
        """Prepend prefix to the file name of path, keeping its directory."""
        directory, filename = os.path.split(path)
        return os.path.join(directory, prefix + filename)

    if args.fixed_length:
        truncate_length = int(args.fixed_length)
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("Reading in %s ..." % args.input)
        print("%s contains %i records." % (args.input, count))
        print("Cutting sequences to a fixed length: %i ..." % truncate_length)

        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                if len(record[1]) >= truncate_length:
                    _emit(f, record, None, truncate_length)
                else:
                    count_fail += 1
        print("%i sequences were cut to %i and save in %s." % (count - count_fail, truncate_length, args.output))

    if args.slice:
        slice_window = args.slice.split(',')
        head = int(slice_window[0])
        tail = int(slice_window[1])
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("Reading in %s ..." % args.input)
        print("%s contains %i records." % (args.input, count))
        print("Slicing %i bp from the head and %i bp from the tail ..." % (head, tail))

        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                seq_len = len(record[1])
                if seq_len > head + tail:
                    _emit(f, record, head, seq_len - tail)
                else:
                    count_fail += 1
        print("%i sequences were sliced and save in %s." % (count - count_fail, args.output))

        if args.sliced_out:
            if head > 0:
                head_output = _sibling_path('head.', args.output)
                # Text mode: the records are written as str, not bytes.
                with open(head_output, 'w') as f:
                    for record in sequences:
                        _emit(f, record, None, head)
                print('The sliced head sequences wrote to %s.' % (head_output))

            if tail > 0:
                tail_output = _sibling_path('tail.', args.output)
                with open(tail_output, 'w') as f:
                    for record in sequences:
                        # Clamp at 0: for a read shorter than the tail, a
                        # negative start would wrap and select the wrong bases.
                        tail_start = max(len(record[1]) - tail, 0)
                        _emit(f, record, tail_start, None)
                print('The sliced tail sequences wrote to %s.' % (tail_output))
Ejemplo n.º 35
0
def main(Namespace):
    """Report per-position nucleotide counts of a FASTA/FASTQ file.

    Reads the sequences with ``File_IO.read_seqs``, obtains the per-position
    counts from ``Seq_IO.nucl_freq`` and writes a tab-separated report with
    one row per position: the position (1-based), the counts of A, T, C, G
    and N, the most frequent of those five letters and its share of all
    counts recorded at that position.

    Args:
        Namespace: list of command-line tokens forwarded to
            ``argparse.ArgumentParser.parse_args``.
    """
    import argparse
    import textwrap
    from lib import File_IO
    from lib import Seq_IO

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -nucl_freq')
    parser.add_argument('-i',
                        '--input',
                        help='Name of the input FASTA or FASTQ file.')
    parser.add_argument('-o',
                        '--output',
                        default='nucl_report.txt',
                        help='Name of the reporting file.')
    parser.add_argument(
        '-tail',
        action='store_true',
        help='Indicate to also count from the tail of the sequences.')
    args = parser.parse_args(Namespace)

    input_file = args.input
    output_file = args.output
    tail_indicator = args.tail

    print('Reading in file: {0} ...'.format(input_file))
    input_seq = File_IO.read_seqs(input_file)
    print('The file contains {0} sequences.'.format(len(input_seq)))
    if tail_indicator:
        print(
            'Counting nucleotide frequencies from both ends of all sequences...'
        )
    else:
        print(
            'Counting nucleotide frequencies from the head of all sequences...'
        )

    nucl_freq, unidentified_count = Seq_IO.nucl_freq(input_seq,
                                                     tail=tail_indicator)

    nucl_list = ['A', 'T', 'C', 'G', 'N']

    # Build the report rows
    header = 'Position\tA\tT\tC\tG\tN\tMost frequent\tFrequency'
    output_content = [header]
    for pos in range(len(nucl_freq)):
        counts = nucl_freq[pos]

        # Most frequent nucleotide at this position; ties resolve to the
        # alphabetically last letter, as the former descending sort did.
        top_count, most_freq_nucl = max(
            (counts[nucl], nucl) for nucl in nucl_list)
        sum_nucl_count = sum(counts.values())
        # Guard against a position with no counts at all.
        if sum_nucl_count:
            most_freq_nucl_freq = float(top_count) / sum_nucl_count
        else:
            most_freq_nucl_freq = 0.0

        current_line = [str(pos + 1)]
        current_line.extend(str(counts[nucl]) for nucl in nucl_list)
        current_line.append(most_freq_nucl)
        current_line.append(str(most_freq_nucl_freq))
        output_content.append('\t'.join(current_line))

    # Write the report once, after all rows are built, in text mode because
    # the rows are str.  A header-only file is produced when there are no
    # positions, so the message below is always true.
    with open(output_file, 'w') as f:
        for line in output_content:
            f.write("%s\n" % line)
    print('A report has been written to {0}'.format(output_file))
    print(
        'A total of {0} nucleotide has unknown letter (only uppercase was counted).'
        .format(unidentified_count))
Ejemplo n.º 36
0
def main(name_space):
    """Summarize a sequence file and write a tab-delimited statistics report.

    For every record returned by ``File_IO.read_seqs`` the sequence
    (``record[1]``) is measured for its length, its ambiguous-base count
    (``Seq_IO.count_ambiguous``) and its per-base maximum homopolymer length
    (``Seq_IO.count_homop``).  The three distributions are written to the
    report file.

    Args:
        name_space: Argument list handed to ``ArgumentParser.parse_args``
            (``-i/--input`` sequence file, ``-o/--output`` report file,
            default ``report.txt``).
    """
    from lib import File_IO
    from lib import Seq_IO
    import argparse
    import textwrap
    import time

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''), prog='fast.py -stat_seqs')
    parser.add_argument("-i", "--input", help="Name of the input sequence file")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-o", "--output", default='report.txt', help="Specify a report file for output")
    args = parser.parse_args(name_space)
    seq_file = args.input
    report_file = args.output

    print("Reading in %s ..." % seq_file)
    seq_content = File_IO.read_seqs(seq_file)
    print('Found %d sequences in this curernt file, analyzing ...' % len(seq_content))
    start = time.time()

    # Each distribution maps an observed value -> number of sequences.
    seq_length = {}
    seq_ambiguous = {}
    seq_homop = {'All': {}, 'A': {}, 'T': {}, 'C': {}, 'G': {}}
    # NOTE(review): the base totals are accumulated but never written to the
    # report; kept so that Seq_IO.count_bases is still exercised as before.
    seq_total_bases = {'A': 0, 'T': 0, 'C': 0, 'G': 0}

    for record in seq_content:
        temp_seq = record[1]

        temp_length = len(temp_seq)
        seq_length[temp_length] = seq_length.get(temp_length, 0) + 1

        temp_ambiguous = Seq_IO.count_ambiguous(temp_seq)
        seq_ambiguous[temp_ambiguous] = seq_ambiguous.get(temp_ambiguous, 0) + 1

        temp_homop = Seq_IO.count_homop(temp_seq)
        for base in temp_homop:
            # A base category missing from seq_homop still raises KeyError.
            per_base = seq_homop[base]
            max_run = temp_homop[base]
            per_base[max_run] = per_base.get(max_run, 0) + 1

        temp_bases_count = Seq_IO.count_bases(temp_seq)
        for key in seq_total_bases:
            seq_total_bases[key] += temp_bases_count[key]

    end = time.time()
    used_time = round(end - start, 2)
    print('Finished analyzing, used %s second, printing report ...' % str(used_time))

    # Give every homopolymer category the same set of lengths (missing -> 0)
    # so the report table can be indexed uniformly.
    all_homop_lengths = set()
    for per_base in seq_homop.values():
        all_homop_lengths.update(per_base)
    for per_base in seq_homop.values():
        for run_length in all_homop_lengths:
            per_base.setdefault(run_length, 0)

    # All observed sequence lengths, ascending.
    all_length = sorted(seq_length)
    # max()/min() raise ValueError on an empty list; report 0 for a file
    # without sequences instead of crashing with a half-written report.
    if all_length:
        longest, shortest = all_length[-1], all_length[0]
    else:
        longest = shortest = 0

    #%% Write the report
    with open(report_file, 'w') as report:
        report.write('Report:\t%s\n' % report_file)
        report.write('Total number of sequence:\t%d\n\n' % len(seq_content))

        report.write('#' * 100 + '\n')
        report.write('Ambiguous base distribution:\nNumber of N\tNumber of sequences\n')
        for key in sorted(seq_ambiguous):
            report.write('%d\t%d\n' % (key, seq_ambiguous[key]))

        report.write('#' * 100 + '\n')
        report.write('Max homopolymer distribution:\nMax homopolyer length\tAll bases\tA\tT\tC\tG\n')
        for key in sorted(seq_homop['A']):
            report.write('%s\t%d\t%d\t%d\t%d\t%d\t\n' % (
                key, seq_homop['All'][key], seq_homop['A'][key], seq_homop['T'][key], seq_homop['C'][key],
                seq_homop['G'][key]))

        report.write('#' * 100 + '\n')
        report.write(
            'Length distribution:\tMaximum length:\t%d\tMinimum length:\t%d\n' % (longest, shortest))
        report.write('Length\tNumber of sequences\n')
        for key in reversed(all_length):
            report.write('%d\t%d\n' % (key, seq_length[key]))
    print('Report on %s can be found in %s.' % (seq_file, report_file))
Ejemplo n.º 37
0
def main(name_space):
    """Rarefy every sample of an OTU table to a common depth and save it.

    The table is parsed with ``ParseOtuTable.parser_otu_table``.  Each sample
    row (name followed by abundances) is rarefied ``-iter`` times through
    ``random_subsample.repeat_rarefaction_parallel``; the replicate in the
    middle of the richness-sorted list is kept.  The result is transposed
    back to one row per OTU, the meta data columns are appended, and the
    table is written tab-delimited.

    Args:
        name_space: Argument list handed to ``ArgumentParser.parse_args``
            (``-otu``, ``-o/--output``, ``-d/--depth``, ``-iter``,
            ``-thread``, ``-keep_all``, ``-meta_column``).
    """
    from lib import random_subsample as rs
    from lib import ParseOtuTable
    from lib import File_IO
    import argparse
    import textwrap
    import time

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -rarefy_otu_table')
    parser.add_argument('-otu', help='Input OTU table')
    parser.add_argument('-o', '--output', help='Output OTU table')
    parser.add_argument('-d', '--depth', help='Sampling depth for each sample')
    parser.add_argument('-iter', default=1, help='Iteration time for each sample')
    parser.add_argument('-thread', default=1, help='Number of threads')
    parser.add_argument('-keep_all', action='store_true', help='Indicate to keep all samples')
    parser.add_argument('-meta_column', default='taxonomy', help='Name of the first meta data')
    args = parser.parse_args(name_space)

    input_otu = args.otu

    iter_num = int(args.iter)
    thread = int(args.thread)
    meta_col = args.meta_column
    if args.output:
        output_otu = args.output
    else:
        output_otu = File_IO.name_file(input_otu, '', 'rare')

    otu_table = ParseOtuTable.parser_otu_table(input_otu, meta_col=meta_col)
    input_sample = otu_table.sample_matrix
    start = time.time()

    print('Input OTU table: %s' % input_otu)
    if args.depth:
        depth = int(args.depth)
        print('Sampling depth: %i' % depth)
    else:
        depth = min([sum(i[1:]) for i in input_sample])
        print('Sampling depth set to the minimum abundance: %i' % depth)
    print('Iteration time for each OTU: %i' % iter_num)
    print('Threads number: %i' % thread)
    print('Reading in the OTU table ...')

    # Report the number of samples in the table *before* any filtering, so
    # the "found" and "excluded" figures add up in both modes.
    total_samples = len(input_sample)
    shallow_count = sum(1 for line in input_sample if sum(line[1:]) < depth)
    print('Found %i samples in the OTU table.' % total_samples)
    if args.keep_all:
        print('%i samples has total abundance less than the sampling depth, but will be kept in the output.'
              % shallow_count)
    else:
        input_sample = [line for line in input_sample if sum(line[1:]) >= depth]
        print('%i samples has total abundance less than the sampling depth, and will be excluded.'
              % shallow_count)

    otu_id = otu_table.species_id
    # Header row of the sample-major table.  Meta column names are appended
    # after the transposition below; adding them here would only be dropped
    # by zip() when sample rows exist, and would create bogus OTU rows when
    # none do.
    otu_table_rarefied = [['OTU_ID'] + otu_id]

    for sample in input_sample:
        print('Rarefying %s ...' % sample[0])
        repeat_sample = rs.repeat_rarefaction_parallel(sample[1:], depth, iter_num, processor=thread)
        # Order replicates by richness (number of OTUs with abundance > 0)
        # and keep the one in the middle of that ordering.
        repeat_sample.sort(key=lambda x: sum(i > 0 for i in x))
        chosen = [sample[0]] + repeat_sample[iter_num // 2]
        otu_table_rarefied.append(chosen[:])

    # Transpose: one row per OTU, one column per sample.
    otu_table_rarefied = [list(i) for i in zip(*otu_table_rarefied)]

    # Add meta data
    meta_data = otu_table.meta_dict()
    for line in otu_table_rarefied[1:]:
        for key in otu_table.meta_id:
            line.append(meta_data[key][line[0]])
    for key in otu_table.meta_id:
        otu_table_rarefied[0].append(key)

    with open(output_otu, 'w') as f:
        for line in otu_table_rarefied:
            line = [str(i) for i in line]
            f.write('%s\n' % '\t'.join(line))
    print('Rarefied OTU table saved in %s.' % output_otu)
    end = time.time()
    used_time = round(float(end - start), 2)
    # No sample may survive the depth filter; avoid dividing by zero.
    if input_sample:
        time_per_sample = round(used_time / len(input_sample), 2)
    else:
        time_per_sample = 0.0
    print('Total time used: %s seconds (%s seconds per sample)' % (str(used_time), str(time_per_sample)))
Ejemplo n.º 38
0
def main(Namespace):
    """Merge every sequence file in a folder into one output file.

    Files listed by ``File_IO.file_list`` are processed in sorted order; each
    is read with ``File_IO.read_seqs`` and appended to the output through
    ``File_IO.write_seqs(..., overwrite=False)``.  If the output file already
    exists the user is asked whether to delete it first.

    Args:
        Namespace: Argument list handed to ``ArgumentParser.parse_args``
            (``-i/--input`` folder, ``-o/--output`` file, and either
            ``-fasta`` or ``-fastq``; FASTQ is the default).

    Raises:
        SystemExit: If the input folder or output file is not given, or the
            user declines to overwrite an existing output file.
    """
    import argparse
    from lib import File_IO
    import sys
    import os
    import time
    import textwrap

    # raw_input() only exists on Python 2; Python 3 renamed it to input().
    try:
        ask = raw_input
    except NameError:
        ask = input

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -merge_seqs')
    parser.add_argument(
        '-i',
        '--input',
        help='Name of the input folder containing files to be merged')
    parser.add_argument('-o', '--output', help='Name of the merged file')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-fasta',
                       action='store_true',
                       help='Set the file type to FASTA')
    group.add_argument(
        '-fastq',
        action='store_true',
        help='Set the file type to FASTQ, this is the default option')
    args = parser.parse_args(Namespace)

    input_folder = args.input
    if not input_folder:
        print('please specified an input folder')
        sys.exit()
    output_file = args.output
    if not output_file:
        print('Please specified an output file.')
        sys.exit()
    if os.path.isfile(output_file):
        # Float division so the size is the same on Python 2 and 3.
        file_size = round(os.path.getsize(output_file) / 1024.0**2, 0)
        exist = ask(
            '%s (%d MB)already exists , do you want to overwrite it? [y/n]' %
            (output_file, file_size))
        if exist in ('y', 'Y'):
            # Sequences are appended below, so a stale file must go first.
            os.remove(output_file)
        else:
            print('Program stopped.')
            sys.exit()
    file_type = 'fasta' if args.fasta else 'fastq'

    start = time.time()
    f_list = File_IO.file_list(input_folder)
    f_list.sort()
    print('Found %i files in the folder %s' % (len(f_list), input_folder))
    count_total = 0
    for n, seq_file in enumerate(f_list, 1):
        current_file = os.path.join(input_folder, seq_file)
        count = File_IO.write_seqs(File_IO.read_seqs(current_file, file_type),
                                   output_file,
                                   checker=False,
                                   overwrite=False)
        print('%d. Merged %d sequences from %s into the new file.' %
              (n, count, seq_file))
        count_total += count
    end = time.time()
    used_time = round(end - start, 2)
    print('Spent %s sec to merge %d records in %d files into %s' %
          (str(used_time), count_total, len(f_list), output_file))
Ejemplo n.º 39
0
def main(name_space):
    """Build a random sub-dataset from a folder of sequence files.

    ``-file_number`` files are picked from the input folder using indices
    from ``random_subsample.generate_random_index``; from each picked file
    ``-size`` records are drawn the same way and written under the output
    folder with the original file name.

    Args:
        name_space: Argument list handed to ``ArgumentParser.parse_args``
            (``-i/--input`` folder, ``-o/--output`` folder, ``-file_number``,
            ``-size``).
    """
    from lib import random_subsample as rs
    from lib import File_IO
    import argparse
    import os
    import textwrap
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
        prog='fast.py -random_subsample')
    parser.add_argument('-i',
                        '--input',
                        help='Name of the input folder with raw data')
    parser.add_argument('-o',
                        '--output',
                        default='random_dataset',
                        help='Name of the output folder with raw data')
    parser.add_argument('-file_number',
                        default=10,
                        help='Number of file to pick.')
    parser.add_argument('-size',
                        default=10000,
                        help='Sampling size for each file.')

    args = parser.parse_args(name_space)

    input_folder = args.input
    output_folder = args.output

    file_number = int(args.file_number)
    sample_size = int(args.size)

    # Create new folder
    File_IO.mk_dir(output_folder)

    # Randomly pick files to be sampled
    input_file_list = File_IO.file_list(input_folder)
    print('Found {0} files in the folder {1}'.format(len(input_file_list),
                                                     input_folder))
    file_index = rs.generate_random_index(len(input_file_list), file_number)
    file_list = [input_file_list[index] for index in file_index]

    # Randomly pick sequences from each file
    for raw_file in file_list:
        # One progress line per file.  (An `end=` keyword does not belong in
        # str.format(), where it is silently ignored.)
        print('\tRandoming sampling {0} for {1} sequences ...'.format(
            raw_file, sample_size))
        current_content = File_IO.read_seqs(os.path.join(input_folder, raw_file))
        seq_index = rs.generate_random_index(len(current_content), sample_size)
        sampled_content = [current_content[index] for index in seq_index]

        File_IO.write_seqs(sampled_content,
                           os.path.join(output_folder, raw_file))

    # Report the number of files actually sampled rather than the requested
    # number, so the summary stays accurate if fewer were picked.
    print(
        'A randomly sampled dataset ({0} files, {1} sequences per file) was generated under the folder {2}'
        .format(len(file_list), sample_size, output_folder))
Ejemplo n.º 40
0
def main(Namespace):
    """Write a per-position nucleotide frequency table for a sequence file.

    Sequences are read with ``File_IO.read_seqs`` and counted with
    ``Seq_IO.nucl_freq``.  For every position the report lists the counts of
    A, T, C, G and N, the most frequent of those five letters, and that
    letter's share of all counts at the position.

    Args:
        Namespace: Argument list handed to ``ArgumentParser.parse_args``
            (``-i/--input`` file, ``-o/--output`` report, default
            ``nucl_report.txt``, and the ``-tail`` flag).
    """
    import argparse
    import textwrap
    from lib import File_IO
    from lib import Seq_IO

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -nucl_freq')
    parser.add_argument('-i', '--input', help='Name of the input FASTA or FASTQ file.')
    parser.add_argument('-o', '--output', default='nucl_report.txt', help='Name of the reporting file.')
    parser.add_argument('-tail', action='store_true', help='Indicate to also count from the tail of the sequences.')
    args = parser.parse_args(Namespace)

    input_file = args.input
    output_file = args.output
    tail_indicator = args.tail

    print('Reading in file: {0} ...'.format(input_file))
    input_seq = File_IO.read_seqs(input_file)
    print('The file contains {0} sequences.'.format(len(input_seq)))
    if tail_indicator:
        print('Counting nucleotide frequencies from both ends of all sequences...')
    else:
        print('Counting nucleotide frequencies from the head of all sequences...')

    nucl_freq, unidentified_count = Seq_IO.nucl_freq(input_seq, tail=tail_indicator)

    nucl_list = ['A', 'T', 'C', 'G', 'N']

    # Output the counting result
    header = 'Position\tA\tT\tC\tG\tN\tMost frequent\tFrequency'
    output_content = [header]
    for pos in range(len(nucl_freq)):
        counts = nucl_freq[pos]

        # Most frequent nucleotide at this position.  Comparing
        # (count, letter) pairs breaks ties by the larger letter, the same
        # choice a descending sort of those pairs would put first.
        top_count, most_freq_nucl = max((counts[nucl], nucl) for nucl in nucl_list)
        sum_nucl_count = sum(counts.values())
        # A position without any counts has no meaningful frequency.
        if sum_nucl_count:
            most_freq_nucl_freq = float(top_count) / sum_nucl_count
        else:
            most_freq_nucl_freq = 0.0

        # Get the output for current position (1-based)
        current_line = [str(pos + 1)]
        current_line.extend(str(counts[nucl]) for nucl in nucl_list)
        current_line.append(most_freq_nucl)
        current_line.append(str(most_freq_nucl_freq))
        output_content.append('\t'.join(current_line))

    # Write the report once, after all positions are collected.  Doing this
    # inside the loop rewrites the whole file for every position and writes
    # nothing at all when there are no positions.
    with open(output_file, 'w') as f:
        for line in output_content:
            f.write("%s\n" % line)
    print('A report has been written to {0}'.format(output_file))
    print('A total of {0} nucleotide has unknown letter (only uppercase was counted).'.format(unidentified_count))
Ejemplo n.º 41
0
def main(Namespace):
    """Filter an OTU map by OTU size or by an explicit list of OTU names.

    The map is read with ``ParseOtuMap.read_otu_map``.  When ``-name_list``
    or ``-fasta`` is given, only the named OTUs are kept (names come from the
    lines of the list file or from ``record[0]`` of each FASTA record);
    otherwise OTUs are filtered with ``ParseOtuMap.filter_by_size`` using
    ``-min_size``.  Summary statistics of the original and the filtered map
    are printed and the filtered map is written to the output file.

    Args:
        Namespace: Argument list handed to ``ArgumentParser.parse_args``
            (``-i/--input``, ``-o/--output`` and one of ``-min_size``,
            ``-name_list``, ``-fasta``).

    Raises:
        SystemExit: If a requested name is not present in the OTU map.
    """
    import argparse
    import textwrap
    from lib import ParseOtuMap
    from lib import File_IO
    import sys

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''), prog='fast.py -filter_otu_map')
    parser.add_argument('-i', '--input', help='Input OTU map')
    parser.add_argument('-o', '--output', help='Output OTU map')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-min_size', default=2, help='The minimum size of an OTU to be kept')
    group.add_argument('-name_list', help='A file contains a list of sequence name to be picked')
    group.add_argument('-fasta', help='A FASTA file contains sequence to be picked')
    args = parser.parse_args(Namespace)

    map_file = args.input
    output_file = args.output

    # Collect the names to pick, remembering which file they came from so
    # the progress message names the right source.
    pick_list = None
    pick_source = None
    if args.name_list:
        pick_source = args.name_list
        # Plain 'r': the 'U' mode flag is rejected by Python 3.11+.
        with open(args.name_list, 'r') as f:
            pick_list = []
            for line in f:
                name = line.rstrip('\r\n')
                if name:  # a blank line is not an OTU name
                    pick_list.append(name)
    elif args.fasta:
        pick_source = args.fasta
        pick_list = [record[0] for record in File_IO.read_seqs(args.fasta)]

    print('Reading in %s ...' % map_file)
    MapDict = ParseOtuMap.read_otu_map(map_file)

    # Filter OTU map based on parameters
    if pick_list is None:
        # -min_size always has a value (default 2), so the size filter is
        # the mode whenever no name source was given.
        min_size = int(args.min_size)
        print('Filtering OTUs with less than %d sequences ...' % min_size)
        MapDictFiltered = ParseOtuMap.filter_by_size(MapDict, min_size=min_size)
    else:
        print('Pick OTUs based on the names in %s ...' % pick_source)
        MapDictFiltered = {}
        for name in pick_list:
            try:
                MapDictFiltered[name] = MapDict[name]
            except KeyError:
                print('Cannot find %s in the OTU map. Program exits.' % name)
                sys.exit()

    #  Report comparison of original and filtered maps
    old_map = ParseOtuMap.otu_map_parser(MapDict)
    new_map = ParseOtuMap.otu_map_parser(MapDictFiltered)

    print('\n')
    print('Original OTU map:')
    print('\t OTU=%i (Total Sequences=%i, Max=%i, Min=%i, Ave=%i)' % (
        old_map.derep_count, old_map.seqs_count, old_map.max_derep, old_map.min_derep, old_map.ave_derep))
    print('Filtered OTU map:')
    print('\t OTU=%i (Total Sequences=%i, Max=%i, Min=%i, Ave=%i)' % (
        new_map.derep_count, new_map.seqs_count, new_map.max_derep, new_map.min_derep, new_map.ave_derep))

    print('Writing new map ...')
    ParseOtuMap.write_otu_map(MapDictFiltered, output_file=output_file)
    print('New map saved in %s.' % output_file)
Ejemplo n.º 42
0
def main(name_space):
    """Extract selected OTUs from a FAST hybrid map.

    The hybrid map is read with ``ParseOtuMap.read_fast_output``.  For every
    requested OTU the entries under its ``'sample'`` key are merged into one
    derep map, which is written to ``<output>.txt``.  Each derep entry is
    also written to ``<output>.fasta`` with a ``;size=`` annotation (the sum
    of its per-sample counts), largest first.

    Args:
        name_space: Argument list handed to ``ArgumentParser.parse_args``
            (``-i/--input`` hybrid map, ``-o/--output`` prefix, and either
            ``-otu_list`` as a comma-separated string or ``-otu_file`` with
            one OTU name per line).
    """
    import argparse
    import textwrap
    from lib import ParseOtuMap
    from lib import File_IO

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent('''\
                                    ------------------------
                                    By Zewei Song
                                    University of Minnesota
                                    Dept. Plant Pathology
                                    [email protected]
                                    ------------------------'''),
        prog='fast.py -subset_fast_hybrid')
    parser.add_argument('-i', '--input', help='Input of FAST hybrid map.')
    parser.add_argument('-o',
                        '--output',
                        help='Output prefix for the derep map and sequence')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-otu_list',
                       help='A list of OTU names seperated by ","')
    group.add_argument('-otu_file',
                       help='A file contains a list of OTU names (no header)')
    args = parser.parse_args(name_space)

    print('Subtracting a FAST hybrid map with provieded OTU names ...')
    input_file = args.input
    output_derep = args.output + '.txt'
    output_fasta = args.output + '.fasta'

    otu_list = []
    if args.otu_list:
        otu_list = args.otu_list.split(',')
    elif args.otu_file:
        with open(args.otu_file) as f:
            for line in f:
                # Drop the line terminator (it would never match a key of
                # the hybrid map) and ignore blank lines.
                otu_name = line.strip()
                if otu_name:
                    otu_list.append(otu_name)
    print('Found {0} OTU names.'.format(len(otu_list)))

    print('Reading in the FAST hybrid map: {0} ...'.format(input_file))
    hybrid_map = ParseOtuMap.read_fast_output(input_file)
    fast_derep = {}
    for otu in otu_list:
        fast_derep.update(hybrid_map[otu]['sample'])
    ParseOtuMap.write_fast_output(fast_derep, output_derep)
    print('A FAST derep map wrote to: {0}.'.format(output_derep))

    # Build [size, label, sequence] rows so a plain descending sort orders
    # the records by size first.
    derep_seq = []
    for key, value in fast_derep.items():
        derep_size = sum(value['sample'].values())
        seq_label = key + ';size=' + str(derep_size)
        derep_seq.append([derep_size, seq_label, value['seq']])

    derep_seq.sort(reverse=True)
    # Drop the leading size column, leaving [label, sequence] records.
    derep_seq = [i[1:] for i in derep_seq]

    count = File_IO.write_seqs(derep_seq, output_fasta, checker=False)
    print(
        'A dereplicated FASTA file wrote to {0}, containing {1} sequences with size annotation.'
        .format(output_fasta, count))
    print('\n')