Beispiel #1
0
def write_swarms(output_fasta_file, output_counts_file, output_map_file, min_samples, min_count):
    """Write the filtered swarm FASTA and, optionally, the counts table and id map.

    A swarm is written only when its dict_swarm_num_samples value is at least
    ``min_samples`` and its dict_swarm_counts value is at least ``min_count``.
    Reads the module-level dict_swarm_counts, dict_swarm_num_samples,
    dict_swarm_seq, dict_swarm_sample_counts, dict_id_swarm, dict_sample_name
    and sample_list.

    Args:
        output_fasta_file: FASTA output path; a falsy value writes to stdout.
        output_counts_file: path of the tab-separated per-sample counts table;
            a falsy value skips the table.
        output_map_file: path of the "swarm id <TAB> member id" map; a falsy
            value skips the map.
        min_samples: minimum number of samples for a swarm to be kept.
        min_count: minimum total count for a swarm to be kept.
    """
    # set at least one sample where counts not given
    for swarm_id in dict_swarm_counts:
        dict_swarm_num_samples.setdefault(swarm_id, 1)

    def _passes_filters(swarm_id):
        return (dict_swarm_num_samples[swarm_id] >= min_samples
                and dict_swarm_counts[swarm_id] >= min_count)

    out_handle1 = sys.stdout
    if output_fasta_file:
        out_handle1 = happyfile.hopen_write_or_else(output_fasta_file)
        if verbose:
            print("Writing FASTA file: " + output_fasta_file, file=sys.stderr)

    try:
        for swarm_id in dict_swarm_counts:
            if _passes_filters(swarm_id):
                print(">" + swarm_id + "\n" + dict_swarm_seq[swarm_id], file=out_handle1)
    finally:
        # Only close handles this function opened: closing sys.stdout would
        # break every later write to stdout in the process.
        if output_fasta_file:
            out_handle1.close()
        else:
            out_handle1.flush()

    if output_counts_file:
        out_handle2 = happyfile.hopen_write_or_else(output_counts_file)

        if verbose:
            print("Writing counts file: " + output_counts_file, file=sys.stderr)

        try:
            # A sample is labelled with its display name when one is registered.
            column_names = ['id'] + [dict_sample_name.get(name, name) for name in sample_list]
            print("\t".join(column_names), file=out_handle2)

            num_samples = len(sample_list)
            for swarm_id in dict_swarm_counts:
                if _passes_filters(swarm_id):
                    samplecounts = [dict_swarm_sample_counts.get((swarm_id, i), 0)
                                    for i in range(num_samples)]
                    print(swarm_id + "\t" + "\t".join(str(x) for x in samplecounts), file=out_handle2)
        finally:
            out_handle2.close()

    if output_map_file:
        out_handle3 = happyfile.hopen_write_or_else(output_map_file)

        if verbose:
            print("Writing map file: " + output_map_file, file=sys.stderr)

        try:
            # Members are listed grouped by the swarm they belong to.
            for member_id in sorted(dict_id_swarm, key=dict_id_swarm.get):
                swarm_id = dict_id_swarm[member_id]
                if _passes_filters(swarm_id):
                    print(swarm_id + "\t" + member_id, file=out_handle3)
        finally:
            out_handle3.close()
def write_swarms(output_counts_file):
    """Write the swarm counts table with best-hit and taxonomy columns.

    One header row is written, then one tab-separated row per key of the
    module-level dict_swarm_counts: swarm id, best hit
    (dict_swarm_best_hit), taxonomy of that best hit (dict_id_taxonomy) and
    the per-sample counts from dict_swarm_sample_counts (0 when missing).

    Args:
        output_counts_file: output path; a falsy value writes to stdout.
    """
    out_handle = sys.stdout
    if output_counts_file:
        out_handle = happyfile.hopen_write_or_else(output_counts_file)
        if verbose:
            # .write() instead of the print statement so the function runs
            # unchanged on both Python 2 and Python 3.
            sys.stderr.write("Writing counts file: " + output_counts_file + "\n")

    try:
        column_names = ['id', 'besthit', 'taxonomy']
        # A sample is labelled with its display name when one is registered.
        column_names.extend(dict_sample_name.get(name, name) for name in sample_list)
        out_handle.write("\t".join(column_names) + "\n")

        num_samples = len(sample_list)
        for swarm_id in dict_swarm_counts:
            besthit = dict_swarm_best_hit.get(swarm_id, "")
            tax = dict_id_taxonomy.get(besthit, "") if besthit else ""
            samplecounts = [swarm_id, besthit, tax]
            samplecounts.extend(dict_swarm_sample_counts.get((swarm_id, i), 0)
                                for i in range(num_samples))
            out_handle.write("\t".join(str(x) for x in samplecounts) + "\n")
    finally:
        # stdout is left open; only a file opened here is closed.
        if output_counts_file:
            out_handle.close()
Beispiel #3
0
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose id is in dict_derep_ids to a new file.

    The record id is the header text after ">" up to the first whitespace
    character. Header and sequence lines of a matching record are written
    with trailing whitespace stripped.

    Args:
        fasta_file: path of the FASTA file to read.
        swarm_content_fasta_file: path of the FASTA file to write.

    Returns:
        The number of records (header lines) that were copied.
    """
    swarm_content_size = 0
    in_handle = happyfile.hopen_or_else(fasta_file)

    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    out_handle = happyfile.hopen_write_or_else(swarm_content_fasta_file)

    if verbose:
        print("Writing swarm content FASTA file: " + swarm_content_fasta_file, file=sys.stderr)

    try:
        write_out = False
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()

            if line.startswith(">"):
                # Raw string: '\s' in a plain literal is an invalid escape.
                seq_id = re.split(r'\s', line[1:])[0]
                # The decision made on the header applies to the sequence
                # lines that follow it.
                write_out = seq_id in dict_derep_ids
                if write_out:
                    swarm_content_size += 1

            if write_out:
                print(line, file=out_handle)
    finally:
        # Release both handles even when reading or writing fails.
        in_handle.close()
        out_handle.close()

    return swarm_content_size
Beispiel #4
0
def filter_fastq(fastq_file, output_file, min_quality, min_seq_len,
                 max_seq_len):
    """Read a FASTQ file four lines at a time and pass each record to filter_line.

    Line 1 of a record supplies the id (text after the first character up to
    the first whitespace), line 2 the sequence and line 4 the quality string;
    line 3 is ignored. filter_line is defined elsewhere in the file and
    receives the output handle; presumably it does the actual filtering and
    writing (confirm against its definition).

    Args:
        fastq_file: path of the FASTQ file to read.
        output_file: output path; a falsy value sends output to stdout.
        min_quality: forwarded unchanged to filter_line.
        min_seq_len: forwarded unchanged to filter_line.
        max_seq_len: forwarded unchanged to filter_line.
    """
    in_handle = happyfile.hopen_or_else(fastq_file)

    if verbose:
        print("Reading FASTQ file: " + fastq_file, file=sys.stderr)

    out_handle = sys.stdout
    if output_file:
        out_handle = happyfile.hopen_write_or_else(output_file)
        # Announce the output only when there is a path: concatenating a
        # None output_file would raise TypeError in verbose stdout mode.
        if verbose:
            print("Writing FASTA file: " + output_file, file=sys.stderr)

    try:
        rnum = 1
        seq_id = ""
        seq = ""
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 1:
                seq_id = re.split(r'\s', line[1:])[0]
            elif rnum == 2:
                seq = line
            elif rnum == 4:
                filter_line(out_handle, seq_id, seq, line, min_quality,
                            min_seq_len, max_seq_len)
            # Cycle 1 -> 2 -> 3 -> 4 -> 1 through the lines of a record.
            rnum = rnum % 4 + 1
    finally:
        in_handle.close()
        # Never close sys.stdout, only a file opened by this function.
        if output_file:
            out_handle.close()
Beispiel #5
0
def filter_fastq(fastq_file, output_file, min_quality, min_seq_len, max_seq_len):
    """Feed every four-line FASTQ record of ``fastq_file`` to filter_line.

    Within a record, line 1 gives the id (header without its first character,
    cut at the first whitespace), line 2 the sequence, line 4 the quality
    string; line 3 is skipped. filter_line lives elsewhere in the file and is
    handed the output handle; it presumably writes the records that pass
    (confirm against its definition).

    Args:
        fastq_file: path of the FASTQ input.
        output_file: output path; when falsy, sys.stdout is used.
        min_quality: passed through to filter_line.
        min_seq_len: passed through to filter_line.
        max_seq_len: passed through to filter_line.
    """
    in_handle = happyfile.hopen_or_else(fastq_file)

    if verbose:
        print("Reading FASTQ file: " + fastq_file, file=sys.stderr)

    out_handle = sys.stdout
    if output_file:
        out_handle = happyfile.hopen_write_or_else(output_file)

    # Guard on output_file as well: with output_file=None the string
    # concatenation below would raise TypeError.
    if verbose and output_file:
        print("Writing FASTA file: " + output_file, file=sys.stderr)

    try:
        rnum = 1
        seq_id = ""
        seq = ""
        qual = ""
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 1:
                seq_id = re.split(r'\s', line[1:])[0]
            elif rnum == 2:
                seq = line
            elif rnum == 4:
                qual = line
                filter_line(out_handle, seq_id, seq, qual, min_quality, min_seq_len, max_seq_len)
            rnum += 1
            if rnum > 4:
                rnum = 1
    finally:
        # Handles were previously left open; stdout itself is never closed.
        in_handle.close()
        if output_file:
            out_handle.close()
def write_swarms(output_counts_file):
    """Emit the swarm counts table, including best hit and taxonomy.

    The table has a header row followed by one tab-separated row for every
    key of dict_swarm_counts: id, best hit, taxonomy of the best hit, then
    one count per entry of sample_list (0 where no count is stored).

    Args:
        output_counts_file: destination path; falsy means write to stdout.
    """
    sink = sys.stdout
    if output_counts_file:
        sink = happyfile.hopen_write_or_else(output_counts_file)

    if verbose and output_counts_file:
        print("Writing counts file: " + output_counts_file, file=sys.stderr)

    # Header: fixed columns, then each sample under its display name if known.
    header = ['id', 'besthit', 'taxonomy']
    header += [dict_sample_name[name] if name in dict_sample_name else name
               for name in sample_list]
    print("\t".join(header), file=sink)

    for swarm_id in dict_swarm_counts:
        hit = dict_swarm_best_hit.get(swarm_id, "")
        lineage = dict_id_taxonomy.get(hit, "") if hit else ""
        fields = [swarm_id, hit, lineage]
        fields += [dict_swarm_sample_counts.get((swarm_id, col), 0)
                   for col in range(len(sample_list))]
        print("\t".join(map(str, fields)), file=sink)

    # stdout stays open; only a file opened above is closed.
    if output_counts_file:
        sink.close()
Beispiel #7
0
def write_group_counts(output_groups_file):
    """Write the per-sample group counts table.

    The header is "group" followed by the entries of sample_list. Rows follow
    the keys of dict_group_counts sorted by their value, largest first, and
    hold the counts from dict_group_sample_counts (0 when missing).

    Args:
        output_groups_file: output path; a falsy value writes to stdout.
    """
    out_handle = sys.stdout
    if output_groups_file:
        out_handle = happyfile.hopen_write_or_else(output_groups_file)
        if verbose:
            # .write() instead of the print statement so the function runs
            # unchanged on both Python 2 and Python 3.
            sys.stderr.write("Writing group counts file: " + output_groups_file + "\n")

    try:
        column_names = ['group']
        column_names.extend(sample_list)
        out_handle.write("\t".join(column_names) + "\n")

        num_samples = len(sample_list)
        for group_name in sorted(dict_group_counts,
                                 key=dict_group_counts.get,
                                 reverse=True):
            samplecounts = [group_name]
            samplecounts.extend(dict_group_sample_counts.get((group_name, i), 0)
                                for i in range(num_samples))
            out_handle.write("\t".join(str(x) for x in samplecounts) + "\n")
    finally:
        # stdout is left open; only a file opened here is closed.
        if output_groups_file:
            out_handle.close()
Beispiel #8
0
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose id is in dict_derep_ids to a new file.

    The record id is the header text after ">" up to the first whitespace
    character. Matching records are written header and sequence lines alike,
    with trailing whitespace stripped.

    Args:
        fasta_file: path of the FASTA file to read.
        swarm_content_fasta_file: path of the FASTA file to write.

    Returns:
        The number of records (header lines) that were copied.
    """
    swarm_content_size = 0
    in_handle = happyfile.hopen_or_else(fasta_file)

    # Messages and records go through .write() rather than the print
    # statement so the function runs on both Python 2 and Python 3.
    if verbose:
        sys.stderr.write("Reading FASTA file: " + fasta_file + "\n")

    out_handle = happyfile.hopen_write_or_else(swarm_content_fasta_file)

    if verbose:
        sys.stderr.write("Writing swarm content FASTA file: " + swarm_content_fasta_file + "\n")

    try:
        write_out = False
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()

            if line.startswith(">"):
                # Raw string: '\s' in a plain literal is an invalid escape.
                seq_id = re.split(r'\s', line[1:])[0]
                # The header decides whether the following sequence lines
                # are copied too.
                write_out = seq_id in dict_derep_ids
                if write_out:
                    swarm_content_size += 1

            if write_out:
                out_handle.write(line + "\n")
    finally:
        # Release both handles even when reading or writing fails.
        in_handle.close()
        out_handle.close()

    return swarm_content_size
Beispiel #9
0
def write_group_counts(output_groups_file):
    """Emit the group-by-sample counts table.

    First row: "group" plus every entry of sample_list. Then one row per key
    of dict_group_counts, ordered by descending value, with the counts looked
    up in dict_group_sample_counts (0 where absent).

    Args:
        output_groups_file: destination path; falsy means write to stdout.
    """
    sink = sys.stdout
    if output_groups_file:
        sink = happyfile.hopen_write_or_else(output_groups_file)

    if verbose and output_groups_file:
        xprint("Writing group counts file: " + output_groups_file)

    header = ['group'] + [name for name in sample_list]
    sink.write("\t".join(header) + "\n")

    ordered_groups = sorted(dict_group_counts, key=dict_group_counts.get, reverse=True)
    for group_name in ordered_groups:
        cells = [group_name]
        cells += [dict_group_sample_counts.get((group_name, col), 0)
                  for col in range(len(sample_list))]
        sink.write("\t".join(map(str, cells)) + "\n")

    # stdout stays open; only a file opened above is closed.
    if output_groups_file:
        sink.close()
Beispiel #10
0
def write_swarms(output_fasta_file, output_counts_file, output_map_file,
                 min_samples, min_count):
    """Write swarm sequences and, on request, their counts table and id map.

    Only swarms found in at least ``min_samples`` samples
    (dict_swarm_num_samples) with a total count of at least ``min_count``
    (dict_swarm_counts) are written to any of the outputs.

    Args:
        output_fasta_file: FASTA output path; falsy writes to stdout.
        output_counts_file: counts table path (tab-separated, one column per
            entry of sample_list); falsy skips it.
        output_map_file: path for "swarm id <TAB> member id" lines taken from
            dict_id_swarm; falsy skips it.
        min_samples: sample-number threshold.
        min_count: total-count threshold.
    """
    # set at least one sample where counts not given
    for swarm_id in dict_swarm_counts:
        if swarm_id not in dict_swarm_num_samples:
            dict_swarm_num_samples[swarm_id] = 1

    def _is_kept(swarm_id):
        return (dict_swarm_num_samples[swarm_id] >= min_samples
                and dict_swarm_counts[swarm_id] >= min_count)

    out_handle1 = sys.stdout
    if output_fasta_file:
        out_handle1 = happyfile.hopen_write_or_else(output_fasta_file)

    if verbose and output_fasta_file:
        print("Writing FASTA file: " + output_fasta_file, file=sys.stderr)

    try:
        for swarm_id in dict_swarm_counts:
            if _is_kept(swarm_id):
                print(">" + swarm_id + "\n" + dict_swarm_seq[swarm_id],
                      file=out_handle1)
    finally:
        # The unconditional close() used to close sys.stdout when no FASTA
        # path was given; now only a file opened here is closed.
        if output_fasta_file:
            out_handle1.close()
        else:
            out_handle1.flush()

    if output_counts_file:
        out_handle2 = happyfile.hopen_write_or_else(output_counts_file)

        if verbose:
            print("Writing counts file: " + output_counts_file,
                  file=sys.stderr)

        try:
            column_names = ['id']
            # Prefer the registered display name of a sample when present.
            column_names.extend(
                dict_sample_name.get(name, name) for name in sample_list)
            print("\t".join(column_names), file=out_handle2)

            for swarm_id in dict_swarm_counts:
                if not _is_kept(swarm_id):
                    continue
                samplecounts = [
                    dict_swarm_sample_counts.get((swarm_id, i), 0)
                    for i in range(len(sample_list))
                ]
                print(swarm_id + "\t" +
                      "\t".join(str(x) for x in samplecounts),
                      file=out_handle2)
        finally:
            out_handle2.close()

    if output_map_file:
        out_handle3 = happyfile.hopen_write_or_else(output_map_file)

        if verbose:
            print("Writing map file: " + output_map_file, file=sys.stderr)

        try:
            # Sorting by swarm id keeps the members of a swarm together.
            for member_id in sorted(dict_id_swarm, key=dict_id_swarm.get):
                swarm_id = dict_id_swarm[member_id]
                if _is_kept(swarm_id):
                    print(swarm_id + "\t" + member_id, file=out_handle3)
        finally:
            out_handle3.close()
Beispiel #11
0
def write_purity(output_swarm_content_tax_file, output_swarm_purity_file, output_purity_pdf):
    """Write swarm purity tables and run the external plotting script.

    Steps, in order:
      1. If ``output_swarm_content_tax_file`` is given, write one row per id
         in dict_derep_ids: id, swarm, best hit, taxonomy of the best hit.
      2. Write ``output_swarm_purity_file`` with, per swarm, its size and the
         part of that size whose taxonomy matches the swarm's own id.
      3. Report the number of 100%-pure swarms on stderr.
      4. Run ``R_script`` on the purity file and the PDF path; exit with
         status 2 if the command returns non-zero.

    Args:
        output_swarm_content_tax_file: optional path for the per-id table.
        output_swarm_purity_file: path for the per-swarm purity table.
        output_purity_pdf: path handed to R_script as its second argument.
    """
    if output_swarm_content_tax_file:
        tax_out = happyfile.hopen_write_or_else(output_swarm_content_tax_file)
        if verbose:
            print("Writing swarm content taxonomy file: " + output_swarm_content_tax_file, file=sys.stderr)

        print("\t".join(['id', 'swarm', 'besthit', 'taxonomy']), file=tax_out)

        for derep_id in dict_derep_ids:
            member_swarm = dict_id_swarm.get(derep_id, "")
            hit = dict_id_best_hit.get(derep_id, "")
            lineage = dict_id_taxonomy.get(hit, "") if hit else ""
            print("\t".join([derep_id, member_swarm, hit, lineage]), file=tax_out)

        tax_out.close()

    purity_out = happyfile.hopen_write_or_else(output_swarm_purity_file)

    if verbose:
        print("Writing swarm purity file: " + output_swarm_purity_file, file=sys.stderr)

    count_all = {}
    count_same_tax = {}
    for derep_id in dict_derep_ids:
        # The second "_"-separated field of the id is its abundance; ids
        # without an underscore raise ValueError here.
        _, size_text = derep_id.split('_')[:2]
        swarm_id = dict_id_swarm.get(derep_id, "")

        # Every id counts for at least one sequence.
        derep_size = max(int(size_text), 1)

        if not swarm_id:
            continue

        count_all[swarm_id] = count_all.get(swarm_id, 0) + derep_size

        if derep_id == swarm_id:
            # An id that is itself the swarm id always agrees with the swarm.
            same_taxon = True
        else:
            same_taxon = False
            hit = dict_id_best_hit.get(derep_id, "")
            seed_hit = dict_id_best_hit.get(swarm_id, "")
            if hit and seed_hit:
                member_tax = dict_id_taxonomy.get(hit, "")
                seed_tax = dict_id_taxonomy.get(seed_hit, "")
                same_taxon = bool(member_tax) and member_tax == seed_tax

        if same_taxon:
            count_same_tax[swarm_id] = count_same_tax.get(swarm_id, 0) + derep_size

    print("\t".join(['swarm_id', 'taxonomy', 'size', 'same_taxon', 'purity']), file=purity_out)

    count_pure_OTUs = 0
    for swarm_id in count_all:
        total = count_all[swarm_id]
        same = count_same_tax.get(swarm_id, 0)
        # NOTE(review): taxonomy is looked up by swarm id here, while the rest
        # of this function keys dict_id_taxonomy by best-hit id - confirm.
        fields = [
            swarm_id,
            dict_id_taxonomy.get(swarm_id, ""),
            str(total),
            str(same),
            str(1.0 * same / total),
        ]
        print("\t".join(fields), file=purity_out)
        if same == total:
            count_pure_OTUs += 1

    purity_out.close()

    if len(count_all):
        num_swarms = len(count_all)
        percent_pure = round(100.0 * count_pure_OTUs / num_swarms, 1)
        print("OTUs 100% purity: " + str(count_pure_OTUs) + " / " + str(num_swarms) + " (" + str(percent_pure) + "%)", file=sys.stderr)

    cmd = " ".join([R_script, output_swarm_purity_file, output_purity_pdf])

    if verbose:
        print(cmd, file=sys.stderr)

    # The command goes through the shell with its stdout discarded.
    if os.system(cmd + " >/dev/null") != 0:
        print("[purity_plot] ERROR: " + R_script, file=sys.stderr)
        sys.exit(2)
Beispiel #12
0
def remove_plastid_seqs(output_base_file):
    """Move chloroplast swarms out of the main result files.

    Swarms whose taxonomy (third column of ``<base>.swarm.tax``) starts with
    "Bacteria;Cyanobacteria;Chloroplast" are treated as plastid. The derep
    and swarm FASTA, counts and table files are each split into a 16S part
    (written to ``<file>.tmp``) and a plastid part (``<base>.plastid.*``);
    the taxonomy table only gets a 16S part. When all six tmp files exist
    they replace the originals through replace_file, otherwise the process
    exits with status 2.

    Nothing is done when ``<base>.plastid.swarm.fa`` already exists and the
    module-level ``overwrite`` flag is not set.

    Args:
        output_base_file: common path prefix of all input and output files.
    """
    dict_plastid = {}
    swarm_tax = output_base_file + ".swarm.tax"

    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"

    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"

    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    if not (overwrite or not os.path.exists(swarm_plastid_fa)):
        return

    if verbose:
        # .write() instead of the print statement so the function runs on
        # both Python 2 and Python 3.
        sys.stderr.write("Filtering chloroplast sequences\n")

    # The taxonomy pass seeds dict_plastid with swarm ids and the swarm-table
    # pass extends it with member ids, so both must run before the FASTA and
    # counts passes that only read it.
    _split_plastid_taxonomy(swarm_tax, tmp_swarm_16S_tax, dict_plastid)
    _split_plastid_swarm_table(swarm_table, tmp_swarm_16S_table, swarm_plastid_table, dict_plastid)
    _split_plastid_fasta(derep_fa, tmp_derep_16S_fa, derep_plastid_fa, dict_plastid)
    _split_plastid_counts(derep_counts, tmp_derep_16S_counts, derep_plastid_counts, dict_plastid)
    _split_plastid_fasta(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa, dict_plastid)
    _split_plastid_counts(swarm_counts, tmp_swarm_16S_counts, swarm_plastid_counts, dict_plastid)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if all(os.path.exists(tmp_file) for tmp_file, _ in replacements):
        for tmp_file, final_file in replacements:
            replace_file(tmp_file, final_file)
    else:
        sys.stderr.write("Not all tmp_ files found\n")
        sys.exit(2)


def _iter_stripped_lines(handle):
    """Yield every line read from ``handle`` with trailing whitespace removed."""
    while True:
        line = handle.readline()
        if not line:
            return
        yield line.rstrip()


def _split_plastid_taxonomy(tax_file, out_16S_file, dict_plastid):
    """Copy the header and non-chloroplast rows to ``out_16S_file``; record chloroplast ids in ``dict_plastid``."""
    in_handle = happyfile.hopen_or_else(tax_file)
    out_16S = happyfile.hopen_write_or_else(out_16S_file)
    try:
        firstline = True
        for line in _iter_stripped_lines(in_handle):
            if firstline:
                firstline = False
                out_16S.write(line + "\n")
                continue
            cols = line.split('\t')
            if re.match('Bacteria;Cyanobacteria;Chloroplast', cols[2]):
                dict_plastid[cols[0]] = 1
            else:
                out_16S.write(line + "\n")
    finally:
        in_handle.close()
        out_16S.close()


def _split_plastid_swarm_table(table_file, out_16S_file, out_plastid_file, dict_plastid):
    """Route each swarm line by its first id and mark every member of a plastid swarm as plastid."""
    in_handle = happyfile.hopen_or_else(table_file)
    out_16S = happyfile.hopen_write_or_else(out_16S_file)
    out_plastid = happyfile.hopen_write_or_else(out_plastid_file)
    try:
        for line in _iter_stripped_lines(in_handle):
            id_list = re.split(r'\s', line)
            if id_list[0] in dict_plastid:
                out_plastid.write(line + "\n")
                for member_id in id_list:
                    dict_plastid[member_id] = 1
            else:
                out_16S.write(line + "\n")
    finally:
        in_handle.close()
        out_16S.close()
        out_plastid.close()


def _split_plastid_fasta(fasta_file, out_16S_file, out_plastid_file, dict_plastid):
    """Route each FASTA record to the plastid or the 16S file according to its header id."""
    in_handle = happyfile.hopen_or_else(fasta_file)
    out_16S = happyfile.hopen_write_or_else(out_16S_file)
    out_plastid = happyfile.hopen_write_or_else(out_plastid_file)
    try:
        seq_id = ""
        for line in _iter_stripped_lines(in_handle):
            if line.startswith(">"):
                seq_id = re.split(r'\s', line[1:])[0]
            # Lines seen while no (non-empty) id is active are dropped.
            if not seq_id:
                continue
            if seq_id in dict_plastid:
                out_plastid.write(line + "\n")
            else:
                out_16S.write(line + "\n")
    finally:
        in_handle.close()
        out_16S.close()
        out_plastid.close()


def _split_plastid_counts(counts_file, out_16S_file, out_plastid_file, dict_plastid):
    """Copy the header to both outputs and route each row by the id in its first column."""
    in_handle = happyfile.hopen_or_else(counts_file)
    out_16S = happyfile.hopen_write_or_else(out_16S_file)
    out_plastid = happyfile.hopen_write_or_else(out_plastid_file)
    try:
        firstline = True
        for line in _iter_stripped_lines(in_handle):
            if firstline:
                firstline = False
                out_16S.write(line + "\n")
                out_plastid.write(line + "\n")
            elif line.split('\t')[0] in dict_plastid:
                out_plastid.write(line + "\n")
            else:
                out_16S.write(line + "\n")
    finally:
        in_handle.close()
        out_16S.close()
        out_plastid.close()
def write_dereps(output_fasta_file, output_counts_file, output_map_file, id_format, min_samples, min_count):
    """Write dereplicated sequences and, optionally, their counts table and id map.

    A sequence key from dict_id_counts is kept when it has a positive count
    in at least ``min_samples`` of good_fasta_files and its total count is at
    least ``min_count``.

    Args:
        output_fasta_file: FASTA output path; a falsy value writes to stdout.
        output_counts_file: per-file counts table path; falsy skips it.
        output_map_file: path of the "written id <TAB> original id" map;
            falsy skips it.
        id_format: Format.swarm names records "<key>_<count>";
            Format.bestid names them after the first id in dict_id_map that
            maps to the key.
        min_samples: minimum number of files with a positive count.
        min_count: minimum total count.
    """
    dict_bestid = {}
    dict_id_num_samples = {}

    # Number of input files in which each key has a positive count.
    num_files = len(good_fasta_files)
    for key in dict_id_counts:
        for filenum in range(num_files):
            if dict_id_file_counts.get((key, filenum), 0) > 0:
                dict_id_num_samples[key] = dict_id_num_samples.get(key, 0) + 1

    def _passes_filters(key):
        return (dict_id_num_samples.get(key, 0) >= min_samples
                and dict_id_counts[key] >= min_count)

    out_handle1 = sys.stdout
    if output_fasta_file:
        out_handle1 = happyfile.hopen_write_or_else(output_fasta_file)

    if verbose and output_fasta_file:
        xprint("Writing FASTA file: " + output_fasta_file)

    try:
        if id_format == Format.bestid:
            # The first id encountered for a key becomes its representative.
            for orig_id in dict_id_map:
                key = dict_id_map[orig_id]
                if key not in dict_bestid and dict_id_counts.get(key, 0) > 0:
                    dict_bestid[key] = orig_id

        for key in dict_id_counts:
            if _passes_filters(key) and key in dict_id_seq:
                if id_format == Format.swarm:
                    out_handle1.write(">" + key + "_" + str(dict_id_counts[key]) + "\n" + dict_id_seq[key] + "\n")
                elif id_format == Format.bestid and key in dict_bestid:
                    out_handle1.write(">" + dict_bestid[key] + "\n" + dict_id_seq[key] + "\n")
    finally:
        # Only close a file opened here; closing sys.stdout would break all
        # later writes to stdout.
        if output_fasta_file:
            out_handle1.close()
        else:
            out_handle1.flush()

    if output_counts_file:
        out_handle2 = happyfile.hopen_write_or_else(output_counts_file)

        if verbose:
            xprint("Writing counts file: " + output_counts_file)

        try:
            column_names = ['id']
            for fasta_name in good_fasta_files:
                if fasta_name in dict_sample_name:
                    column_names.append(dict_sample_name[fasta_name])
                else:
                    column_names.append(re.sub(r'\.filtered\.fa$', '', fasta_name))

            out_handle2.write("\t".join(column_names) + "\n")

            for key in dict_id_counts:
                if _passes_filters(key):
                    row_id = key + "_" + str(dict_id_counts[key])
                    if id_format == Format.bestid:
                        # NOTE(review): unlike the FASTA branch there is no
                        # `key in dict_bestid` check here, so a missing entry
                        # raises KeyError - confirm that is intended.
                        row_id = re.split(r'\s', dict_bestid[key])[0]
                    samplecounts = [dict_id_file_counts.get((key, filenum), 0)
                                    for filenum in range(num_files)]
                    out_handle2.write(row_id + "\t" + "\t".join(str(x) for x in samplecounts) + "\n")
        finally:
            out_handle2.close()

    if output_map_file:
        out_handle3 = happyfile.hopen_write_or_else(output_map_file)

        if verbose:
            xprint("Writing map file: " + output_map_file)

        try:
            for orig_id in sorted(dict_id_map, key=dict_id_map.get):
                key = dict_id_map[orig_id]
                if _passes_filters(key):
                    if id_format == Format.swarm:
                        out_handle3.write(key + "_" + str(dict_id_counts[key]) + "\t" + orig_id + "\n")
                    elif id_format == Format.bestid:
                        out_handle3.write(re.split(r'\s', dict_bestid[key])[0] + "\t" + orig_id + "\n")
        finally:
            out_handle3.close()
Beispiel #14
0
def write_dereps(output_fasta_file, output_counts_file, output_map_file,
                 id_format, min_samples, min_count):
    """Write dereplicated sequences plus an optional counts table and id map.

    Keys of dict_id_counts are kept when they have a positive count in at
    least ``min_samples`` entries of good_fasta_files and a total count of at
    least ``min_count``.

    Args:
        output_fasta_file: FASTA output path; falsy writes to stdout.
        output_counts_file: per-file counts table path; falsy skips it.
        output_map_file: path for "written id <TAB> original id" lines;
            falsy skips it.
        id_format: Format.swarm writes ids as "<key>_<count>"; Format.bestid
            uses the first id in dict_id_map that maps to the key.
        min_samples: minimum number of files with a positive count.
        min_count: minimum total count.
    """
    dict_bestid = {}
    dict_id_num_samples = {}

    # Count, per key, the input files in which it occurs at least once.
    for key in dict_id_counts:
        for filenum in range(len(good_fasta_files)):
            if dict_id_file_counts.get((key, filenum), 0) > 0:
                dict_id_num_samples[key] = dict_id_num_samples.get(key, 0) + 1

    def _is_kept(key):
        return (dict_id_num_samples.get(key, 0) >= min_samples
                and dict_id_counts[key] >= min_count)

    out_handle1 = sys.stdout
    if output_fasta_file:
        out_handle1 = happyfile.hopen_write_or_else(output_fasta_file)

    if verbose and output_fasta_file:
        print("Writing FASTA file: " + output_fasta_file, file=sys.stderr)

    try:
        if id_format == Format.bestid:
            # The first id seen for a key is used as its representative.
            for orig_id in dict_id_map:
                key = dict_id_map[orig_id]
                if key not in dict_bestid and dict_id_counts.get(key, 0) > 0:
                    dict_bestid[key] = orig_id

        for key in dict_id_counts:
            if not (_is_kept(key) and key in dict_id_seq):
                continue
            if id_format == Format.swarm:
                print(">" + key + "_" + str(dict_id_counts[key]) + "\n" +
                      dict_id_seq[key],
                      file=out_handle1)
            elif id_format == Format.bestid and key in dict_bestid:
                print(">" + dict_bestid[key] + "\n" + dict_id_seq[key],
                      file=out_handle1)
    finally:
        # The unconditional close() used to close sys.stdout when no FASTA
        # path was given; now only a file opened here is closed.
        if output_fasta_file:
            out_handle1.close()
        else:
            out_handle1.flush()

    if output_counts_file:
        out_handle2 = happyfile.hopen_write_or_else(output_counts_file)

        if verbose:
            print("Writing counts file: " + output_counts_file,
                  file=sys.stderr)

        try:
            column_names = ['id']
            for fasta_name in good_fasta_files:
                if fasta_name in dict_sample_name:
                    column_names.append(dict_sample_name[fasta_name])
                else:
                    column_names.append(
                        re.sub(r'\.filtered\.fa$', '', fasta_name))

            print("\t".join(column_names), file=out_handle2)

            for key in dict_id_counts:
                if not _is_kept(key):
                    continue
                row_id = key + "_" + str(dict_id_counts[key])
                if id_format == Format.bestid:
                    # NOTE(review): no `key in dict_bestid` guard here, unlike
                    # the FASTA branch; a missing entry raises KeyError -
                    # confirm that is intended.
                    row_id = re.split(r'\s', dict_bestid[key])[0]
                samplecounts = [
                    dict_id_file_counts.get((key, filenum), 0)
                    for filenum in range(len(good_fasta_files))
                ]
                print(row_id + "\t" + "\t".join(str(x) for x in samplecounts),
                      file=out_handle2)
        finally:
            out_handle2.close()

    if output_map_file:
        out_handle3 = happyfile.hopen_write_or_else(output_map_file)

        if verbose:
            print("Writing map file: " + output_map_file, file=sys.stderr)

        try:
            for orig_id in sorted(dict_id_map, key=dict_id_map.get):
                key = dict_id_map[orig_id]
                if not _is_kept(key):
                    continue
                if id_format == Format.swarm:
                    print(key + "_" + str(dict_id_counts[key]) + "\t" + orig_id,
                          file=out_handle3)
                elif id_format == Format.bestid:
                    print(re.split(r'\s', dict_bestid[key])[0] + "\t" + orig_id,
                          file=out_handle3)
        finally:
            out_handle3.close()
Beispiel #15
0
def _plastid_iter_lines(handle):
    """Yield each line read from *handle* with trailing whitespace removed."""
    while True:
        line = handle.readline()
        if not line:
            break
        yield line.rstrip()


def _plastid_close_all(*handles):
    """Close every handle passed in, in order."""
    for handle in handles:
        handle.close()


def _plastid_split_taxonomy(tax_path, kept_path, plastid_ids):
    """Copy non-chloroplast taxonomy rows to *kept_path*; record chloroplast ids.

    The first line is copied unchanged as the header.  For every later row
    whose third tab-separated column starts with
    'Bacteria;Cyanobacteria;Chloroplast', the first column is added to
    *plastid_ids* and the row is not written anywhere.
    """
    in_handle = happyfile.hopen_or_else(tax_path)
    kept_out = happyfile.hopen_write_or_else(kept_path)
    try:
        is_header = True
        for line in _plastid_iter_lines(in_handle):
            cols = line.split('\t')
            if is_header:
                kept_out.write(line + "\n")
                is_header = False
            # Rows with fewer than three columns (e.g. blank lines) carry no
            # taxonomy string, so they are kept rather than raising IndexError.
            elif len(cols) > 2 and re.match(
                    'Bacteria;Cyanobacteria;Chloroplast', cols[2]):
                plastid_ids.add(cols[0])
            else:
                kept_out.write(line + "\n")
    finally:
        _plastid_close_all(in_handle, kept_out)


def _plastid_split_swarm_table(table_path, kept_path, plastid_path,
                               plastid_ids):
    """Route swarm table lines by their first id; grow *plastid_ids*.

    A line whose first whitespace-separated id is in *plastid_ids* goes to
    *plastid_path* and every id on that line is added to *plastid_ids*;
    all other lines go to *kept_path*.
    """
    in_handle = happyfile.hopen_or_else(table_path)
    kept_out = happyfile.hopen_write_or_else(kept_path)
    plastid_out = happyfile.hopen_write_or_else(plastid_path)
    try:
        for line in _plastid_iter_lines(in_handle):
            member_ids = re.split(r'\s', line)
            if member_ids[0] in plastid_ids:
                plastid_out.write(line + "\n")
                plastid_ids.update(member_ids)
            else:
                kept_out.write(line + "\n")
    finally:
        _plastid_close_all(in_handle, kept_out, plastid_out)


def _plastid_split_fasta(fasta_path, kept_path, plastid_path, plastid_ids):
    """Route FASTA records to *plastid_path* or *kept_path* by record id.

    The record id is the first whitespace-delimited token after '>'.  Lines
    seen before any header with a non-empty id are dropped.
    """
    in_handle = happyfile.hopen_or_else(fasta_path)
    kept_out = happyfile.hopen_write_or_else(kept_path)
    plastid_out = happyfile.hopen_write_or_else(plastid_path)
    try:
        seq_id = ""
        for line in _plastid_iter_lines(in_handle):
            if line.startswith(">"):
                seq_id = re.split(r'\s', line[1:])[0]
            if seq_id:
                if seq_id in plastid_ids:
                    plastid_out.write(line + "\n")
                else:
                    kept_out.write(line + "\n")
    finally:
        _plastid_close_all(in_handle, kept_out, plastid_out)


def _plastid_split_counts(counts_path, kept_path, plastid_path, plastid_ids):
    """Route counts rows by first column; the header line goes to both outputs."""
    in_handle = happyfile.hopen_or_else(counts_path)
    kept_out = happyfile.hopen_write_or_else(kept_path)
    plastid_out = happyfile.hopen_write_or_else(plastid_path)
    try:
        is_header = True
        for line in _plastid_iter_lines(in_handle):
            if is_header:
                kept_out.write(line + "\n")
                plastid_out.write(line + "\n")
                is_header = False
            elif line.split('\t')[0] in plastid_ids:
                plastid_out.write(line + "\n")
            else:
                kept_out.write(line + "\n")
    finally:
        _plastid_close_all(in_handle, kept_out, plastid_out)


def remove_plastid_seqs(output_base_file):
    """Split chloroplast (plastid) entries out of the derep/swarm outputs.

    Reads <base>.swarm.tax to find swarm ids whose taxonomy starts with
    'Bacteria;Cyanobacteria;Chloroplast', then splits <base>.swarm,
    <base>.derep.fa, <base>.derep.counts, <base>.swarm.fa and
    <base>.swarm.counts into a plastid part (<base>.plastid.*) and a
    remaining part written to '<original>.tmp'.  When all '.tmp' files
    exist they replace the originals via replace_file(); otherwise a
    message is written to stderr and the process exits with status 2.

    Nothing is done when <base>.plastid.swarm.fa already exists, unless the
    module-level `overwrite` flag is set.

    Lines are emitted with handle.write() rather than the print statement
    so the function behaves the same whether or not `print` is a function
    in this module.

    Args:
        output_base_file: path prefix shared by all input and output files.
    """
    swarm_tax = output_base_file + ".swarm.tax"

    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"

    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"

    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    if not overwrite and os.path.exists(swarm_plastid_fa):
        return

    if verbose:
        sys.stderr.write("Filtering chloroplast sequences\n")

    # Ids are collected from the taxonomy table first, extended with every
    # swarm member by the swarm-table pass, and then used by the remaining
    # passes -- so the order of the calls below matters.
    plastid_ids = set()
    _plastid_split_taxonomy(swarm_tax, tmp_swarm_16S_tax, plastid_ids)
    _plastid_split_swarm_table(swarm_table, tmp_swarm_16S_table,
                               swarm_plastid_table, plastid_ids)
    _plastid_split_fasta(derep_fa, tmp_derep_16S_fa, derep_plastid_fa,
                         plastid_ids)
    _plastid_split_counts(derep_counts, tmp_derep_16S_counts,
                          derep_plastid_counts, plastid_ids)
    _plastid_split_fasta(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa,
                         plastid_ids)
    _plastid_split_counts(swarm_counts, tmp_swarm_16S_counts,
                          swarm_plastid_counts, plastid_ids)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if all(os.path.exists(tmp_path) for tmp_path, _ in replacements):
        for tmp_path, final_path in replacements:
            replace_file(tmp_path, final_path)
    else:
        sys.stderr.write("Not all tmp_ files found\n")
        sys.exit(2)
Beispiel #16
0
def write_purity(output_swarm_content_tax_file, output_swarm_purity_file, output_purity_pdf):
    """Write per-swarm taxonomic purity tables and run the purity plot script.

    Uses the module-level dicts dict_derep_ids, dict_id_swarm,
    dict_id_best_hit and dict_id_taxonomy, plus `verbose` and `R_script`.

    For each swarm, `size` is the sum of the member sizes, where a member's
    size is the integer in the second '_'-separated field of its id
    (values below 1 count as 1).  `same_taxon` adds the size of the member
    whose id equals the swarm id, and of every other member whose best
    hit has the same non-empty taxonomy as the best hit of the swarm id.
    `purity` is same_taxon / size.

    Lines are emitted with handle.write() rather than the print statement
    so the function behaves the same whether or not `print` is a function
    in this module.

    Args:
        output_swarm_content_tax_file: optional path; when truthy, a table
            of id / swarm / besthit / taxonomy is written for every id.
        output_swarm_purity_file: path of the per-swarm purity table.
        output_purity_pdf: path passed to R_script as its second argument.

    Raises:
        ValueError: if an id has no '_' or its size field is not an integer.
        SystemExit: with status 2 when the R script returns non-zero.
    """
    if output_swarm_content_tax_file:
        out_handle1 = happyfile.hopen_write_or_else(output_swarm_content_tax_file)
        try:
            if verbose:
                sys.stderr.write("Writing swarm content taxonomy file: " + output_swarm_content_tax_file + "\n")

            out_handle1.write("\t".join(['id', 'swarm', 'besthit', 'taxonomy']) + "\n")

            for seq_id in dict_derep_ids:
                swarm_id = dict_id_swarm.get(seq_id, "")
                besthit = dict_id_best_hit.get(seq_id, "")
                tax = dict_id_taxonomy.get(besthit, "") if besthit else ""
                out_handle1.write("\t".join([seq_id, swarm_id, besthit, tax]) + "\n")
        finally:
            out_handle1.close()

    out_handle2 = happyfile.hopen_write_or_else(output_swarm_purity_file)
    try:
        if verbose:
            sys.stderr.write("Writing swarm purity file: " + output_swarm_purity_file + "\n")

        count_all = {}
        count_same_tax = {}
        for seq_id in dict_derep_ids:
            # Two-value unpack keeps the ValueError on ids without '_'.
            _, size_field = seq_id.split('_')[:2]
            swarm_id = dict_id_swarm.get(seq_id, "")

            derep_size = max(int(size_field), 1)

            if not swarm_id:
                continue

            count_all[swarm_id] = count_all.get(swarm_id, 0) + derep_size
            if seq_id == swarm_id:
                same_taxon = True
            else:
                same_taxon = False
                besthit = dict_id_best_hit.get(seq_id, "")
                besthit_swarm = dict_id_best_hit.get(swarm_id, "")
                if besthit and besthit_swarm:
                    id_tax = dict_id_taxonomy.get(besthit, "")
                    swarm_tax = dict_id_taxonomy.get(besthit_swarm, "")
                    same_taxon = bool(id_tax) and id_tax == swarm_tax
            if same_taxon:
                count_same_tax[swarm_id] = count_same_tax.get(swarm_id, 0) + derep_size

        out_handle2.write("\t".join(['swarm_id', 'taxonomy', 'size', 'same_taxon', 'purity']) + "\n")

        count_pure_OTUs = 0
        for swarm_id in count_all:
            total = count_all[swarm_id]
            same = count_same_tax.get(swarm_id, 0)
            # NOTE(review): taxonomy is looked up by swarm id here, while the
            # rest of this function keys dict_id_taxonomy by best-hit id --
            # confirm the dict also holds swarm ids.
            out_handle2.write("\t".join([swarm_id, dict_id_taxonomy.get(swarm_id, ""), str(total), str(same), str(1.0 * same / total)]) + "\n")
            if same == total:
                count_pure_OTUs += 1
    finally:
        out_handle2.close()

    if len(count_all):
        sys.stderr.write("OTUs 100% purity: " + str(count_pure_OTUs) + " / " + str(len(count_all)) + " (" + str(round(100.0 * count_pure_OTUs / len(count_all), 1)) + "%)" + "\n")

    # NOTE(review): the command is assembled as a shell string, so paths
    # containing spaces or shell metacharacters are not handled.
    cmd = " ".join([R_script, output_swarm_purity_file, output_purity_pdf])

    if verbose:
        sys.stderr.write(cmd + "\n")

    rc = os.system(cmd + " >/dev/null")
    if rc != 0:
        sys.stderr.write("[purity_plot] ERROR: " + R_script + "\n")
        sys.exit(2)