コード例 #1
0
def get_taxonomy(fasta_file, ggsearch_file, database_file, cpus):
    """Load the best search hit per query and the taxonomy text per database ID.

    If ``fasta_file`` is given and ``ggsearch_file`` does not exist yet,
    ``glsearch36`` is run through the shell to create it; the process exits
    with status 2 when the command fails.

    The tab-separated search file is then read: for each query ID (column 1)
    the subject ID (column 2) with the numerically highest value in column 12
    is stored in ``dict_swarm_best_hit`` / ``dict_swarm_best_bs``.  Finally
    every ``>`` header of ``database_file`` is parsed into
    ``dict_id_taxonomy`` (``|`` becomes ``;`` and ``+`` becomes a space).

    Args:
        fasta_file: query FASTA path; falsy to skip running the search.
        ggsearch_file: path of the tabular search output.
        database_file: FASTA database whose headers carry the taxonomy text.
        cpus: thread count passed to ``-T``; values below 1 are raised to 1.
    """
    global dict_swarm_best_hit
    global dict_swarm_best_bs
    global dict_id_taxonomy

    if fasta_file:
        if os.path.exists(ggsearch_file):
            if verbose:
                sys.stderr.write("Ignoring FASTA file: " + fasta_file + "\n")
        else:
            # sys.stderr.write works on Python 2 and 3; the former
            # "print >>sys.stderr" form raises TypeError on Python 3.
            sys.stderr.write("[swarm_classify_taxonomy] running ggsearch\n")

            if cpus < 1:
                cpus = 1

            cmd = " ".join(["glsearch36 -b 1 -m 8 -T", str(cpus), fasta_file, database_file, ">", ggsearch_file])

            if verbose:
                sys.stderr.write(cmd + "\n")

            rc = os.system(cmd)
            if rc != 0:
                sys.stderr.write("[swarm_classify_taxonomy] ERROR: ggsearch\n")
                sys.exit(2)

    in_handle1 = happyfile.hopen_or_else(ggsearch_file)
    if verbose:
        sys.stderr.write("Reading ggsearch file: " + ggsearch_file + "\n")

    try:
        while True:
            line = in_handle1.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # A blank line carries no hit; unpacking it would raise.
                continue

            qid, sid, pid, alen, mm, go, qs, qe, ss, se, e, bs = line.split("\t")[:12]
            # Compare scores numerically: as strings "99.5" > "100.2" is True,
            # which would keep the weaker hit.  The stored value stays the
            # original text so readers of dict_swarm_best_bs see no change.
            if (qid not in dict_swarm_best_bs) or float(bs) > float(dict_swarm_best_bs[qid]):
                dict_swarm_best_hit[qid] = sid
                dict_swarm_best_bs[qid] = bs
    finally:
        in_handle1.close()

    in_handle2 = happyfile.hopen_or_else(database_file)
    if verbose:
        sys.stderr.write("Reading database file: " + database_file + "\n")

    try:
        while True:
            line = in_handle2.readline()
            if not line:
                break
            line = line.rstrip()

            if line.startswith(">"):
                # Header layout: ">ID <whitespace> taxonomy text".
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    taxstr = m.group(2).replace('|', ';').replace('+', ' ')
                    dict_id_taxonomy[m.group(1)] = taxstr
    finally:
        in_handle2.close()
コード例 #2
0
ファイル: swarm_map.py プロジェクト: allenlab/rRNA_pipeline
def get_swarms(fasta_file, swarm_file, cpus):
    """Fill ``dict_id_swarm`` with a sequence-ID -> swarm-ID mapping.

    If ``fasta_file`` is given and ``swarm_file`` does not exist yet,
    ``swarm`` is run through the shell to create it; the process exits with
    status 2 when the command fails.  Every FASTA header (text after ``>``)
    is first mapped to itself, then each line of the swarm file maps all of
    its whitespace-separated IDs to the first ID on that line.

    Args:
        fasta_file: FASTA path used as swarm input and as the ID source.
        swarm_file: swarm output path (one cluster per line).
        cpus: thread count passed to ``-t``; values below 1 are raised to 1.
    """
    global dict_id_swarm

    if fasta_file and not os.path.exists(swarm_file):
        print("[swarm_map] running swarm", file=sys.stderr)

        if cpus < 1:
            cpus = 1

        cmd = " ".join(
            ["swarm -f -t",
             str(cpus), "-o", swarm_file, fasta_file])

        if verbose:
            print(cmd, file=sys.stderr)
        else:
            # os.system runs the command with /bin/sh.  "&>" is a bash
            # extension: a POSIX shell such as dash reads "cmd &>/dev/null"
            # as "cmd &" plus a separate redirection, which backgrounds swarm
            # and reports success before the output file exists.
            cmd += " >/dev/null 2>&1"

        rc = os.system(cmd)
        if rc != 0:
            print("[swarm_map] ERROR: swarm", file=sys.stderr)
            sys.exit(2)

    in_handle1 = happyfile.hopen_or_else(fasta_file)
    while True:
        line = in_handle1.readline()
        if not line:
            break
        line = line.rstrip()

        if line.startswith(">"):
            seq_id = line[1:]
            # set any IDs not returned by swarm, to their own cluster
            dict_id_swarm[seq_id] = seq_id
    in_handle1.close()

    in_handle2 = happyfile.hopen_or_else(swarm_file)
    if verbose:
        print("Reading swarm file: " + swarm_file, file=sys.stderr)

    while True:
        line = in_handle2.readline()
        if not line:
            break
        line = line.rstrip()

        id_list = re.split(r'\s', line)
        for seq_id in id_list:
            # The first ID on the line names the swarm.
            dict_id_swarm[seq_id] = id_list[0]
    in_handle2.close()
コード例 #3
0
ファイル: swarm_map.py プロジェクト: allenlab/rRNA_pipeline
def get_swarms(fasta_file, swarm_file, cpus):
    """Build the sequence-ID -> swarm-ID map in ``dict_id_swarm``.

    When ``fasta_file`` is set and ``swarm_file`` is missing, ``swarm`` is
    invoked through the shell to produce it (exit status 2 on failure).
    Afterwards each FASTA header (text after ``>``) is mapped to itself, and
    each swarm-file line maps every whitespace-separated ID on it to the
    line's first ID.

    Args:
        fasta_file: FASTA path used as swarm input and as the ID source.
        swarm_file: swarm output path (one cluster per line).
        cpus: thread count passed to ``-t``; values below 1 are raised to 1.
    """
    global dict_id_swarm

    if fasta_file and not os.path.exists(swarm_file):
        print("[swarm_map] running swarm", file=sys.stderr)

        if cpus < 1:
            cpus = 1

        cmd = " ".join(["swarm -f -t", str(cpus), "-o", swarm_file, fasta_file])

        if verbose:
            print(cmd, file=sys.stderr)
        else:
            # "&>" is bash-only.  os.system uses /bin/sh, and a POSIX shell
            # like dash treats "cmd &>/dev/null" as "cmd &" followed by a
            # bare redirection, so swarm would run in the background and the
            # exit status would always be 0.  Use the portable spelling.
            cmd += " >/dev/null 2>&1"

        rc = os.system(cmd)
        if rc != 0:
            print("[swarm_map] ERROR: swarm", file=sys.stderr)
            sys.exit(2)

    in_handle1 = happyfile.hopen_or_else(fasta_file)
    while True:
        line = in_handle1.readline()
        if not line:
            break
        line = line.rstrip()

        if line.startswith(">"):
            seq_id = line[1:]
            # set any IDs not returned by swarm, to their own cluster
            dict_id_swarm[seq_id] = seq_id
    in_handle1.close()

    in_handle2 = happyfile.hopen_or_else(swarm_file)
    if verbose:
        print("Reading swarm file: " + swarm_file, file=sys.stderr)

    while True:
        line = in_handle2.readline()
        if not line:
            break
        line = line.rstrip()

        id_list = re.split(r'\s', line)
        for seq_id in id_list:
            # The first ID on the line names the swarm.
            dict_id_swarm[seq_id] = id_list[0]
    in_handle2.close()
コード例 #4
0
ファイル: swarm_map.py プロジェクト: allenlab/rRNA_pipeline
def read_swarm_fasta(fasta_file):
    """Read a FASTA file into ``dict_swarm_seq`` (header text -> sequence).

    The key is everything after ``>`` on the header line.  Sequence lines are
    concatenated with all whitespace removed; records whose sequence is empty
    are not stored.
    """
    fasta_in = happyfile.hopen_or_else(fasta_file)

    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    header = ""
    chunks = []
    while True:
        raw = fasta_in.readline()
        if not raw:
            break
        text = raw.rstrip()

        if not text.startswith(">"):
            chunks.append(re.sub(r'\s', '', text))
            continue

        # A new header ends the previous record: store it if it has residues.
        residues = "".join(chunks)
        if residues:
            dict_swarm_seq[header] = residues
        header = text[1:]
        chunks = []

    # Store the final record, which has no following header to trigger it.
    residues = "".join(chunks)
    if residues:
        dict_swarm_seq[header] = residues
    fasta_in.close()
コード例 #5
0
def filter_fastq(fastq_file, output_file, min_quality, min_seq_len, max_seq_len):
    """Stream a FASTQ file and hand each record to ``filter_line``.

    Records are read as groups of four lines: line 1 supplies the ID (text
    after the first character up to the first whitespace), line 2 the
    sequence and line 4 the quality string.  ``filter_line`` receives the
    output handle, the record and the three thresholds.

    Args:
        fastq_file: input FASTQ path.
        output_file: output path; falsy to write to ``sys.stdout``.
        min_quality, min_seq_len, max_seq_len: passed through unchanged to
            ``filter_line``.
    """
    in_handle = happyfile.hopen_or_else(fastq_file)

    if verbose:
        print("Reading FASTQ file: " + fastq_file, file=sys.stderr)

    out_handle = sys.stdout
    if output_file:
        out_handle = happyfile.hopen_write_or_else(output_file)

    # Only announce a file name when there is one; concatenating a None
    # output_file would raise TypeError in verbose mode.
    if verbose and output_file:
        print("Writing FASTA file: " + output_file, file=sys.stderr)

    rnum = 1
    seq_id = ""
    seq = ""
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 1:
                seq_id = re.split(r'\s', line[1:])[0]
            elif rnum == 2:
                seq = line
            elif rnum == 4:
                filter_line(out_handle, seq_id, seq, line, min_quality, min_seq_len, max_seq_len)
            # Cycle 1 -> 2 -> 3 -> 4 -> 1 through the four lines of a record.
            rnum += 1
            if rnum > 4:
                rnum = 1
    finally:
        # Release both handles (previously leaked); never close stdout.
        in_handle.close()
        if out_handle is not sys.stdout:
            out_handle.close()
コード例 #6
0
def read_sample_names(sample_names_file):
    """Load a two-column (name <tab> file) table into ``dict_sample_name``.

    Each file is registered under its own name and under its
    ``.filtered.fa`` counterpart: the suffix is stripped when present and
    appended when absent.  A sample name seen twice is reported and the
    process exits with status 2.  Nothing happens when ``sample_names_file``
    is falsy.
    """
    global dict_sample_name

    if not sample_names_file:
        return

    names_in = happyfile.hopen_or_else(sample_names_file)

    if verbose:
        xprint("Reading sample names file: " + sample_names_file)

    while True:
        raw = names_in.readline()
        if not raw:
            break

        name, sample_file = raw.rstrip().split("\t")
        if name in dict_all_sample_names:
            xprint("Duplicate sample name found: " + name)
            sys.exit(2)

        dict_sample_name[sample_file] = name
        dict_all_sample_names[name] = 1

        # Register the alternate spelling with/without ".filtered.fa".
        stripped = re.search(r'^(.+)\.filtered\.fa$', sample_file)
        alias = stripped.group(1) if stripped else sample_file + ".filtered.fa"
        dict_sample_name[alias] = name

    names_in.close()
コード例 #7
0
def read_sample_names(sample_names_file):
    """Load a two-column (name <tab> file) table into ``dict_sample_name``.

    Each file is registered under its own name and under its
    ``.filtered.fa`` counterpart (suffix stripped when present, appended when
    absent).  A repeated sample name is reported on stderr and the process
    exits with status 2.  Nothing happens when ``sample_names_file`` is falsy.
    """
    global dict_sample_name

    if sample_names_file:
        in_handle = happyfile.hopen_or_else(sample_names_file)

        if verbose:
            # sys.stderr.write behaves the same on Python 2 and 3, whereas
            # "print >> sys.stderr" raises TypeError on Python 3.
            sys.stderr.write("Reading sample names file: " + sample_names_file + "\n")

        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()

            # "sample_file" avoids shadowing the Python 2 builtin "file".
            name, sample_file = line.split("\t")
            if name in dict_all_sample_names:
                sys.stderr.write("Duplicate sample name found: " + name + "\n")
                sys.exit(2)

            dict_sample_name[sample_file] = name
            dict_all_sample_names[name] = 1

            m = re.search(r'^(.+)\.filtered\.fa$', sample_file)
            if m:
                dict_sample_name[m.group(1)] = name
            else:
                dict_sample_name[sample_file + ".filtered.fa"] = name

        in_handle.close()
コード例 #8
0
def filter_fastq(fastq_file, output_file, min_quality, min_seq_len,
                 max_seq_len):
    """Stream a FASTQ file and pass each record to ``filter_line``.

    The input is consumed four lines at a time: line 1 gives the ID (text
    after the first character, cut at the first whitespace), line 2 the
    sequence, line 4 the quality string.  ``filter_line`` is called with the
    output handle, those three values and the thresholds.

    Args:
        fastq_file: input FASTQ path.
        output_file: output path; falsy to write to ``sys.stdout``.
        min_quality, min_seq_len, max_seq_len: forwarded unchanged to
            ``filter_line``.
    """
    in_handle = happyfile.hopen_or_else(fastq_file)

    if verbose:
        print("Reading FASTQ file: " + fastq_file, file=sys.stderr)

    out_handle = sys.stdout
    if output_file:
        out_handle = happyfile.hopen_write_or_else(output_file)

    # Skip the message when writing to stdout: there is no file name, and
    # concatenating None would raise TypeError.
    if verbose and output_file:
        print("Writing FASTA file: " + output_file, file=sys.stderr)

    rnum = 1
    seq_id = ""
    seq = ""
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 1:
                seq_id = re.split(r'\s', line[1:])[0]
            elif rnum == 2:
                seq = line
            elif rnum == 4:
                filter_line(out_handle, seq_id, seq, line, min_quality,
                            min_seq_len, max_seq_len)
            # Position within the current four-line record.
            rnum += 1
            if rnum > 4:
                rnum = 1
    finally:
        # Both handles used to be leaked; stdout must stay open.
        in_handle.close()
        if out_handle is not sys.stdout:
            out_handle.close()
コード例 #9
0
ファイル: purity_plot.py プロジェクト: allenlab/rRNA_pipeline
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose ID is in ``dict_derep_ids``.

    A record's ID is the header text after ``>`` up to the first whitespace.
    Header and sequence lines of selected records are written with trailing
    whitespace stripped.

    Returns:
        The number of records written.
    """
    kept = 0
    source = happyfile.hopen_or_else(fasta_file)

    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    target = happyfile.hopen_write_or_else(swarm_content_fasta_file)

    if verbose:
        print("Writing swarm content FASTA file: " + swarm_content_fasta_file, file=sys.stderr)

    copying = False
    while True:
        raw = source.readline()
        if not raw:
            break
        text = raw.rstrip()

        if text.startswith(">"):
            # Each header decides whether the lines that follow are copied.
            record_id = re.split(r'\s', text[1:])[0]
            copying = record_id in dict_derep_ids
            if copying:
                kept += 1

        if copying:
            print(text, file=target)

    source.close()
    target.close()

    return kept
コード例 #10
0
def read_sample_names(sample_names_file):
    """Load a two-column (name <tab> file) table into ``dict_sample_name``.

    Each file is registered under its own name and under its
    ``.filtered.fa`` counterpart (suffix stripped when present, appended when
    absent).  Nothing happens when ``sample_names_file`` is falsy.
    """
    global dict_sample_name

    if sample_names_file:
        in_handle = happyfile.hopen_or_else(sample_names_file)

        if verbose:
            # Portable replacement for the Python 2-only "print >>sys.stderr",
            # which raises TypeError on Python 3.
            sys.stderr.write("Reading sample names file: " + sample_names_file + "\n")

        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()

            # "sample_file" avoids shadowing the Python 2 builtin "file".
            name, sample_file = line.split("\t")
            dict_sample_name[sample_file] = name

            m = re.search(r'^(.+)\.filtered\.fa$', sample_file)
            if m:
                dict_sample_name[m.group(1)] = name
            else:
                dict_sample_name[sample_file + ".filtered.fa"] = name

        in_handle.close()
コード例 #11
0
def read_counts(counts_file):
    """Read a tab-separated counts table into the swarm count dictionaries.

    The first line is a header; its columns after the first become
    ``sample_list``.  For every later row, column 1 is the key and each
    further column ``i`` is an integer stored in
    ``dict_swarm_sample_counts[key, i - 1]`` and added to
    ``dict_swarm_counts[key]``.  Nothing happens when ``counts_file`` is
    falsy.
    """
    global dict_swarm_counts
    global dict_swarm_sample_counts
    global sample_list

    if counts_file:
        in_handle = happyfile.hopen_or_else(counts_file)

        if verbose:
            # Portable replacement for the Python 2-only "print >>sys.stderr",
            # which raises TypeError on Python 3.
            sys.stderr.write("Reading counts file: " + counts_file + "\n")

        firstline = True
        while True:
            line = in_handle.readline()
            if not line:
                break
            cols = line.rstrip().split("\t")

            if firstline:
                sample_list = cols[1:]
                firstline = False
                continue

            key = cols[0]
            for i in range(1, len(cols)):
                count = int(cols[i])
                dict_swarm_sample_counts[key, i - 1] = count
                dict_swarm_counts[key] = dict_swarm_counts.get(key, 0) + count

        in_handle.close()
コード例 #12
0
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose ID is in ``dict_derep_ids``.

    A record's ID is the header text after ``>`` up to the first whitespace.
    Selected header and sequence lines are written with trailing whitespace
    stripped and a newline appended.

    Returns:
        The number of records written.
    """
    swarm_content_size = 0
    in_handle = happyfile.hopen_or_else(fasta_file)

    if verbose:
        # write() replaces the Python 2-only "print >>handle" syntax, which
        # raises TypeError on Python 3; the emitted text is identical.
        sys.stderr.write("Reading FASTA file: " + fasta_file + "\n")

    out_handle = happyfile.hopen_write_or_else(swarm_content_fasta_file)

    if verbose:
        sys.stderr.write("Writing swarm content FASTA file: " + swarm_content_fasta_file + "\n")

    write_out = False
    while True:
        line = in_handle.readline()
        if not line:
            break
        line = line.rstrip()

        if line.startswith(">"):
            # Each header decides whether the following lines are copied.
            seq_id = re.split(r'\s', line[1:])[0]
            write_out = seq_id in dict_derep_ids
            if write_out:
                swarm_content_size += 1

        if write_out:
            out_handle.write(line + "\n")

    in_handle.close()
    out_handle.close()

    return swarm_content_size
コード例 #13
0
ファイル: swarm_map.py プロジェクト: allenlab/rRNA_pipeline
def read_swarm_fasta(fasta_file):
    """Populate ``dict_swarm_seq`` with header -> sequence from a FASTA file.

    The header key is the full text following ``>``.  Whitespace is removed
    from sequence lines before they are joined; a record with no sequence
    text is skipped.
    """
    handle = happyfile.hopen_or_else(fasta_file)

    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    current_id = ""
    pieces = []
    while True:
        raw_line = handle.readline()
        if not raw_line:
            break
        stripped = raw_line.rstrip()

        if stripped.startswith(">"):
            # Close out the record collected so far before starting a new one.
            joined = "".join(pieces)
            if joined:
                dict_swarm_seq[current_id] = joined
            current_id = stripped[1:]
            pieces = []
        else:
            pieces.append(re.sub(r'\s', '', stripped))

    # The last record is not followed by a header, so store it here.
    joined = "".join(pieces)
    if joined:
        dict_swarm_seq[current_id] = joined
    handle.close()
コード例 #14
0
ファイル: swarm_map.py プロジェクト: allenlab/rRNA_pipeline
def read_sample_names(sample_names_file):
    """Fill ``dict_sample_name`` from a two-column (name <tab> file) table.

    Every file maps to its sample name twice: under the name as given and
    under the variant with ``.filtered.fa`` stripped (when present) or
    appended (when absent).  A falsy ``sample_names_file`` is a no-op.
    """
    global dict_sample_name

    if not sample_names_file:
        return

    table = happyfile.hopen_or_else(sample_names_file)

    if verbose:
        print("Reading sample names file: " + sample_names_file,
              file=sys.stderr)

    while True:
        raw = table.readline()
        if not raw:
            break

        name, sample_file = raw.rstrip().split("\t")
        dict_sample_name[sample_file] = name

        # Also register the spelling with/without the ".filtered.fa" suffix.
        hit = re.search(r'^(.+)\.filtered\.fa$', sample_file)
        if hit is None:
            dict_sample_name[sample_file + ".filtered.fa"] = name
        else:
            dict_sample_name[hit.group(1)] = name

    table.close()
コード例 #15
0
ファイル: swarm_map.py プロジェクト: allenlab/rRNA_pipeline
def read_counts(counts_file):
    """Read a tab-separated counts table, then call ``calc_swarm_counts``.

    The header row's columns after the first become ``sample_list``.  For
    each data row, column 1 is the ID; every further column ``i`` is parsed
    as an integer, stored in ``dict_id_sample_counts[id, i - 1]`` and added
    to ``dict_id_counts[id]``.  A falsy ``counts_file`` is a no-op.
    """
    global dict_id_counts
    global dict_id_sample_counts
    global sample_list

    if not counts_file:
        return

    table = happyfile.hopen_or_else(counts_file)

    if verbose:
        print("Reading counts file: " + counts_file, file=sys.stderr)

    header_pending = True
    while True:
        raw = table.readline()
        if not raw:
            break
        fields = raw.rstrip().split("\t")

        if header_pending:
            sample_list = fields[1:]
            header_pending = False
            continue

        row_id = fields[0]
        for sample_idx, value in enumerate(fields[1:]):
            count = int(value)
            dict_id_sample_counts[row_id, sample_idx] = count
            dict_id_counts[row_id] = dict_id_counts.get(row_id, 0) + count

    table.close()

    calc_swarm_counts()
コード例 #16
0
def read_taxa_counts(swarm_tax_file):
    """Read per-taxon counts and roll them up into taxonomy groups.

    File layout (tab-separated): a header row whose columns from the fourth
    onward become ``sample_list``; in data rows column 3 is the taxonomy
    string and columns 4+ are integer counts per sample.  Counts are summed
    per taxonomy string into ``dict_taxa_sample_counts`` and
    ``dict_taxa_counts``.

    Each taxonomy string is then assigned to the group in
    ``dict_taxonomy_group`` whose key is its longest matching prefix
    ("Unclassified" when none matches), and the counts are accumulated into
    ``dict_group_sample_counts`` and ``dict_group_counts``.
    """
    global dict_taxa_counts
    global dict_taxa_sample_counts
    global dict_group_counts
    global dict_group_sample_counts
    global sample_list

    in_handle = happyfile.hopen_or_else(swarm_tax_file)

    if verbose:
        # Portable replacement for the Python 2-only "print >> sys.stderr",
        # which raises TypeError on Python 3.
        sys.stderr.write("Reading taxa counts file: " + swarm_tax_file + "\n")

    firstline = True
    while True:
        line = in_handle.readline()
        if not line:
            break
        cols = line.rstrip().split("\t")

        if firstline:
            sample_list = cols[3:]
            firstline = False
            continue

        taxstr = cols[2]
        for i in range(3, len(cols)):
            count = int(cols[i])
            sample_key = (taxstr, i - 3)
            dict_taxa_sample_counts[sample_key] = dict_taxa_sample_counts.get(sample_key, 0) + count
            dict_taxa_counts[taxstr] = dict_taxa_counts.get(taxstr, 0) + count

    in_handle.close()

    for id_tax in dict_taxa_counts:
        # Longest group prefix wins.
        best_grp_tax = ""
        for grp_tax in dict_taxonomy_group:
            if id_tax.startswith(grp_tax) and len(grp_tax) > len(best_grp_tax):
                best_grp_tax = grp_tax

        best_grp_name = dict_taxonomy_group.get(best_grp_tax, "Unclassified")

        for i in range(len(sample_list)):
            group_key = (best_grp_name, i)
            dict_group_sample_counts[group_key] = (
                dict_group_sample_counts.get(group_key, 0)
                + dict_taxa_sample_counts.get((id_tax, i), 0))

        # Add the taxon total exactly once.  It used to sit inside the
        # per-sample loop, which multiplied every group total by the number
        # of samples.
        dict_group_counts[best_grp_name] = dict_group_counts.get(
            best_grp_name, 0) + dict_taxa_counts.get(id_tax, 0)
コード例 #17
0
def read_index_file(index_file):
    """Mark the first tab-separated column of every line in ``dict_is_index``.

    Each value found is set to ``True`` in ``dict_is_index`` and reported via
    ``xprint_err``.
    """
    global dict_is_index

    in_handle = happyfile.hopen_or_else(index_file)
    xprint_err("reading file: " + index_file)

    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            index_seq = line.rstrip().split('\t')[0]
            dict_is_index[index_seq] = True
            xprint_err("found: " + index_seq)
    finally:
        # The handle was previously never closed.
        in_handle.close()
コード例 #18
0
ファイル: purity_plot.py プロジェクト: allenlab/rRNA_pipeline
def read_swarms(swarm_file):
    """Map every ID in a swarm file to the first ID on its line.

    Lines are split on single whitespace characters and the results are
    stored in ``dict_id_swarm``.
    """
    swarm_in = happyfile.hopen_or_else(swarm_file)
    if verbose:
        print("Reading swarm file: " + swarm_file, file=sys.stderr)

    while True:
        raw = swarm_in.readline()
        if not raw:
            break

        members = re.split(r'\s', raw.rstrip())
        seed = members[0]  # the first ID names the swarm
        for member in members:
            dict_id_swarm[member] = seed
    swarm_in.close()
コード例 #19
0
def read_swarms(swarm_file):
    """Map every ID in a swarm file to the first ID on its line.

    Lines are split on single whitespace characters and the results are
    stored in ``dict_id_swarm``.
    """
    in_handle = happyfile.hopen_or_else(swarm_file)
    if verbose:
        # Portable replacement for the Python 2-only "print >>sys.stderr",
        # which raises TypeError on Python 3.
        sys.stderr.write("Reading swarm file: " + swarm_file + "\n")

    while True:
        line = in_handle.readline()
        if not line:
            break
        line = line.rstrip()

        id_list = re.split(r'\s', line)
        for seq_id in id_list:
            # The first ID on the line names the swarm.
            dict_id_swarm[seq_id] = id_list[0]
    in_handle.close()
コード例 #20
0
def read_swarm_counts(swarm_counts_file, min_swarm_count, top_swarms):
    """Select dereplicated IDs belonging to the largest swarms.

    The counts file is tab-separated with a header row that is skipped; in
    each data row column 1 is the swarm ID and the remaining columns are
    integers summed into a per-swarm total.  The ``top_swarms`` swarms with
    the highest totals are kept, and every ID in ``dict_id_swarm`` whose
    swarm is among them and totals at least ``min_swarm_count`` is flagged
    in ``dict_derep_ids``.
    """
    totals = {}
    global dict_derep_ids

    counts_in = happyfile.hopen_or_else(swarm_counts_file)

    if verbose:
        print("Reading swarm counts file: " + swarm_counts_file,
              file=sys.stderr)

    header_pending = True
    while True:
        raw = counts_in.readline()
        if not raw:
            break
        text = raw.rstrip()

        if header_pending:
            header_pending = False
            continue

        fields = text.split("\t")
        swarm = fields[0]
        for value in fields[1:]:
            totals[swarm] = totals.get(swarm, 0) + int(value)

    counts_in.close()

    # Walk swarms from largest to smallest and stop once enough are chosen.
    chosen = {}
    for swarm_id in sorted(totals, key=totals.get, reverse=True):
        if len(chosen) >= top_swarms:
            break
        chosen[swarm_id] = 1

    for derep_id, swarm_id in dict_id_swarm.items():
        if swarm_id not in chosen:
            continue
        if totals.get(swarm_id, 0) >= min_swarm_count:
            dict_derep_ids[derep_id] = 1

    if verbose:
        print("Top purity content, swarms: " + str(len(chosen)) +
              " derep ids: " + str(len(dict_derep_ids)),
              file=sys.stderr)
コード例 #21
0
def read_chimeras(chimera_file):
    """Record IDs flagged as chimeric in ``dict_chimera_ids``.

    The file is tab-separated; a row whose 17th column is ``'Y'`` marks the
    ID in its 2nd column.
    NOTE(review): the column layout looks like a chimera-detection report
    table - confirm against the tool that produces this file.
    """
    global dict_chimera_ids

    in_handle = happyfile.hopen_or_else(chimera_file)

    if verbose:
        print("Reading chimera file: " + chimera_file, file=sys.stderr)

    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # A blank line has no columns; indexing it would raise.
                continue

            cols = line.split('\t')
            if cols[16] == 'Y':
                dict_chimera_ids[cols[1]] = 1
    finally:
        # The handle was previously never closed.
        in_handle.close()
コード例 #22
0
def read_chimeras(chimera_file):
    """Collect chimeric sequence IDs into ``dict_chimera_ids``.

    Rows are tab-separated; when the 17th column equals ``'Y'`` the value of
    the 2nd column is stored as a key with value 1.
    NOTE(review): the column layout looks like a chimera-detection report
    table - confirm against the tool that produces this file.
    """
    global dict_chimera_ids

    in_handle = happyfile.hopen_or_else(chimera_file)

    if verbose:
        print("Reading chimera file: " + chimera_file, file=sys.stderr)

    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Skip blank lines instead of failing on cols[16].
                continue

            cols = line.split('\t')
            if cols[16] == 'Y':
                dict_chimera_ids[cols[1]] = 1
    finally:
        # Close the input handle, which used to be leaked.
        in_handle.close()
コード例 #23
0
ファイル: group_taxa.py プロジェクト: mccrowjp/rRNA_pipeline
def read_taxa_counts(swarm_tax_file):
    """Read per-taxon counts and roll them up into taxonomy groups.

    File layout (tab-separated): a header row whose columns from the fourth
    onward become ``sample_list``; in data rows column 3 is the taxonomy
    string and columns 4+ are integer counts per sample.  Counts are summed
    per taxonomy string into ``dict_taxa_sample_counts`` and
    ``dict_taxa_counts``.

    Each taxonomy string is then assigned to the group in
    ``dict_taxonomy_group`` whose key is its longest matching prefix
    ("Unclassified" when none matches), and the counts are accumulated into
    ``dict_group_sample_counts`` and ``dict_group_counts``.
    """
    global dict_taxa_counts
    global dict_taxa_sample_counts
    global dict_group_counts
    global dict_group_sample_counts
    global sample_list

    in_handle = happyfile.hopen_or_else(swarm_tax_file)

    if verbose:
        xprint("Reading taxa counts file: " + swarm_tax_file)

    firstline = True
    while True:
        line = in_handle.readline()
        if not line:
            break
        cols = line.rstrip().split("\t")

        if firstline:
            sample_list = cols[3:]
            firstline = False
            continue

        taxstr = cols[2]
        for i in range(3, len(cols)):
            count = int(cols[i])
            sample_key = (taxstr, i - 3)
            dict_taxa_sample_counts[sample_key] = dict_taxa_sample_counts.get(sample_key, 0) + count
            dict_taxa_counts[taxstr] = dict_taxa_counts.get(taxstr, 0) + count

    in_handle.close()

    for id_tax in dict_taxa_counts:
        # Longest group prefix wins.
        best_grp_tax = ""
        for grp_tax in dict_taxonomy_group:
            if id_tax.startswith(grp_tax) and len(grp_tax) > len(best_grp_tax):
                best_grp_tax = grp_tax

        best_grp_name = dict_taxonomy_group.get(best_grp_tax, "Unclassified")

        for i in range(len(sample_list)):
            group_key = (best_grp_name, i)
            dict_group_sample_counts[group_key] = (
                dict_group_sample_counts.get(group_key, 0)
                + dict_taxa_sample_counts.get((id_tax, i), 0))

        # Add the taxon total exactly once.  It used to sit inside the
        # per-sample loop, which multiplied every group total by the number
        # of samples.
        dict_group_counts[best_grp_name] = dict_group_counts.get(best_grp_name, 0) + dict_taxa_counts.get(id_tax, 0)
コード例 #24
0
ファイル: group_taxa.py プロジェクト: mccrowjp/rRNA_pipeline
def read_groups(taxa_group_file):
    """Load a (group name <tab> taxonomy string) table.

    Blank lines are ignored.  Rows where both fields are non-empty are stored
    in ``dict_taxonomy_group`` keyed by taxonomy string.
    """
    global dict_taxonomy_group

    groups_in = happyfile.hopen_or_else(taxa_group_file)

    if verbose:
        xprint("Reading taxa groups file: " + taxa_group_file)

    while True:
        raw = groups_in.readline()
        if not raw:
            break
        text = raw.rstrip()
        if not text:
            continue

        # Only the first two columns are used; extra columns are ignored.
        group_name, taxstr = text.split("\t")[:2]
        if group_name and taxstr:
            dict_taxonomy_group[taxstr] = group_name

    groups_in.close()
コード例 #25
0
def read_groups(taxa_group_file):
    """Load a (group name <tab> taxonomy string) table.

    Blank lines are ignored.  Rows where both fields are non-empty are stored
    in ``dict_taxonomy_group`` keyed by taxonomy string.
    """
    global dict_taxonomy_group

    in_handle = happyfile.hopen_or_else(taxa_group_file)

    if verbose:
        # Portable replacement for the Python 2-only "print >> sys.stderr",
        # which raises TypeError on Python 3.
        sys.stderr.write("Reading taxa groups file: " + taxa_group_file + "\n")

    while True:
        line = in_handle.readline()
        if not line:
            break
        line = line.rstrip()

        if line:
            # Only the first two columns are used; extra columns are ignored.
            group_name, taxstr = line.split("\t")[:2]
            if taxstr and group_name:
                dict_taxonomy_group[taxstr] = group_name

    in_handle.close()
コード例 #26
0
def derep_fasta(fasta_files, min_fasta):
    """Feed every record of each FASTA file to ``derep_line``.

    Files are numbered in the order they are accepted.  ``derep_line`` is
    called with the pending (header, sequence, file number) each time a new
    header is seen and once more at end of file.  A file with fewer than
    ``min_fasta`` header lines is excluded: its per-file counts are
    subtracted from ``dict_id_counts`` and zeroed, and its number is reused
    for the next file.  Accepted files are appended to ``good_fasta_files``.
    """
    global good_fasta_files
    file_index = 0

    for path in fasta_files:
        header_count = 0
        handle = happyfile.hopen_or_else(path)

        if verbose:
            print("Reading FASTA file: " + path, file=sys.stderr)

        header = ""
        chunks = []
        while True:
            raw = handle.readline()
            if not raw:
                break
            text = raw.rstrip()

            if not text.startswith(">"):
                chunks.append(re.sub(r'\s', '', text))
                continue

            header_count += 1
            # Flush the record collected before this header.
            derep_line(header, "".join(chunks), file_index)
            header = text[1:]
            chunks = []
        # Flush the last record of the file.
        derep_line(header, "".join(chunks), file_index)
        handle.close()

        if header_count >= min_fasta:
            good_fasta_files.append(path)
            file_index += 1
            continue

        # Remove counts for this file if below minimum
        print("[fasta_dereplicate] Excluding: " + path,
              file=sys.stderr)
        for key in dict_id_counts:
            slot = (key, file_index)
            dict_id_counts[key] -= dict_id_file_counts.get(slot, 0)
            dict_id_file_counts[slot] = 0
コード例 #27
0
ファイル: purity_plot.py プロジェクト: allenlab/rRNA_pipeline
def read_swarm_counts(swarm_counts_file, min_swarm_count, top_swarms):
    """Flag dereplicated IDs that belong to the most abundant swarms.

    After skipping the header row of the tab-separated counts file, the
    integer columns of each row are summed per swarm ID (column 1).  The
    ``top_swarms`` highest-total swarms are selected; IDs in
    ``dict_id_swarm`` whose swarm is selected and whose total is at least
    ``min_swarm_count`` are set in ``dict_derep_ids``.
    """
    swarm_totals = {}
    global dict_derep_ids

    handle = happyfile.hopen_or_else(swarm_counts_file)

    if verbose:
        print("Reading swarm counts file: " + swarm_counts_file, file=sys.stderr)

    seen_header = False
    while True:
        raw = handle.readline()
        if not raw:
            break
        text = raw.rstrip()

        if not seen_header:
            seen_header = True
            continue

        fields = text.split("\t")
        key = fields[0]
        for value in fields[1:]:
            swarm_totals[key] = swarm_totals.get(key, 0) + int(value)

    handle.close()

    # Take swarms in descending order of total until the quota is filled.
    top = {}
    ranked = sorted(swarm_totals, key=swarm_totals.get, reverse=True)
    for swarm_id in ranked:
        if len(top) >= top_swarms:
            break
        top[swarm_id] = 1

    for derep_id, swarm_id in dict_id_swarm.items():
        if swarm_id in top and swarm_totals.get(swarm_id, 0) >= min_swarm_count:
            dict_derep_ids[derep_id] = 1

    if verbose:
        summary = "Top purity content, swarms: " + str(len(top)) + " derep ids: " + str(len(dict_derep_ids))
        print(summary, file=sys.stderr)
コード例 #28
0
def derep_fasta(fasta_files, min_fasta):
    """Pass every record of each FASTA file to ``derep_line``.

    ``derep_line`` receives the pending (header, sequence, file number) at
    each new header and again at end of file.  Files with fewer than
    ``min_fasta`` headers are excluded: their per-file counts are subtracted
    from ``dict_id_counts`` and reset to 0, and the file number is reused.
    Other files are appended to ``good_fasta_files`` and advance the file
    number.
    """
    global good_fasta_files
    file_index = 0

    for path in fasta_files:
        header_count = 0
        handle = happyfile.hopen_or_else(path)

        if verbose:
            xprint("Reading FASTA file: " + path)

        header = ""
        chunks = []
        while True:
            raw = handle.readline()
            if not raw:
                break
            text = raw.rstrip()

            if not text.startswith(">"):
                chunks.append(re.sub(r'\s', '', text))
                continue

            header_count += 1
            # Flush the record collected before this header.
            derep_line(header, "".join(chunks), file_index)
            header = text[1:]
            chunks = []
        # Flush the last record of the file.
        derep_line(header, "".join(chunks), file_index)
        handle.close()

        if header_count >= min_fasta:
            good_fasta_files.append(path)
            file_index += 1
            continue

        # Remove counts for this file if below minimum
        xprint("[fasta_dereplicate] Excluding: " + path)
        for key in dict_id_counts:
            slot = (key, file_index)
            dict_id_counts[key] -= dict_id_file_counts.get(slot, 0)
            dict_id_file_counts[slot] = 0
コード例 #29
0
def read_fastq(fastq_file):
    """Tally characters 5-12 of every sequence line in a FASTQ file.

    The file is read in groups of four lines.  From the second line of each
    group the slice ``[4:12]`` is counted in ``dict_index_count``;
    ``count_total`` is incremented for every record and ``count_index`` for
    records whose slice is marked in ``dict_is_index``.
    """
    global dict_index_count
    global count_total
    global count_index

    in_handle = happyfile.hopen_or_else(fastq_file)
    xprint_err("reading file: " + fastq_file)

    rnum = 1
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 2:
                idx = line[4:12]
                dict_index_count[idx] = dict_index_count.get(idx, 0) + 1
                count_total += 1
                if dict_is_index.get(idx, False):
                    count_index += 1
            # Position within the current four-line record.
            rnum += 1
            if rnum > 4:
                rnum = 1
    finally:
        # The handle was previously never closed.
        in_handle.close()
コード例 #30
0
def get_taxonomy(swarm_content_fasta_file, swarm_content_ggsearch_file, database_file, cpus):
    """Load the best search hit per query and the taxonomy text per database ID.

    If ``swarm_content_fasta_file`` is given and
    ``swarm_content_ggsearch_file`` does not exist yet, ``glsearch36`` is run
    through the shell to create it; the process exits with status 2 when the
    command fails.

    The tab-separated search file is then read: for each query ID (column 1)
    the subject ID (column 2) with the numerically highest value in column 12
    is stored in ``dict_id_best_hit`` / ``dict_id_best_bs``.  Finally every
    ``>`` header of ``database_file`` is parsed into ``dict_id_taxonomy``
    (``|`` becomes ``;`` and ``+`` becomes a space).

    Args:
        swarm_content_fasta_file: query FASTA path; falsy to skip the search.
        swarm_content_ggsearch_file: path of the tabular search output.
        database_file: FASTA database whose headers carry the taxonomy text.
        cpus: thread count passed to ``-T``; values below 1 are raised to 1.
    """
    global dict_id_best_hit
    global dict_id_best_bs
    global dict_id_taxonomy

    if swarm_content_fasta_file:
        if os.path.exists(swarm_content_ggsearch_file):
            if verbose:
                sys.stderr.write("Ignoring content FASTA file: " + swarm_content_fasta_file + "\n")
        else:
            # sys.stderr.write works on Python 2 and 3; the former
            # "print >>sys.stderr" form raises TypeError on Python 3.
            sys.stderr.write("[purity_plot] running ggsearch\n")

            if cpus < 1:
                cpus = 1

            cmd = " ".join(["glsearch36 -b 1 -m 8 -T", str(cpus), swarm_content_fasta_file, database_file, ">", swarm_content_ggsearch_file])

            if verbose:
                sys.stderr.write(cmd + "\n")
            else:
                cmd += " 2>/dev/null"

            rc = os.system(cmd)
            if rc != 0:
                sys.stderr.write("[purity_plot] ERROR: ggsearch\n")
                sys.exit(2)

    in_handle1 = happyfile.hopen_or_else(swarm_content_ggsearch_file)
    if verbose:
        sys.stderr.write("Reading ggsearch file: " + swarm_content_ggsearch_file + "\n")

    try:
        while True:
            line = in_handle1.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # A blank line carries no hit; unpacking it would raise.
                continue

            qid, sid, pid, alen, mm, go, qs, qe, ss, se, e, bs = line.split("\t")[:12]
            # Compare scores numerically: as strings "99.5" > "100.2" is True,
            # which would keep the weaker hit.  The stored value stays the
            # original text so readers of dict_id_best_bs see no change.
            if (qid not in dict_id_best_bs) or float(bs) > float(dict_id_best_bs[qid]):
                dict_id_best_hit[qid] = sid
                dict_id_best_bs[qid] = bs
    finally:
        in_handle1.close()

    in_handle2 = happyfile.hopen_or_else(database_file)
    if verbose:
        sys.stderr.write("Reading database file: " + database_file + "\n")

    try:
        while True:
            line = in_handle2.readline()
            if not line:
                break
            line = line.rstrip()

            if line.startswith(">"):
                # Header layout: ">ID <whitespace> taxonomy text".
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    taxstr = m.group(2).replace('|', ';').replace('+', ' ')
                    dict_id_taxonomy[m.group(1)] = taxstr
    finally:
        in_handle2.close()
コード例 #31
0
def _stripped_lines(handle):
    """Yield every line read from *handle* with trailing whitespace removed.

    Only ``readline()`` is used, so any handle that worked with the original
    ``while 1: readline()`` loops works here as well.
    """
    while True:
        raw = handle.readline()
        if not raw:
            break
        yield raw.rstrip()


def _close_handles(*handles):
    """Close each of the given file handles in order."""
    for handle in handles:
        handle.close()


def _split_taxonomy(tax_path, keep_path, plastid_ids):
    """Copy non-chloroplast taxonomy rows to *keep_path*; record chloroplast IDs.

    The first line is treated as a header and always copied.  For every other
    row whose third column starts with the chloroplast lineage, the first
    column is added to *plastid_ids* and the row is dropped.
    """
    chloroplast_prefix = 'Bacteria;Cyanobacteria;Chloroplast'
    in_handle = happyfile.hopen_or_else(tax_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    try:
        for row_num, line in enumerate(_stripped_lines(in_handle)):
            cols = line.split('\t')
            # rstrip() drops trailing empty columns, so a row without a
            # taxonomy string can have fewer than three fields; such a row
            # is kept as non-plastid instead of raising IndexError.
            is_plastid = (row_num > 0 and len(cols) > 2
                          and cols[2].startswith(chloroplast_prefix))
            if is_plastid:
                plastid_ids.add(cols[0])
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle)


def _split_swarm_table(table_path, keep_path, plastid_path, plastid_ids):
    """Route swarm rows by their first ID and expand *plastid_ids*.

    A row whose first ID is already in *plastid_ids* goes to *plastid_path*
    and all IDs on that row are added to *plastid_ids*; every other row goes
    to *keep_path*.
    """
    in_handle = happyfile.hopen_or_else(table_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    try:
        for line in _stripped_lines(in_handle):
            # split() collapses runs of whitespace, so no empty-string ID
            # is ever recorded as plastid.
            members = line.split()
            if members and members[0] in plastid_ids:
                plastid_handle.write(line + "\n")
                plastid_ids.update(members)
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle, plastid_handle)


def _split_fasta(fasta_path, keep_path, plastid_path, plastid_ids):
    """Route FASTA records to *plastid_path* or *keep_path* by record ID.

    The record ID is the header text after ``>`` up to the first whitespace.
    Lines seen before the first non-empty ID are written nowhere.
    """
    in_handle = happyfile.hopen_or_else(fasta_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    try:
        current_id = ""
        for line in _stripped_lines(in_handle):
            if line.startswith(">"):
                current_id = re.split(r'\s', line[1:])[0]
            if not current_id:
                continue
            if current_id in plastid_ids:
                plastid_handle.write(line + "\n")
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle, plastid_handle)


def _split_counts(counts_path, keep_path, plastid_path, plastid_ids):
    """Route tab-separated count rows by first column; header goes to both."""
    in_handle = happyfile.hopen_or_else(counts_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    try:
        for row_num, line in enumerate(_stripped_lines(in_handle)):
            if row_num == 0:
                keep_handle.write(line + "\n")
                plastid_handle.write(line + "\n")
            elif line.split('\t')[0] in plastid_ids:
                plastid_handle.write(line + "\n")
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle, plastid_handle)


def remove_plastid_seqs(output_base_file):
    """Split chloroplast sequences out of the derep/swarm files of a run.

    Reads ``<base>.swarm.tax`` to find swarms classified as
    ``Bacteria;Cyanobacteria;Chloroplast``, then splits the swarm table, the
    derep FASTA/counts and the swarm FASTA/counts into ``<base>.plastid.*``
    files and ``*.tmp`` files holding everything else.  When all ``*.tmp``
    files exist they replace the originals via ``replace_file``.

    Nothing is done when ``<base>.plastid.swarm.fa`` already exists and the
    module-level ``overwrite`` flag is false.

    Args:
        output_base_file: path prefix shared by all input and output files.

    Exits the process with status 2 if any ``*.tmp`` file is missing after
    the split.
    """
    swarm_tax = output_base_file + ".swarm.tax"
    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"

    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"

    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    if not overwrite and os.path.exists(swarm_plastid_fa):
        return

    if verbose:
        sys.stderr.write("Filtering chloroplast sequences\n")

    plastid_ids = set()

    # Order matters: the taxonomy pass seeds plastid_ids with swarm IDs and
    # the swarm-table pass expands it to every member ID before the
    # FASTA/counts passes consult it.
    _split_taxonomy(swarm_tax, tmp_swarm_16S_tax, plastid_ids)
    _split_swarm_table(swarm_table, tmp_swarm_16S_table,
                       swarm_plastid_table, plastid_ids)
    _split_fasta(derep_fa, tmp_derep_16S_fa, derep_plastid_fa, plastid_ids)
    _split_counts(derep_counts, tmp_derep_16S_counts,
                  derep_plastid_counts, plastid_ids)
    _split_fasta(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa, plastid_ids)
    _split_counts(swarm_counts, tmp_swarm_16S_counts,
                  swarm_plastid_counts, plastid_ids)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if not all(os.path.exists(tmp_path) for tmp_path, _ in replacements):
        sys.stderr.write("Not all tmp_ files found\n")
        sys.exit(2)
    for tmp_path, final_path in replacements:
        replace_file(tmp_path, final_path)
コード例 #32
0
def _stripped_lines(handle):
    """Yield every line read from *handle* with trailing whitespace removed.

    Only ``readline()`` is used, so any handle that worked with the original
    ``while 1: readline()`` loops works here as well.
    """
    while True:
        raw = handle.readline()
        if not raw:
            break
        yield raw.rstrip()


def _close_handles(*handles):
    """Close each of the given file handles in order."""
    for handle in handles:
        handle.close()


def _split_taxonomy(tax_path, keep_path, plastid_ids):
    """Copy non-chloroplast taxonomy rows to *keep_path*; record chloroplast IDs.

    The first line is treated as a header and always copied.  For every other
    row whose third column starts with the chloroplast lineage, the first
    column is added to *plastid_ids* and the row is dropped.
    """
    chloroplast_prefix = 'Bacteria;Cyanobacteria;Chloroplast'
    in_handle = happyfile.hopen_or_else(tax_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    try:
        for row_num, line in enumerate(_stripped_lines(in_handle)):
            cols = line.split('\t')
            # rstrip() drops trailing empty columns, so a row without a
            # taxonomy string can have fewer than three fields; such a row
            # is kept as non-plastid instead of raising IndexError.
            is_plastid = (row_num > 0 and len(cols) > 2
                          and cols[2].startswith(chloroplast_prefix))
            if is_plastid:
                plastid_ids.add(cols[0])
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle)


def _split_swarm_table(table_path, keep_path, plastid_path, plastid_ids):
    """Route swarm rows by their first ID and expand *plastid_ids*.

    A row whose first ID is already in *plastid_ids* goes to *plastid_path*
    and all IDs on that row are added to *plastid_ids*; every other row goes
    to *keep_path*.
    """
    in_handle = happyfile.hopen_or_else(table_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    try:
        for line in _stripped_lines(in_handle):
            # split() collapses runs of whitespace, so no empty-string ID
            # is ever recorded as plastid.
            members = line.split()
            if members and members[0] in plastid_ids:
                plastid_handle.write(line + "\n")
                plastid_ids.update(members)
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle, plastid_handle)


def _split_fasta(fasta_path, keep_path, plastid_path, plastid_ids):
    """Route FASTA records to *plastid_path* or *keep_path* by record ID.

    The record ID is the header text after ``>`` up to the first whitespace.
    Lines seen before the first non-empty ID are written nowhere.
    """
    in_handle = happyfile.hopen_or_else(fasta_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    try:
        current_id = ""
        for line in _stripped_lines(in_handle):
            if line.startswith(">"):
                current_id = re.split(r'\s', line[1:])[0]
            if not current_id:
                continue
            if current_id in plastid_ids:
                plastid_handle.write(line + "\n")
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle, plastid_handle)


def _split_counts(counts_path, keep_path, plastid_path, plastid_ids):
    """Route tab-separated count rows by first column; header goes to both."""
    in_handle = happyfile.hopen_or_else(counts_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    try:
        for row_num, line in enumerate(_stripped_lines(in_handle)):
            if row_num == 0:
                keep_handle.write(line + "\n")
                plastid_handle.write(line + "\n")
            elif line.split('\t')[0] in plastid_ids:
                plastid_handle.write(line + "\n")
            else:
                keep_handle.write(line + "\n")
    finally:
        _close_handles(in_handle, keep_handle, plastid_handle)


def remove_plastid_seqs(output_base_file):
    """Split chloroplast sequences out of the derep/swarm files of a run.

    Reads ``<base>.swarm.tax`` to find swarms classified as
    ``Bacteria;Cyanobacteria;Chloroplast``, then splits the swarm table, the
    derep FASTA/counts and the swarm FASTA/counts into ``<base>.plastid.*``
    files and ``*.tmp`` files holding everything else.  When all ``*.tmp``
    files exist they replace the originals via ``replace_file``.

    Nothing is done when ``<base>.plastid.swarm.fa`` already exists and the
    module-level ``overwrite`` flag is false.

    Args:
        output_base_file: path prefix shared by all input and output files.

    Exits the process with status 2 if any ``*.tmp`` file is missing after
    the split.
    """
    swarm_tax = output_base_file + ".swarm.tax"
    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"

    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"

    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    if not overwrite and os.path.exists(swarm_plastid_fa):
        return

    if verbose:
        sys.stderr.write("Filtering chloroplast sequences\n")

    plastid_ids = set()

    # Order matters: the taxonomy pass seeds plastid_ids with swarm IDs and
    # the swarm-table pass expands it to every member ID before the
    # FASTA/counts passes consult it.
    _split_taxonomy(swarm_tax, tmp_swarm_16S_tax, plastid_ids)
    _split_swarm_table(swarm_table, tmp_swarm_16S_table,
                       swarm_plastid_table, plastid_ids)
    _split_fasta(derep_fa, tmp_derep_16S_fa, derep_plastid_fa, plastid_ids)
    _split_counts(derep_counts, tmp_derep_16S_counts,
                  derep_plastid_counts, plastid_ids)
    _split_fasta(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa, plastid_ids)
    _split_counts(swarm_counts, tmp_swarm_16S_counts,
                  swarm_plastid_counts, plastid_ids)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if not all(os.path.exists(tmp_path) for tmp_path, _ in replacements):
        sys.stderr.write("Not all tmp_ files found\n")
        sys.exit(2)
    for tmp_path, final_path in replacements:
        replace_file(tmp_path, final_path)
コード例 #33
0
ファイル: purity_plot.py プロジェクト: allenlab/rRNA_pipeline
def get_taxonomy(swarm_content_fasta_file, swarm_content_ggsearch_file, database_file, cpus):
    """Load best search hits and database taxonomy into the module-level dicts.

    When ``swarm_content_fasta_file`` is given and
    ``swarm_content_ggsearch_file`` does not exist yet, ``glsearch36`` is run
    through the shell to create it.  The tabular result is then read to fill
    ``dict_id_best_hit`` / ``dict_id_best_bs`` (best subject ID and its score
    string per query ID), and the FASTA headers of ``database_file`` are read
    to fill ``dict_id_taxonomy``.

    Args:
        swarm_content_fasta_file: query FASTA; falsy to skip the search step.
        swarm_content_ggsearch_file: tab-separated search result file.
        database_file: FASTA database whose headers are ``>ID taxonomy``.
        cpus: value passed to ``glsearch36 -T``; values below 1 become 1.

    Raises:
        ValueError: a non-blank result line has fewer than 12 columns or a
            score column that is not a number.

    Exits the process with status 2 when the search command returns non-zero.
    """
    global dict_id_best_hit
    global dict_id_best_bs
    global dict_id_taxonomy

    if swarm_content_fasta_file:
        if os.path.exists(swarm_content_ggsearch_file):
            if verbose:
                print("Ignoring content FASTA file: " + swarm_content_fasta_file, file=sys.stderr)
        else:
            print("[purity_plot] running ggsearch", file=sys.stderr)

            if cpus < 1:
                cpus = 1

            cmd = " ".join(["glsearch36 -b 1 -m 8 -T", str(cpus), swarm_content_fasta_file, database_file, ">", swarm_content_ggsearch_file])

            if verbose:
                print(cmd, file=sys.stderr)
            else:
                cmd += " 2>/dev/null"

            rc = os.system(cmd)
            if rc != 0:
                # The shell redirect creates the output file even when the
                # search fails; remove it so a rerun does not mistake the
                # partial file for a finished search and skip glsearch36.
                try:
                    os.remove(swarm_content_ggsearch_file)
                except OSError:
                    pass
                print("[purity_plot] ERROR: ggsearch", file=sys.stderr)
                sys.exit(2)

    hits_handle = happyfile.hopen_or_else(swarm_content_ggsearch_file)
    if verbose:
        print("Reading ggsearch file: " + swarm_content_ggsearch_file, file=sys.stderr)

    try:
        while True:
            line = hits_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines carry no hit; skip instead of failing below.
                continue

            fields = line.split("\t")
            if len(fields) < 12:
                raise ValueError("expected 12 tab-separated columns, got %d: %r"
                                 % (len(fields), line))
            qid, sid, bs = fields[0], fields[1], fields[11]

            # Compare scores numerically: as strings "99.1" > "100.3", which
            # would keep the weaker hit.  The stored value stays the original
            # string so other readers of dict_id_best_bs see the same type.
            if qid not in dict_id_best_bs or float(bs) > float(dict_id_best_bs[qid]):
                dict_id_best_hit[qid] = sid
                dict_id_best_bs[qid] = bs
    finally:
        hits_handle.close()

    db_handle = happyfile.hopen_or_else(database_file)
    if verbose:
        print("Reading database file: " + database_file, file=sys.stderr)

    try:
        while True:
            line = db_handle.readline()
            if not line:
                break
            line = line.rstrip()

            if line.startswith(">"):
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    seq_id = m.group(1)
                    # Headers encode taxonomy with '|' between ranks and '+'
                    # in place of spaces; normalise to ';' and ' '.
                    dict_id_taxonomy[seq_id] = m.group(2).replace('|', ';').replace('+', ' ')
    finally:
        db_handle.close()
コード例 #34
0
def get_taxonomy(fasta_file, ggsearch_file, database_file, cpus):
    """Load best search hits and database taxonomy into the module-level dicts.

    When ``fasta_file`` is given and ``ggsearch_file`` does not exist yet,
    ``glsearch36`` is run through the shell to create it.  The tabular result
    is then read to fill ``dict_swarm_best_hit`` / ``dict_swarm_best_bs``
    (best subject ID and its score string per query ID), and the FASTA
    headers of ``database_file`` are read to fill ``dict_id_taxonomy``.

    Args:
        fasta_file: query FASTA; falsy to skip the search step.
        ggsearch_file: tab-separated search result file.
        database_file: FASTA database whose headers are ``>ID taxonomy``.
        cpus: value passed to ``glsearch36 -T``; values below 1 become 1.

    Raises:
        ValueError: a non-blank result line has fewer than 12 columns or a
            score column that is not a number.

    Exits the process with status 2 when the search command returns non-zero.
    """
    global dict_swarm_best_hit
    global dict_swarm_best_bs
    global dict_id_taxonomy

    if fasta_file:
        if os.path.exists(ggsearch_file):
            if verbose:
                print("Ignoring FASTA file: " + fasta_file, file=sys.stderr)
        else:
            print("[swarm_classify_taxonomy] running ggsearch",
                  file=sys.stderr)

            if cpus < 1:
                cpus = 1

            cmd = " ".join([
                "glsearch36 -b 1 -m 8 -T",
                str(cpus), fasta_file, database_file, ">", ggsearch_file
            ])

            if verbose:
                print(cmd, file=sys.stderr)

            rc = os.system(cmd)
            if rc != 0:
                # The shell redirect creates the output file even when the
                # search fails; remove it so a rerun does not mistake the
                # partial file for a finished search and skip glsearch36.
                try:
                    os.remove(ggsearch_file)
                except OSError:
                    pass
                print("[swarm_classify_taxonomy] ERROR: ggsearch",
                      file=sys.stderr)
                sys.exit(2)

    hits_handle = happyfile.hopen_or_else(ggsearch_file)
    if verbose:
        print("Reading ggsearch file: " + ggsearch_file, file=sys.stderr)

    try:
        while True:
            line = hits_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines carry no hit; skip instead of failing below.
                continue

            fields = line.split("\t")
            if len(fields) < 12:
                raise ValueError("expected 12 tab-separated columns, got %d: %r"
                                 % (len(fields), line))
            qid, sid, bs = fields[0], fields[1], fields[11]

            # Compare scores numerically: as strings "99.1" > "100.3", which
            # would keep the weaker hit.  The stored value stays the original
            # string so other readers of dict_swarm_best_bs see the same type.
            if (qid not in dict_swarm_best_bs
                    or float(bs) > float(dict_swarm_best_bs[qid])):
                dict_swarm_best_hit[qid] = sid
                dict_swarm_best_bs[qid] = bs
    finally:
        hits_handle.close()

    db_handle = happyfile.hopen_or_else(database_file)
    if verbose:
        print("Reading database file: " + database_file, file=sys.stderr)

    try:
        while True:
            line = db_handle.readline()
            if not line:
                break
            line = line.rstrip()

            if line.startswith(">"):
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    seq_id = m.group(1)
                    # Headers encode taxonomy with '|' between ranks and '+'
                    # in place of spaces; normalise to ';' and ' '.
                    dict_id_taxonomy[seq_id] = (
                        m.group(2).replace('|', ';').replace('+', ' '))
    finally:
        db_handle.close()