Esempio n. 1
0
 def output_counts(end_function, count_file):
     """Write a tab-separated table of per-file counts and return its path.

     The table is written to ``<results dir>/<stage>/<count_file>``. If that
     file already exists, nothing is recomputed and the existing path is
     returned.

     For every file in ``curr_files`` the records are passed through
     ``end_function`` by ``apply_seqio`` (read as fastq) and folded into one
     dict with ``count_ends``. Each file becomes one row, labelled with
     ``_short_name(file)``; each row is divided by its own sum, and the
     undivided row sum is kept in an extra ``total`` column.

     Args:
         end_function: callable handed to ``apply_seqio`` for every record.
         count_file: file name (not a full path) of the table to write.

     Returns:
         The full path of the count table, whether it was reused or
         freshly written.
     """
     outdir = os.path.join(config["dir"]["results"], stage)
     safe_makedir(outdir)
     count_file = os.path.join(outdir, count_file)
     # Reuse an existing table instead of recounting.
     if os.path.exists(count_file):
         return count_file

     # One dict of counts per input file.
     counts = [reduce(count_ends,
                      apply_seqio(x, end_function, kind="fastq"),
                      {}) for x in curr_files]
     df = pd.DataFrame(counts,
                       index=[_short_name(x) for x in curr_files])
     df = df.astype(float)
     # Convert the raw counts into per-file fractions, keeping the raw total.
     total = df.sum(axis=1)
     df = df.div(total, axis=0)
     df["total"] = total
     df.to_csv(count_file, sep="\t")
     # Return the path on this branch too, matching the cached branch above.
     return count_file
Esempio n. 2
0
def _trim(fasta_file, input_files):
    """Write a trimmed copy of ``fasta_file`` and return the new file's path.

    ``_make_length_dict(input_files)`` supplies, per sequence id, an entry
    with ``qstart`` and ``qend`` values. Every record of ``fasta_file`` is
    cut down to ``seq[qstart:qend]`` and written in fasta format to the path
    given by ``append_stem(fasta_file, "trimmed")``.

    Args:
        fasta_file: path of the fasta file whose sequences are trimmed.
        input_files: files handed to ``_make_length_dict`` to build the
            id -> entry lookup.

    Returns:
        Path of the trimmed fasta file.

    Raises:
        KeyError: if a record id in ``fasta_file`` has no entry in the
            lookup.
    """
    d = _make_length_dict(input_files)

    def trim_function(x, d):
        # The record is modified in place and handed back.
        entry = d[x.id]
        start = int(entry["qstart"])
        end = int(entry["qend"])
        # NOTE(review): qstart/qend are used directly as Python slice bounds
        # (0-based, end-exclusive); if they are 1-based coordinates this is
        # off by one at the start -- confirm against _make_length_dict.
        x.seq = x.seq[start:end]
        return x

    out_file = append_stem(fasta_file, "trimmed")

    # The context manager guarantees the output is flushed and closed, and the
    # explicit loop performs the writes eagerly (a bare map() is lazy on
    # Python 3 and would write nothing).
    with open(out_file, "w") as out_handle:
        trimmed = fasta.apply_seqio(fasta_file, partial(trim_function, d=d),
                                    "fasta")
        for record in trimmed:
            SeqIO.write(record, out_handle, "fasta")

    return out_file
Esempio n. 3
0
def _make_combined_csv(fasta_file, input_files, org_names, out_file=None):
    """
    Combine per-organism blastn tables and the query sequences into one table.

    Each file in ``input_files`` is read with ``pd.read_table`` and paired, in
    order, with the name at the same position in ``org_names``. Every column
    gets the suffix ``_<org_name>`` except ``qseqid``, which stays unsuffixed
    and is used as the join key. The tables are merged on ``qseqid`` (pandas'
    default inner join), reduced to the ``sseqid``, ``evalue``, ``length``,
    ``pident``, ``sstart`` and ``send`` columns of every organism, and joined
    with the sequence of each record in ``fasta_file`` (column ``seq``).

    Args:
        fasta_file: fasta file providing the ``seq`` column, matched on the
            record id.
        input_files: blastn output tables, one per organism.
        org_names: organism names, in the same order as ``input_files``.
        out_file: path of the tab-separated file to write.
            NOTE(review): with the default ``None`` pandas returns the text
            instead of writing a file, so callers presumably always pass a
            path -- confirm.

    Returns:
        ``out_file``, unchanged.
    """
    # A real list: the suffixes are iterated several times below, and a lazy
    # map object would be exhausted after the first pass on Python 3.
    suffixes = ["_" + name for name in org_names]

    # columns to keep: the join key, then each field for every organism
    fields = ["sseqid", "evalue", "length", "pident", "sstart", "send"]
    TO_KEEP = ["qseqid"] + [field + suffix
                            for field in fields
                            for suffix in suffixes]

    # read inputs as tables and merge into one big table
    tables = [pd.read_table(path) for path in input_files]
    # after suffixing, the join key has to be restored to its plain name
    key_renames = {"qseqid" + suffix: "qseqid" for suffix in suffixes}
    renamed = []
    for suffix, table in zip(suffixes, tables):
        suffixed = table.rename(
            columns=lambda name, suffix=suffix: name + suffix)
        renamed.append(suffixed.rename(columns=key_renames))

    merged = renamed[0]
    for table in renamed[1:]:
        merged = pd.merge(merged, table, on="qseqid")

    df_subset = merged[TO_KEEP]

    # add the sequence from the fasta file
    seqs = pd.DataFrame(fasta.apply_seqio(
        fasta_file, lambda x: {'qseqid': x.id, 'seq': str(x.seq)},
        "fasta"))

    merged = pd.merge(df_subset, seqs, on="qseqid")
    merged.to_csv(out_file, index=False, sep="\t")
    return out_file
Esempio n. 4
0
def _make_combined_csv(fasta_file, input_files, org_names, out_file=None):
    """
    Combine per-organism blastn tables and the query sequences into one table.

    Each file in ``input_files`` is read with ``pd.read_table`` and paired, in
    order, with the name at the same position in ``org_names``. Every column
    gets the suffix ``_<org_name>`` except ``qseqid``, which stays unsuffixed
    and is used as the join key. The tables are merged on ``qseqid`` (pandas'
    default inner join), reduced to the ``sseqid``, ``evalue``, ``length``,
    ``pident``, ``sstart`` and ``send`` columns of every organism, and joined
    with the sequence of each record in ``fasta_file`` (column ``seq``).

    Args:
        fasta_file: fasta file providing the ``seq`` column, matched on the
            record id.
        input_files: blastn output tables, one per organism.
        org_names: organism names, in the same order as ``input_files``.
        out_file: path of the tab-separated file to write.
            NOTE(review): with the default ``None`` pandas returns the text
            instead of writing a file, so callers presumably always pass a
            path -- confirm.

    Returns:
        ``out_file``, unchanged.
    """
    # A real list: the suffixes are iterated several times below, and a lazy
    # map object would be exhausted after the first pass on Python 3.
    suffixes = ["_" + name for name in org_names]

    # columns to keep: the join key, then each field for every organism
    fields = ["sseqid", "evalue", "length", "pident", "sstart", "send"]
    TO_KEEP = ["qseqid"] + [field + suffix
                            for field in fields
                            for suffix in suffixes]

    # read inputs as tables and merge into one big table
    tables = [pd.read_table(path) for path in input_files]
    # after suffixing, the join key has to be restored to its plain name
    key_renames = {"qseqid" + suffix: "qseqid" for suffix in suffixes}
    renamed = []
    for suffix, table in zip(suffixes, tables):
        suffixed = table.rename(
            columns=lambda name, suffix=suffix: name + suffix)
        renamed.append(suffixed.rename(columns=key_renames))

    merged = renamed[0]
    for table in renamed[1:]:
        merged = pd.merge(merged, table, on="qseqid")

    df_subset = merged[TO_KEEP]

    # add the sequence from the fasta file
    seqs = pd.DataFrame(fasta.apply_seqio(
        fasta_file, lambda x: {'qseqid': x.id, 'seq': str(x.seq)},
        "fasta"))

    merged = pd.merge(df_subset, seqs, on="qseqid")
    merged.to_csv(out_file, index=False, sep="\t")
    return out_file