def extract_sequences(fastafile, hits, addhitname=False):
    """
    Extract the sequences from a fasta file

    :param fastafile: The fasta file to get the sequences from
    :type fastafile: str
    :param hits: The dict of hits keyed by contig. Each value is an iterable
        of tuples whose first three elements are (start, end, reverse).
        start and end are used directly as slice bounds on the contig
        sequence; when reverse is true the slice is passed through roblib.rc.
        When addhitname is set, a fourth element holds the hit name.
    :type hits: dict
    :param addhitname: Append " [hit=<name>]" to each sequence ID
    :type addhitname: bool
    :return: A dict of the sequences with contig_start_end as ID and sequence
        as value. The start in the ID is start + 1.
    :rtype: dict
    """

    if not os.path.exists(fastafile):
        sys.exit("{} not found\n".format(fastafile))

    fa = roblib.read_fasta(fastafile)
    sequences = {}

    for contig in hits:
        if contig not in fa:
            sys.stderr.write("WARNING: {} was not found in {}\n".format(
                contig, fastafile))
            # Nothing can be sliced from a contig that is not in the file, so
            # move on instead of failing with a KeyError below.
            continue

        for tple in hits[contig]:
            start, end, reverse = tple[0], tple[1], tple[2]
            seq = fa[contig][start:end]
            if reverse:
                seq = roblib.rc(seq)
            loc = "_".join(map(str, [contig, start + 1, end]))
            if addhitname:
                loc += " [hit={}]".format(tple[3])
            sequences[loc] = seq
    return sequences
def read_sequence(conf, verbose=False):
    """
    Load the contigs file for this genome.

    :param conf: the contigs file
    :param verbose: announce the file on stderr before it is read
    :return: the result of read_fasta for the file (a dict of contig/sequence)
    """

    if verbose:
        notice = f"{bcolors.GREEN}READING {conf}{bcolors.ENDC}\n"
        sys.stderr.write(notice)

    contigs = read_fasta(conf, whole_id=False)
    return contigs
Exemple #3
0
def write_permutations(faf, outputf, verbose=False):
    """
    Write every unordered pair of sequences from a fasta file.

    Each pair yielded by combinations(ids, 2) is written as two consecutive
    fasta records.

    :param faf: fasta file to read
    :param outputf: file that the pairs are written to
    :param verbose: accepted but not used
    """

    fa = read_fasta(faf, whole_id=False)

    with open(outputf, 'w') as out:
        for first, second in combinations(list(fa.keys()), 2):
            out.write(f">{first}\n{fa[first]}\n>{second}\n{fa[second]}\n")
Exemple #4
0
def fasta2ids(faf, verbose=False):
    """
    Collect the sequence IDs found in a fasta file
    :param faf: fasta file
    :param verbose: announce the file on stderr before it is read
    :return: a set of IDs
    """
    if verbose:
        notice = f"{bcolors.GREEN} Reading IDs from fasta file: {faf}{bcolors.ENDC}"
        sys.stderr.write(notice)

    return set(read_fasta(faf, whole_id=False).keys())
Exemple #5
0
from roblib import read_fasta
from random import randint

__author__ = 'Rob Edwards'

# Command line: fasta in, fastq out, and either one fixed quality character
# (-s) or a random quality character for every base (-r).
parser = argparse.ArgumentParser(
    description='Convert a fasta file to fastq, faking the qual scores')
parser.add_argument('-f', help='fasta file', required=True)
parser.add_argument('-q', help='fastq output file', required=True)
parser.add_argument('-s',
                    help='quality score. Default = 40',
                    default=40,
                    type=int)
parser.add_argument('-r',
                    help='random quality characters with ASCII codes between 33 and 125',
                    action='store_true')
args = parser.parse_args()

# The -s value is used directly as the character code of the quality symbol;
# no offset is added to it here.
c = chr(args.s)

fa = read_fasta(args.f)
with open(args.q, 'w') as out:
    for i in fa:
        seqlen = len(fa[i])
        if args.r:
            # One random character per base, built in a single join rather
            # than by repeated string concatenation.
            q = "".join(chr(randint(33, 125)) for _ in range(seqlen))
        else:
            q = seqlen * c
        out.write("@{}\n{}\n+\n{}\n".format(i, fa[i], q))
Exemple #6
0
        sys.exit(1)

    # Build the list of fasta files to process: start from the -f values ...
    # NOTE(review): files is the same list object as args.f, so the appends
    # below also extend args.f.
    if args.f:
        files = args.f
    else:
        files = []

    # ... and add every entry found in each -d directory.
    if args.d:
        for subdir in args.d:
            for f in os.listdir(subdir):
                files.append(os.path.join(subdir, f))

    # NOTE(review): presumably summary statistics across all files; the code
    # that updates this dict is not visible in this excerpt. 'shortest'
    # starts at 1e6.
    overall = {'number': 0, 'total': 0, 'shortest': 1e6, 'longest': 0}

    for faf in files:
        fa = read_fasta(faf)

        # A result whose only key is the empty string is treated as "no
        # sequences".
        # NOTE(review): this exits the whole program with status 0, so any
        # remaining files are not processed.
        if len(fa.keys()) == 1 and list(fa.keys())[0] == '':
            sys.stderr.write(f"No sequences found in {faf}\n")
            sys.exit(0)

        # With -l, list every sequence as ID <tab> length, then a blank line.
        if args.l:
            for i in fa:
                print("{}\t{}".format(i, len(fa[i])))
            print()

        # Keep only lengths strictly greater than args.m, in ascending order,
        # and total them.
        lensall = [len(fa[i]) for i in fa]
        lens = list(filter(lambda x: x > args.m, lensall))
        lens.sort()
        length = sum(lens)
Exemple #7
0
import os
import sys
import argparse
from roblib import read_fasta

__author__ = "Rob Edwards"

# Command line: fasta in, fastq out, and the character code used for every
# quality symbol.
parser = argparse.ArgumentParser(
    description="Convert a fasta file to fastq, faking the qual scores"
)
parser.add_argument("-f", required=True, help="fasta file")
parser.add_argument("-q", required=True, help="fastq output file")
parser.add_argument("-s", type=int, default=40, help="quality score. Default = 40")
args = parser.parse_args()

# The same character is repeated once per base of each sequence.
c = chr(args.s)

fa = read_fasta(args.f)
with open(args.q, "w") as out:
    for seqid in fa:
        sequence = fa[seqid]
        quality = c * len(sequence)
        out.write("@{}\n{}\n+\n{}\n".format(seqid, sequence, quality))
import numpy
from roblib import read_fasta

__author__ = 'Rob Edwards'

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-d', help='directory of fasta files', required=True, action='append')
    parser.add_argument('-p', help='figure file name for the graph', required=True)
    parser.add_argument('-m', help='minimum length to be included (default = all reads)', default=0, type=int)
    args = parser.parse_args()

    # lengths[d] collects the length of every sequence read from directory d.
    # maxd is the longest sequence seen in any directory and becomes the
    # upper edge of the histogram bins.
    lengths = {}
    maxd = 0
    for d in args.d:
        lengths[d] = []
        for f in os.listdir(d):
            fa = read_fasta(os.path.join(d, f))
            filelens = [len(fa[x]) for x in fa]
            lengths[d].extend(filelens)
            # Only this file's lengths can raise the maximum. The default
            # keeps a file without sequences from raising ValueError.
            maxd = max(maxd, max(filelens, default=0))

    # 100 evenly spaced bin edges from the minimum length to the longest
    # sequence. Each directory is drawn with transparency 1/len(args.d).
    bins = numpy.linspace(args.m, maxd, 100)
    alpha = 1.0 / len(args.d)

    for d in args.d:
        # only lengths strictly greater than args.m are plotted
        data = [x for x in lengths[d] if x > args.m]
        pyplot.hist(data, bins, alpha=alpha, label=d)
    pyplot.legend(loc='upper right')
    pyplot.savefig(args.p)
    # counter is incremented before each write, so the first sequence written
    # gets the number args.n.
    counter = args.n - 1

    if not args.f and not args.d:
        sys.stderr.write(
            f"{bcolors.RED}FATAL: Please supply either -d or -f options{bcolors.ENDC}\n"
        )
        sys.exit(-1)

    # idmap receives file / original id / new number, out receives the
    # renumbered fasta.
    # NOTE(review): neither handle is closed in the visible code; confirm
    # they are closed after this excerpt.
    idmap = open(args.i, 'w')
    out = open(args.o, 'w')

    if args.f:
        for f in args.f:
            if args.v:
                sys.stderr.write(f"{bcolors.GREEN}Reading {f}{bcolors.ENDC}\n")
            fa = read_fasta(f)
            for id in fa:
                counter += 1
                out.write(">{}\n{}\n".format(counter, fa[id]))
                idmap.write("{}\t{}\t{}\n".format(f, id, counter))
                # counter - (args.n - 2) is one more than the number of
                # sequences written so far, so this triggers once args.x
                # sequences have been written.
                # NOTE(review): break only leaves the loop over this file;
                # each later file still writes one sequence before breaking.
                if args.x and (counter - (args.n - 2)) > args.x:
                    break

    if args.d:
        for d in args.d:
            if args.v:
                sys.stderr.write(f"{bcolors.GREEN}Reading {d}{bcolors.ENDC}\n")
            for f in os.listdir(d):
                if args.v:
                    sys.stderr.write(
                        f"{bcolors.BLUE}\tReading {f}{bcolors.ENDC}\n")
Exemple #10
0
    parser.add_argument('-v', action='store_true', help='verbose output')
    args = parser.parse_args()

    # file name suffixes accepted as fasta
    endings = ('.fna', '.fasta', '.fa')

    for f in os.listdir(args.d):
        # skip anything whose name does not end in one of the fasta suffixes
        if not f.endswith(endings):
            if args.v:
                sys.stderr.write(
                    f"{bcolors.PINK}Don't think {f} is a fasta file. Skipped\n{bcolors.ENDC}"
                )
            continue

        if args.v:
            sys.stderr.write(f"{bcolors.GREEN}{f}{bcolors.ENDC}\n")

        fa = read_fasta(os.path.join(args.d, f))

        # track the first sequence with the greatest length
        best_len, best_id, best_seq = 0, None, None
        for seqid in fa:
            seqlen = len(fa[seqid])
            if seqlen > best_len:
                best_len, best_id, best_seq = seqlen, seqid, fa[seqid]

        # nothing longer than zero was found, so there is nothing to report
        if best_len == 0:
            continue

        print("{}\t{}".format(f, best_len))
        if args.f:
            # the longest sequence of each file is appended to the -f file
            with open(args.f, 'a') as out:
                out.write(f">{best_id} [from {f}]\n{best_seq}\n")
"""
Print the length of the longest contig for each file in a directory of fasta files.
"""

import os
import sys
import argparse
from roblib import read_fasta

__author__ = 'Rob Edwards'

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Print the length of the longest contig for each file in a directory of fasta files')
    parser.add_argument('-d', help='Directory of fasta files', required=True)
    args = parser.parse_args()

    # One output line per file: file name <tab> length of its longest sequence.
    for f in os.listdir(args.d):
        fa = read_fasta(os.path.join(args.d, f))
        lengths = [len(fa[x]) for x in fa]
        if not lengths:
            # A file with no sequences has no longest contig. Report it and
            # carry on rather than failing on an empty list.
            sys.stderr.write("No sequences found in {}. Skipped\n".format(f))
            continue
        print("{}\t{}".format(f, max(lengths)))


import os
import sys
import argparse

from roblib import read_fasta, write_fastq, message

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', help='fasta file', required=True)
    parser.add_argument('-q', help='quality file', required=True)
    parser.add_argument('-o', help='output fastq file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # Both inputs are read below, so stop if either one is missing.
    if not os.path.exists(args.f) or not os.path.exists(args.q):
        message(f"FATAL: either {args.f} or {args.q} not found", "RED")
        sys.exit(-1)

    # Both files are read with read_fasta; only the third positional flag
    # differs between the sequence file and the quality file.
    # NOTE(review): the meaning of the positional flags is defined by
    # roblib.read_fasta, which is not visible here.
    fa = read_fasta(args.f, True, False)
    qu = read_fasta(args.q, True, True)

    write_fastq(fa, qu, args.o, args.v)