Ejemplo n.º 1
0
def import_fasta(fastaFile):
    """Load a fasta file and summarise it as three dictionaries.

    The file is read twice: once through ``fol.blast_dict`` and once
    through ``fol.one_line_d``.

    Returns a tuple ``(lenDict, descDict, seqDict)`` where

    * ``seqDict`` is the mapping returned by ``fol.blast_dict``;
    * ``lenDict`` maps every key of ``seqDict`` to the length of its value;
    * ``descDict`` maps the first space-delimited token of each
      ``fol.one_line_d`` header (leading ">" characters and trailing
      newlines removed) to the remainder of that header.
    """
    seqDict = fol.blast_dict(fastaFile)
    fullHeaders = fol.one_line_d(fastaFile).keys()

    lenDict = {name: len(sequence) for name, sequence in seqDict.items()}

    descDict = {}
    for rawHeader in fullHeaders:
        cleaned = rawHeader.rstrip("\n").lstrip(">")
        # Everything before the first space is the ID; the rest (possibly
        # empty) is the free-text description.
        seqId, _, description = cleaned.partition(" ")
        descDict[seqId] = description

    return lenDict, descDict, seqDict
Ejemplo n.º 2
0
import Fasta_one_line as fol

# Write a tab-separated table with one row per sequence in the plasmid
# fasta: the locus taken from the header and the sequence length.
seqDict = fol.one_line_d("ncbi_plasmid.fna")

# The context manager guarantees the table is flushed and closed even if a
# header does not have the expected "|"-delimited layout and the loop raises.
with open("Plasmid_lengths.txt", 'w') as outfile:
    outfile.write("locus\tlength\n")

    for header, seq in seqDict.items():
        # Locus = second "|"-separated field, truncated at its first ".".
        locus = header.split("|")[1].split(".")[0]
        outfile.write("{}\t{}\n".format(locus, len(seq)))
Ejemplo n.º 3
0
import Fasta_one_line as fol

# Split a tab-separated cluster table into one fasta file per cluster.
# Each line is "<clusterName>\t<node>\t<node>..."; every node containing a
# "|" is looked up in the transposase fasta and written to
# "<clusterName>.faa".
with open("transposase_clusters.txt", 'r') as infile:
    fasta = fol.blast_dict(
        "/work/mpesesky/Plasmids/NCBI_Plasmids/taxonAnalysis/transposases.faa")

    used_families = []

    for line in infile:
        # Blank lines carry no cluster; skipping them also keeps repeated
        # blank lines from tripping the duplicate check below.
        if not line.strip():
            continue

        nodes = line.rstrip().split("\t")
        clusterName = nodes[0]

        # A repeated cluster name would overwrite the file written for its
        # first occurrence, so report the name and stop.
        if clusterName in used_families:
            print(clusterName)
            exit()
        # Record the name so the check above can actually detect repeats.
        used_families.append(clusterName)

        outfileName = clusterName + ".faa"
        # One output handle per cluster, closed as soon as the cluster is
        # written (or when the script exits early below).
        with open(outfileName, 'w') as outfile:
            for node in nodes[1:]:
                if "|" not in node:
                    continue
                if node.startswith("ref"):
                    nodeName = node
                else:
                    # Rebuild the key as "ref|<second field>|".
                    nodeName = "ref|{}|".format(node.split("|")[1])

                try:
                    seq = fasta[nodeName]
                except KeyError:
                    # Unknown sequence ID: report it and stop.
                    print(nodeName)
                    exit()
                outfile.write(">{}\n{}\n".format(nodeName, seq))
Ejemplo n.º 4
0
from Bio import SeqIO
import Fasta_one_line as Fol
import argparse

# Command-line filter that keeps only records carrying an "NC_" or "NZ_"
# accession. The input format is chosen from the file-name suffix:
# "fna" (fasta, handled via Fol.one_line_d) or "gbff" (GenBank, via SeqIO).
parser = argparse.ArgumentParser(
    description="Remove plasmids by accession prefix")

parser.add_argument("Infile")
parser.add_argument("Outfile")

args = parser.parse_args()

if args.Infile.endswith("fna"):
    # Parse the input before opening the output so that an unreadable input
    # does not leave behind a truncated, empty output file.
    seqs = Fol.one_line_d(args.Infile)
    with open(args.Outfile, 'w') as outfile:
        for header, seq in seqs.items():
            # Substring match: the accession may appear anywhere in the
            # fasta header.
            if ("NC_" in header) or ("NZ_" in header):
                outfile.write("{}\n{}\n".format(header, seq))
elif args.Infile.endswith("gbff"):
    records = SeqIO.parse(args.Infile, 'genbank')
    # For GenBank records the accession is the record ID itself, so a
    # prefix test is used here.
    outlist = [record for record in records
               if record.id.startswith(("NC_", "NZ_"))]
    SeqIO.write(outlist, args.Outfile, "genbank")
else:
    print("Extension not recognized")
Ejemplo n.º 5
0
                    "--binSize",
                    metavar="S",
                    type=float,
                    default=10,
                    help="Set bin size (must also set max)")

# Parse the command-line options declared on the parser above.
args = parser.parse_args()

# binMax == 0 is treated as "no maximum given": use the boundaries in
# args.bins as-is. Otherwise build evenly spaced boundaries starting at
# binMin, stepping by binSize, and stopping before binMax.
if args.binMax == 0:
    binBoundaries = args.bins
else:
    binBoundaries = np.arange(args.binMin, args.binMax, args.binSize)

# Tab-separated table read from the BLAST file argument.
df = pd.read_table(args.BLASTFile, sep="\t")

# Sequences from the fasta file argument, as returned by fol.one_line_d.
fastaDict = fol.one_line_d(args.FastaFile)

# Gene tables for transposases and (presumably) antibiotic-resistance genes;
# both are loaded with the same import_genes helper defined elsewhere.
trans = import_genes(args.TransposaseFile)

abres = import_genes(args.AbresFile)

# Derived from fastaDict by seq_lens (defined elsewhere) — presumably a
# header -> length mapping; verify against that helper.
lenDict = seq_lens(fastaDict)

# Filter the table with full_len_filter (defined elsewhere) using lenDict.
shortDF = full_len_filter(lenDict, df)

# Count the rows per 'qseqid' in the filtered table. The counts go in an
# 'Occurrences' column, and 'qseqid' is kept both as the index and as a
# regular column.
countDF = pd.DataFrame(shortDF.groupby('qseqid').size(),
                       columns=['Occurrences'])
countDF['qseqid'] = countDF.index
# Annotate the counts using the two gene tables (add_genes is defined
# elsewhere).
geneAddedDF = add_genes(countDF, transTable=trans, abresTable=abres)
# Disabled debugging aid kept from the original script:
#print(geneAddedDF[(geneAddedDF['Occurrences'].map(float) >= 100.0) & (geneAddedDF['Associated Genes'] != 'Transposase')])
Ejemplo n.º 6
0
def combine_fastas(fastaList):
    """Merge several fasta files into a single header -> sequence dict.

    Each path in ``fastaList`` is loaded with ``fol.one_line_d`` in order.
    When the same header occurs in more than one file, the sequence from
    the later file replaces the earlier one.
    """
    merged = {}
    for path in fastaList:
        for header, sequence in fol.one_line_d(path).items():
            merged[header] = sequence
    return merged
Ejemplo n.º 7
0
    # Optional switches. Each one enables an independent step below.
    # NOTE(review): --width has no type=, so a value given on the command
    # line arrives as a string while the default is the int 0 — confirm how
    # the (not visible here) consumer of args.width handles that.
    parser.add_argument("-w", "--width", default=0, help="Fix line length at 'width' characters")
    parser.add_argument("-o", "--outfile", default=None, help="Output file")
    parser.add_argument("-n", "--name", type=str, default=None, help="Name to add to headers")
    parser.add_argument("-l", "--length", action='store_true', help="Print total number of nucleotides in fasta")
    parser.add_argument("-s", "--stats", action='store_true', help="Print some basic stats on the fasta file(s)")
    parser.add_argument("-t", "--trim", type=int, default=0, help="Remove seqs under given size")

    args = parser.parse_args()

    # At least one input fasta is mandatory.
    if args.fastas is None:
        raise NoFasta("You must input at least one fasta")

    # Several inputs are merged into one dict; a single input is loaded
    # directly.
    if len(args.fastas) > 1:
        fastaDict = combine_fastas(args.fastas)
    else:
        fastaDict = fol.one_line_d(args.fastas[0])

    # The steps below are applied in this fixed order: trim, rename, then
    # the two reporting options, which therefore see the trimmed/renamed
    # data.
    if args.trim > 0:
        fastaDict = rem_short(fastaDict, args.trim)

    if args.name is not None:
        fastaDict = rename_headers(fastaDict, args.name)

    if args.length:
        seqLen = get_seq_len(fastaDict)
        print("Cumulative Sequence Length: {}".format(seqLen))

    if args.stats:
        # Print every statistic returned by stats() as "name: value".
        statDict = stats(fastaDict)
        for key in statDict.keys():
            print("{}: {}".format(key, statDict[key]))
Ejemplo n.º 8
0
# Build {sequence ID: [Region, ...]} from the tab-separated region file.
# Columns used: 0 = sequence ID, 1 = start, 2 = stop, 4 = occurrence count.
regionDict = {}

try:
    for line in regionFile:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        fields = line.split("\t")
        newRegion = sfh.Region(fields[0], int(fields[1]), int(fields[2]),
                               args.length)
        newRegion.numOccur = int(fields[4].rstrip("\n"))
        regionDict.setdefault(fields[0], []).append(newRegion)
finally:
    # Close the region file even when a malformed line aborts the loop.
    regionFile.close()

# Report how many distinct sequence IDs have at least one region.
print(len(regionDict))

fastaDict = fol.one_line_d(args.FASTA)

# Attach the matching subsequence to every region. The ID is the first
# space-delimited token of the fasta header with leading ">" removed.
for header, sequence in fastaDict.items():
    uid = header.lstrip(">").split(" ")[0]
    for reg in regionDict.get(uid, ()):
        # start is used as 1-based and stop as inclusive, hence the
        # (start - 1):stop slice.
        reg.seq = sequence[(reg.start - 1):reg.stop]

# Write every region as a fasta record; the context manager closes the
# output even if a region cannot be formatted.
with open(args.outfile, 'w') as outfile:
    for regions in regionDict.values():
        for reg in regions:
            outfile.write(">{}_{}_{}_numOccur:{}\n{}\n".format(
                reg.uid, reg.start, reg.stop, reg.numOccur, reg.seq))
        if targetAnnotation.lower() in header.lower():
            retDict[seqDict[header]] = header
    invDict = {v: k for k, v in retDict.items()}

    return invDict


if __name__ == "__main__":
    # Command-line entry point: write the sequences selected by
    # gather_seqs for the given keyword to the output file.
    parser = argparse.ArgumentParser(
        description="Subset a fasta file based on header keywords")

    parser.add_argument("Fasta", help="Input fasta file")
    parser.add_argument(
        "Keyword",
        type=str,
        help="Word or phrase used to identify sequences to be collected.")
    parser.add_argument("-o",
                        "--output",
                        default="subset.fna",
                        type=argparse.FileType('w'),
                        help="Output file name")

    args = parser.parse_args()

    # argparse has already opened the output file at this point. The
    # context manager makes sure it is closed even if reading or filtering
    # the input raises.
    with args.output as outHandle:
        inDict = fol.one_line_d(args.Fasta)
        subDict = gather_seqs(inDict, args.Keyword)

        for header, seq in subDict.items():
            outHandle.write("{}\n{}\n".format(header, seq))