def import_fasta(fastaFile):
    """Read a FASTA file and return its lengths, descriptions and sequences.

    The file is parsed twice: ``fol.blast_dict`` supplies the mapping used
    for sequences and lengths, and the keys of ``fol.one_line_d`` are
    treated as full header lines from which descriptions are split off.

    Returns:
        A tuple ``(lenDict, descDict, seqDict)`` where
        ``lenDict`` maps each ``blast_dict`` key to its sequence length,
        ``descDict`` maps the first space-delimited token of each
        ``one_line_d`` header (leading ">" characters and trailing newline
        removed) to the rest of that header, and
        ``seqDict`` is the mapping returned by ``fol.blast_dict``.
    """
    seqDict = fol.blast_dict(fastaFile)
    full_headers = fol.one_line_d(fastaFile).keys()

    lenDict = {name: len(sequence) for name, sequence in seqDict.items()}

    descDict = {}
    for raw_header in full_headers:
        cleaned = raw_header.rstrip("\n").lstrip(">")
        # Everything before the first space is the identifier; whatever
        # follows it (possibly nothing) is the description.
        identifier, _, description = cleaned.partition(" ")
        descDict[identifier] = description

    return lenDict, descDict, seqDict
import Fasta_one_line as fol

# Write a tab-separated table with one "locus<TAB>length" row for every
# record that fol.one_line_d returns for ncbi_plasmid.fna.
seqDict = fol.one_line_d("ncbi_plasmid.fna")

# The context manager guarantees the output file is flushed and closed even
# if a header has no "|"-delimited second field and the loop raises.
with open("Plasmid_lengths.txt", 'w') as outfile:
    outfile.write("locus\tlength\n")
    for header, sequence in seqDict.items():
        # Locus = second "|"-delimited field, cut at its first ".".
        locus = header.split("|")[1].split(".")[0]
        outfile.write("{}\t{}\n".format(locus, len(sequence)))
import Fasta_one_line as fol

# Split the sequences in transposases.faa into one FASTA file per cluster.
# Each line of transposase_clusters.txt is tab-separated: the first field is
# the cluster name (used as "<name>.faa"), the remaining fields are node ids.
# The script prints the offending value and exits on a repeated cluster name
# or on a node id that is missing from the FASTA dictionary.
with open("transposase_clusters.txt", 'r') as infile:
    fasta = fol.blast_dict(
        "/work/mpesesky/Plasmids/NCBI_Plasmids/taxonAnalysis/transposases.faa")
    used_families = []

    for line in infile:
        nodes = line.rstrip().split("\t")
        clusterName = nodes[0]

        if clusterName in used_families:
            print(clusterName)
            exit()
        # Record the name; without this the duplicate check above can never
        # trigger and a repeated cluster silently overwrites its own file.
        used_families.append(clusterName)

        outfileName = clusterName + ".faa"
        # One output file per cluster, closed as soon as the cluster is done
        # (also when exit() is called inside the loop).
        with open(outfileName, 'w') as outfile:
            for node in nodes[1:]:
                # Fields without a "|" are not sequence identifiers.
                if "|" not in node:
                    continue

                # Normalise every id to the "ref|<accession>|" key form.
                if node.startswith("ref"):
                    nodeName = node
                else:
                    nodeName = "ref|{}|".format(node.split("|")[1])

                try:
                    seq = fasta[nodeName]
                except KeyError:
                    print(nodeName)
                    exit()

                outfile.write(">{}\n{}\n".format(nodeName, seq))
from Bio import SeqIO
import Fasta_one_line as Fol
import argparse

# Keep only the records whose accession carries an "NC_" or "NZ_" prefix.
# The input format is chosen from the file extension: "fna" (FASTA, handled
# through Fol.one_line_d) or "gbff" (GenBank, handled through Bio.SeqIO).
parser = argparse.ArgumentParser(
    description="Remove plasmids by accession prefix")
parser.add_argument("Infile")
parser.add_argument("Outfile")
args = parser.parse_args()

if args.Infile.endswith("fna"):
    # Read the input before opening the output so that a parse failure does
    # not leave a truncated output file behind.
    seqs = Fol.one_line_d(args.Infile)
    with open(args.Outfile, 'w') as outfile:
        for header, sequence in seqs.items():
            # FASTA headers may carry the accession anywhere in the line,
            # hence a substring test rather than a prefix test.
            if ("NC_" in header) or ("NZ_" in header):
                outfile.write("{}\n{}\n".format(header, sequence))
elif args.Infile.endswith("gbff"):
    records = SeqIO.parse(args.Infile, 'genbank')
    outlist = [record for record in records
               if record.id.startswith(("NC_", "NZ_"))]
    SeqIO.write(outlist, args.Outfile, "genbank")
else:
    print("Extension not recognized")
"--binSize", metavar="S", type=float, default=10, help="Set bin size (must also set max)")
# NOTE(review): the line above is the tail of a call that opens outside this
# excerpt (presumably parser.add_argument for --binSize -- confirm).
args = parser.parse_args()

# Bin edges: a binMax of 0 selects the explicit edges given in args.bins;
# any other value builds edges with np.arange, i.e. from binMin up to (but
# not including) binMax in steps of binSize.
if args.binMax == 0:
    binBoundaries = args.bins
else:
    binBoundaries = np.arange(args.binMin, args.binMax, args.binSize)

# Tab-separated table (presumably tabular BLAST output -- confirm); the
# grouping below requires it to have a 'qseqid' column.
df = pd.read_table(args.BLASTFile, sep="\t")
fastaDict = fol.one_line_d(args.FastaFile)

# import_genes, seq_lens, full_len_filter and add_genes are defined outside
# this excerpt; their exact contracts are not visible here.
trans = import_genes(args.TransposaseFile)
abres = import_genes(args.AbresFile)
lenDict = seq_lens(fastaDict)
shortDF = full_len_filter(lenDict, df)

# One row per qseqid holding the number of rows it has in shortDF.
countDF = pd.DataFrame(shortDF.groupby('qseqid').size(), columns=['Occurrences'])
# Copy the group key out of the index so it is also available as a column.
countDF['qseqid'] = countDF.index
geneAddedDF = add_genes(countDF, transTable=trans, abresTable=abres)
#print(geneAddedDF[(geneAddedDF['Occurrences'].map(float) >= 100.0) & (geneAddedDF['Associated Genes'] != 'Transposase')])
def combine_fastas(fastaList):
    """Merge several FASTA files into a single dictionary.

    Each path in ``fastaList`` is parsed with ``fol.one_line_d`` and the
    resulting entries are collected in order; when the same key occurs in
    more than one file, the entry from the later file replaces the earlier
    one.

    Returns:
        The merged dictionary (empty when ``fastaList`` is empty).
    """
    return {
        header: sequence
        for fasta_path in fastaList
        for header, sequence in fol.one_line_d(fasta_path).items()
    }
# Optional command-line switches.
parser.add_argument(
    "-w", "--width",
    default=0,
    help="Fix line length at 'width' characters")
parser.add_argument(
    "-o", "--outfile",
    default=None,
    help="Output file")
parser.add_argument(
    "-n", "--name",
    type=str, default=None,
    help="Name to add to headers")
parser.add_argument(
    "-l", "--length",
    action='store_true',
    help="Print total number of nucleotides in fasta")
parser.add_argument(
    "-s", "--stats",
    action='store_true',
    help="Print some basic stats on the fasta file(s)")
parser.add_argument(
    "-t", "--trim",
    type=int, default=0,
    help="Remove seqs under given size")

args = parser.parse_args()

# At least one input FASTA is mandatory.
if args.fastas is None:
    raise NoFasta("You must input at least one fasta")

# Several inputs are merged into one dictionary; a single input is read
# directly.
fastaDict = (combine_fastas(args.fastas) if len(args.fastas) > 1
             else fol.one_line_d(args.fastas[0]))

# Optional transformations, applied in this order: trim, then rename.
if args.trim > 0:
    fastaDict = rem_short(fastaDict, args.trim)

if args.name is not None:
    fastaDict = rename_headers(fastaDict, args.name)

# Optional reports.
if args.length:
    seqLen = get_seq_len(fastaDict)
    print("Cumulative Sequence Length: {}".format(seqLen))

if args.stats:
    statDict = stats(fastaDict)
    for key, value in statDict.items():
        print("{}: {}".format(key, value))
# Build {first column -> [Region, ...]} from the tab-separated region file.
# Columns used: 0 = sequence id, 1 = start, 2 = stop, 4 = occurrence count.
regionDict = {}
for line in regionFile:
    fields = line.split("\t")
    newRegion = sfh.Region(fields[0], int(fields[1]), int(fields[2]),
                           args.length)
    newRegion.numOccur = int(fields[4].rstrip("\n"))
    regionDict.setdefault(fields[0], []).append(newRegion)
regionFile.close()

# Report how many distinct sequence ids have at least one region.
print(len(regionDict))

# Attach the matching subsequence to every region.
fastaDict = fol.one_line_d(args.FASTA)
for header, sequence in fastaDict.items():
    # The id is the first space-delimited token of the header without ">".
    uid = header.lstrip(">").split(" ")[0]
    for reg in regionDict.get(uid, ()):
        # start is treated as 1-based and stop as inclusive.
        reg.seq = sequence[(reg.start - 1):reg.stop]

# NOTE(review): a region whose id never appears in the FASTA is not assigned
# a seq here; what gets written for it depends on sfh.Region -- confirm.
# The context manager closes the output file even if a write fails.
with open(args.outfile, 'w') as outfile:
    for regions in regionDict.values():
        for reg in regions:
            outfile.write(">{}_{}_{}_numOccur:{}\n{}\n".format(
                reg.uid, reg.start, reg.stop, reg.numOccur, reg.seq))
# NOTE(review): the next line is the tail of a function whose `def` line and
# loop header are outside this excerpt, so it is left exactly as found.
# Visible behavior: headers are matched case-insensitively against
# targetAnnotation; matches are first stored keyed by sequence (so headers
# sharing an identical sequence collapse to the last one seen), and the
# mapping is then inverted to header -> sequence before being returned.
if targetAnnotation.lower() in header.lower(): retDict[seqDict[header]] = header invDict = {v: k for k, v in retDict.items()} return invDict

if __name__ == "__main__":
    # Command-line entry point: write every record whose header contains the
    # keyword to the output file as "header\nsequence\n".
    parser = argparse.ArgumentParser(
        description="Subset a fasta file based on header keywords")
    parser.add_argument("Fasta", help="Input fasta file")
    parser.add_argument(
        "Keyword", type=str,
        help="Word or phrase used to identify sequences to be collected.")
    # argparse.FileType('w') makes argparse open the output file for writing
    # while the arguments are parsed; args.output is a file object.
    parser.add_argument("-o", "--output", default="subset.fna",
                        type=argparse.FileType('w'), help="Output file name")
    args = parser.parse_args()

    inDict = fol.one_line_d(args.Fasta)
    # gather_seqs is defined outside this excerpt (presumably the function
    # whose tail appears above -- confirm).
    subDict = gather_seqs(inDict, args.Keyword)

    # Headers are written without an added ">" prefix.
    for header in subDict.keys():
        args.output.write("{}\n{}\n".format(header, subDict[header]))
    args.output.close()