def parse_refs(bibtexf, verbose=False):
    """
    Parse a bibtex file and index its entries by title.

    Titles are lower-cased and have their curly braces removed before being
    used as dictionary keys. Entries that have no ``title`` field are left
    out of the index. If anything goes wrong while handling a single entry,
    the entry key is reported on stderr, the exception is printed, and
    parsing carries on with the next entry.

    :param bibtexf: the bibtex file
    :param verbose: more output
    :return: the object returned by parse_file and a dictionary linking
             lower case titles to entry keys
    """
    if verbose:
        message(f"Parsing {bibtexf}", "GREEN")

    bib = parse_file(bibtexf, 'bibtex')

    titles = {}
    for key in bib.entries:
        try:
            fields = bib.entries[key].fields
            if 'title' not in fields:
                continue
            # drop the braces bibtex uses to protect capitalisation
            cleaned = fields['title'].lower().replace('{', '').replace('}', '')
            titles[cleaned.lower()] = key
        except Exception as ex:
            sys.stderr.write(f"Error parsing entry: {key}\n")
            print(ex)

    if verbose:
        message(f"Found {len(titles)} references", "BLUE")
    return bib, titles
def check_dups(bibtexf, verbose=False):
    """
    Check for all duplicate entries at once.

    Every line that starts with ``@`` and contains an opening brace is
    treated as the start of a bibtex entry. The citation key is the text
    between that brace and the first comma. Keys are compared regardless of
    the entry type, so ``@article{x,`` and ``@book{x,`` count as duplicates.
    ``@comment``, ``@preamble`` and ``@string`` blocks are ignored because
    they do not define a citation key.

    All duplicates are reported on stderr before the program exits, so they
    can be fixed in one pass.

    :param bibtexf: the bibtex file
    :param verbose: more output
    :return: None. Calls sys.exit(-1) if any citation key occurs more than once.
    """
    if verbose:
        message(f"Checking for duplicate entries: {bibtexf}", "PINK")

    # these @-blocks carry no citation key, so they can never clash
    keyless = {'comment', 'preamble', 'string'}

    seen = set()
    dupentries = False
    with open(bibtexf, 'r') as bibin:
        for line in bibin:
            if not line.startswith('@'):
                continue
            entry_type, brace, rest = line[1:].partition('{')
            if not brace or entry_type.strip().lower() in keyless:
                continue
            key = rest.split(',', 1)[0].strip()
            if key in seen:
                sys.stderr.write(f"Duplicate entry {key}\n")
                dupentries = True
            seen.add(key)

    if dupentries:
        sys.stderr.write(
            "FATAL: The bibtex file has duplicate entries in it. Please remove them before trying to continue\n"
        )
        sys.stderr.write(
            "(It is an issue with Google Scholar, but pybtex breaks with duplicate entries. Sorry)\n"
        )
        sys.exit(-1)
def count_feats(gbkf, verbose=False):
    """
    Count how often each feature type occurs in a genbank file.

    :param gbkf: the genbank file, passed straight to genbank_seqio
    :param verbose: more output
    :return: a dictionary mapping feature type to the number of features of
             that type, summed over every sequence in the file
    """
    if verbose:
        message(f"Reading {gbkf}", "BLUE")

    tally = {}
    for record in genbank_seqio(gbkf):
        for feature in record.features:
            ftype = feature.type
            if ftype in tally:
                tally[ftype] += 1
            else:
                tally[ftype] = 1
    return tally
def crassphage_coverage(f, verbose=False, genome_length=97092):
    """
    Get the crassphage coverage. This is coverage.txt

    Each non-blank line must hold at least two tab-separated columns: a key
    and an integer. The stored value is that integer divided by
    ``genome_length``. Blank lines (for example a trailing newline at the end
    of the file) are skipped instead of raising an IndexError.

    :param f: coverage.txt
    :param verbose: more output
    :param genome_length: the divisor applied to the second column. The
                          default of 97092 is presumably the length of the
                          crAssphage reference genome in bp -- TODO confirm
    :return: a dictionary mapping the first column to the normalised value
    """
    if verbose:
        message(f"Reading {f}", "GREEN")

    coverage = {}
    with open(f, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            p = line.split("\t")
            coverage[p[0]] = int(p[1]) / genome_length
    return coverage
def write_file(definition, samples, counts, allkeys, file, verbose=False):
    """
    Write the appropriate output files

    The output is a tab separated table. The header row is ``Definition``,
    ``Measure`` and then one column per sample. Each following row holds the
    definition, one measure, and that measure's count for every sample, with
    0 written where a sample has no value for the measure.

    :param definition: the label written in the first column of every row
    :param samples: the sample names; they are written in sorted order
    :param counts: a dictionary of sample -> {measure: count}
    :param allkeys: every measure to report; they are written in sorted order
    :param file: the file to write to
    :param verbose: more output
    :return: None
    """
    allmeasures = sorted(allkeys)
    # The column order comes from the samples argument. Previously the body
    # referred to a name that was never defined inside this function.
    sortedsamples = sorted(samples)

    if verbose:
        message(f"Writing to {file}", "GREEN")

    with open(file, 'w') as out:
        out.write("Definition\tMeasure\t")
        out.write("\t".join(sortedsamples))
        out.write("\n")
        for m in allmeasures:
            out.write(f"{definition}\t{m}")
            for k in sortedsamples:
                if k in counts and m in counts[k]:
                    out.write(f"\t{counts[k][m]}")
                else:
                    out.write("\t0")
            out.write("\n")
def focus_counts(data_directory, taxlevel, verbose=False):
    """
    Find each sample's focus output and read it.

    Every entry in ``data_directory`` is treated as a sample. For each one we
    look for ``<sample>/focus/output_All_levels.csv``. Samples without that
    file still get an (empty) entry in the returned counts.

    The header line (it starts with ``Kingdom``) tells us how many trailing
    columns hold values rather than taxonomy, based on which of the
    ``_pass.fasta``, ``_pass_1.fasta`` and ``_pass_2.fasta`` names appear in
    it. Once those trailing columns are removed there must be exactly eight
    taxonomy columns, otherwise the offending line is reported and we exit.

    :param data_directory: the directory with one subdirectory per sample
    :param taxlevel: how many of the leading taxonomy columns to join into the label
    :param verbose: more output
    :return: a dictionary of sample -> {taxonomy label: summed value} and the
             set of all taxonomy labels seen
    """
    count = {}
    allfocus = set()

    for sample in os.listdir(data_directory):
        if verbose:
            message(f"Focus: {sample}", "BLUE")
        per_sample = count[sample] = {}

        focus_file = os.path.join(data_directory, sample, "focus", "output_All_levels.csv")
        if not os.path.exists(focus_file):
            continue

        with open(focus_file, 'r') as fin:
            lastcol = -1
            for line in fin:
                if line.startswith('Kingdom'):
                    paired = '_pass_1.fasta' in line and '_pass_2.fasta' in line
                    if paired and '_pass.fasta' in line:
                        lastcol = -3
                    elif paired:
                        lastcol = -2
                    continue

                line = line.strip()
                fields = line.split(",")
                taxparts = fields[0:lastcol]
                if len(taxparts) != 8:
                    message(f"Error parsing {sample} when lastcol was {lastcol}", "RED")
                    message(f"{line}", "BLUE")
                    message(f"{taxparts}", "PINK")
                    message(f"{fields}", "GREEN")
                    sys.exit()

                # However many value columns there are, the number we add up
                # is always the final column. The original author notes this
                # is R2, which keeps it consistent with the sf output.
                tax = ":".join(taxparts[0:taxlevel])
                per_sample[tax] = per_sample.get(tax, 0) + float(fields[-1])
                allfocus.add(tax)

    return count, allfocus
def superfocus_counts(data_directory, level=3, verbose=False):
    """
    Find the superfocus output and read it.

    Every entry in ``data_directory`` is treated as a sample and the file we
    look for is ``<sample>/sf/<sample>all_levels_and_function.xls``, e.g.
    data/DRR042358/sf/DRR042358all_levels_and_function.xls. Samples without
    that file still get an (empty) entry in the returned counts.

    Everything up to and including the line that starts with
    ``Subsystem Level 1`` is ignored. After that each line is split on tabs
    and the last column is added to the running total for its subsystem.

    :param data_directory: data/
    :param level: the ss level. 1 uses the first column, 2 uses the first and
                  second columns joined with a colon, and anything else uses
                  the third column
    :param verbose: more output
    :return: a dictionary of sample -> {subsystem label: summed value} and
             the set of all subsystem labels seen
    """
    count = {}
    allsslvl = set()

    for sample in os.listdir(data_directory):
        if verbose:
            message(f"Super focus: {sample}", "YELLOW")
        totals = count[sample] = {}

        sffile = os.path.join(data_directory, sample, "sf",
                              f"{sample}all_levels_and_function.xls")
        if not os.path.exists(sffile):
            continue

        in_table = False
        with open(sffile, 'r') as fin:
            for line in fin:
                if line.startswith('Subsystem Level 1'):
                    in_table = True
                elif in_table:
                    cols = line.strip().split("\t")
                    if level == 1:
                        label = cols[0]
                    elif level == 2:
                        label = f"{cols[0]}:{cols[1]}"
                    else:
                        label = cols[2]
                    totals[label] = totals.get(label, 0) + float(cols[-1])
                    allsslvl.add(label)

    return count, allsslvl
def abricate_counts(data_directory, verbose=False):
    """
    Find the abricate folders and read them

    Every entry in ``data_directory`` is treated as a sample. If the sample
    has an ``abricate`` directory, every ``.tab`` file in it is read. Each
    data line is split on tabs and counted under the label
    ``<column 12>:<column 6>`` (indices 11 and 5).

    Lines that start with ``#`` are skipped so that a header row is not
    counted as a hit, and blank lines are skipped rather than raising an
    IndexError. A non-blank data line with too few columns still raises, so
    a corrupt file does not go unnoticed.

    :param data_directory: the directory with one subdirectory per sample
    :param verbose: more output
    :return: a dictionary of sample -> {label: number of lines} and the set
             of all labels seen
    """
    count = {}
    allabr = set()

    for sample in os.listdir(data_directory):
        if verbose:
            message(f"Abricate: {sample}", "PINK")
        count[sample] = {}

        abrdir = os.path.join(data_directory, sample, "abricate")
        if not os.path.exists(abrdir):
            continue

        for f in os.listdir(abrdir):
            if not f.endswith('.tab'):
                continue
            with open(os.path.join(abrdir, f), 'r') as fin:
                for line in fin:
                    line = line.strip()
                    # skip blank lines and the '#...' header row
                    if not line or line.startswith('#'):
                        continue
                    p = line.split("\t")
                    abr = f"{p[11]}:{p[5]}"
                    count[sample][abr] = count[sample].get(abr, 0) + 1
                    allabr.add(abr)

    return count, allabr
def run_phage_boost(genecalls, model_file, verbose):
    """
    Run phage boost

    The steps are: calculate features from the gene calls, load the model,
    convert the selected feature columns with get_predictions.get_deltas, and
    then hand everything to predict along with the fixed rolling window and
    region finding parameters set below.

    :param genecalls: The pandas data frame of gene calls
    :param model_file: The model file that is probably something like
                       model_delta_std_hacked.pickled.silent.gz
    :param verbose: more output
    :return: the third of the three values returned by predict
    """
    # rolling params
    period, win_type, min_periods = 20, 'parzen', 1

    # region finding params
    threshold, length, gaps = 0.9, 10, 5
    neighbouring, alpha = 0, 0.001

    # calculate features from gene calls
    if verbose:
        message("Calculating features", "GREEN")
    features = calculate_features(genecalls)

    # load model
    model, feats, feats_, limit = read_model_from_file(model_file)

    # transform data
    deltas = get_predictions.get_deltas(features[feats_])

    if verbose:
        message("Transforming gene predictions to regions", "GREEN")

    # transform single gene predictions to regions; only the last of the
    # three returned values is handed back to the caller
    _newgenecalls, _nphages, regions = predict(
        model, genecalls, deltas, feats,
        period, win_type, min_periods, limit,
        threshold, length, gaps, neighbouring, alpha,
    )
    return regions
type=int, default=7) parser.add_argument('-s', help='subsystem level (1,2, or 3) (default = 3)', type=int, default=3) parser.add_argument('-v', help='verbose output', action='store_true') parser.add_argument( '-a', help= 'Run all focus and superfocus levels. This is not coded efficiently, so use sparingly!', action='store_true') args = parser.parse_args() if args.s < 1 or args.s > 3: message(f"Error: No subsystem level {args.s}. Defaulting to 3", "RED") args.s = 3 if args.f not in focustax: message( f"{args.f} is not valid for focus taxonomy. It must be an integer between 1 and 8 in {focustax}", "RED") sys.exit() coverage = crassphage_coverage(args.c, args.v) focus, allfocus = focus_counts(args.d, args.f, args.v) abricate, allabricate = abricate_counts(args.d, args.v) sf, allsf = superfocus_counts(args.d, args.s, args.v) # now get all the samples that are in abricate, focus, or sf
parser.add_argument( '--maxrev', type=int, default=1e6, help= 'do not trim more than these bp from the end (does not include primer length)' ) parser.add_argument('--listall', help='list all sequences that were trimmed', action='store_true') parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() if not args.forward and not args.reverse: message( "Either --forward or --reverse primer must be specified otherwise nothing will be removed" ) sys.exit(-1) fwd = None rev = None if args.forward: fwd = args.forward.upper() if args.reverse: rev = args.reverse.upper() with open(args.o, 'w') as out: for sid, seqid, seq, qual in stream_fastq(args.f): original = [seq, qual] trimmed = False if fwd and fwd in seq.upper():
Test a directory of genbank files and note whether they have the is_phage qualifier for their genomes """ import os import sys import argparse from roblib import genbank_seqio, message __author__ = 'Rob Edwards' __copyright__ = 'Copyright 2020, Rob Edwards' __credits__ = ['Rob Edwards'] __license__ = 'MIT' __maintainer__ = 'Rob Edwards' __email__ = '*****@*****.**' if __name__ == '__main__': parser = argparse.ArgumentParser(description=" ") parser.add_argument('-d', help='directory of genbank files', required=True) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() for f in os.listdir(args.d): if args.v: message(f"Reading {f}", "GREEN") pc = 0 for s in genbank_seqio(os.path.join(args.d, f)): for feat in s.features: if 'is_phage' in feat.qualifiers: pc += 1 print(f"{f}\t{pc}")
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    # command line options
    parser = argparse.ArgumentParser(description="Randomly sample a single fastq file")
    parser.add_argument('-f', required=True, help='fastq file to sample')
    parser.add_argument('-o', required=True, help='output file name')
    parser.add_argument('-p', required=True, type=int, help='percent of the file to sample')
    parser.add_argument('-v', action='store_true', help='verbose output')
    args = parser.parse_args()

    # read every record into memory so that we know how many there are
    # before choosing how many to keep
    sequences = [[header, seq, qualscores]
                 for seqid, header, seq, qualscores in stream_fastq(args.f)]
    total = len(sequences)

    # -p is a percentage, so convert it to a number of records (rounded down)
    n = int(args.p / 100 * total)
    if args.v:
        message(f"There are {total} sequences. So we will sample {n} elements", "GREEN")

    with open(args.o, 'w') as out:
        for header, seq, qualscores in sample(sequences, n):
            out.write(f"@{header}\n{seq}\n+\n{qualscores}\n")
import os
import sys
import argparse
from roblib import read_fasta, write_fastq, message

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', help='fasta file', required=True)
    parser.add_argument('-q', help='quality file', required=True)
    parser.add_argument('-o', help='output fastq file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # Both inputs are needed, so stop if EITHER one is missing. The check
    # used to fire only when both were missing, and the message was not an
    # f-string, so it printed the placeholders instead of the file names.
    if not os.path.exists(args.f) or not os.path.exists(args.q):
        message(f"FATAL: either {args.f} or {args.q} not found", "RED")
        sys.exit(-1)

    # read the sequences and the quality scores, then write them out together
    fa = read_fasta(args.f, True, False)
    qu = read_fasta(args.q, True, True)
    write_fastq(fa, qu, args.o, args.v)
os.makedirs(args.o, exist_ok=True) dna = {} qual = {} header = {} # initially didn't plan to keep all these :) for seqid, hd, seq, qualscores in stream_fastq(args.f): dna[seqid] = seq.upper() qual[seqid] = qualscores header[seqid] = hd changed = set() deleted = set() for step in range(1, 10): if args.v: message(f"Working on step {step}", "GREEN") fqf = os.path.join(args.q, f"step_{step}", f"{args.n}.s{step}.out.fastq") if not os.path.exists(fqf): message(f"FQ File {fqf} not found", "RED") continue seqs = [] with open(os.path.join(args.o, f"step_{step}.text"), 'w') as out, \ open(os.path.join(args.o, f"step_{step}_input.fq"), 'w') as fqinput,\ open(os.path.join(args.o, f"step_{step}_output.fq"), 'w') as fqout: seen = set() for seqid, hd, seq, qualscores in stream_fastq(fqf): seen.add(seqid) if seqid not in dna: message(f"{seqid} is a different sequence id", "PINK") continue
parser.add_argument('-f', help='genbank file') parser.add_argument('-d', help='directory of genbank files') parser.add_argument('-t', help='feature type(s) (at least one must be provided)', nargs="+") parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() files = [] if args.f: files.append(args.f) if args.d: for f in os.listdir(args.d): files.append(os.path.join(args.d, f)) if len(files) == 0: message("Fatal. Either -d or -f is required", "RED") if len(args.t) == 0: message("Fatal. Please provide at least one feature type to count", "RED") print("File", end="") for t in args.t: print(f"\t{t}", end="") print() for f in files: c = count_feats(f, args.v) print(f, end="") for t in args.t: if t in c: print(f"\t{c[t]}", end="")
required=True, help= "Model file. Probably something like model_delta_std_hacked.pickled.silent.gz" ) parser.add_argument('-o', '--outputfile', help='output file for phage regions') parser.add_argument('-c', '--mincontiglen', default=1000, type=int, help='minimum contig length [Default: %(default)d]') parser.add_argument('-v', '--verbose', help='verbose output', action='store_true') args = parser.parse_args() if args.verbose: message("Reading genbank file", "GREEN") genecalls = genbank_to_pandas(args.genbankfile, args.mincontiglen, True, True, args.verbose) if args.verbose: message("Phage Boosting", "GREEN") res = run_phage_boost(genecalls, args.modelfile, args.verbose) if args.outputfile: with open(args.outputfile, 'w') as out: res.to_csv(out, sep="\t", header=True) else: print(res)