def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text and is kept in `%prog` form.

    def sample_of(path):
        # Grouping key: the part of the file's basename before the first dot
        return op.basename(path).split(".")[0]

    parser = OptionParser(merge.__doc__)
    parser.add_option("--outdir", default="outdir",
                      help="Output final reads in [default: %default]")
    opts, args = parser.parse_args(args)

    # At least one input folder is required
    if not args:
        sys.exit(not parser.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    # Collect every `<name>.<something>.fastq` across all folders; groupby()
    # only merges adjacent items, so the list is sorted by the same key first
    fastqs = sorted(
        flatten(glob("{0}/*.*.fastq".format(folder)) for folder in args),
        key=sample_of)

    # One merged `<sample>.fastq` in outdir per group of inputs
    for sample, group in groupby(fastqs, key=sample_of):
        outfile = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(list(group), outfile=outfile).merge(checkexists=True)
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text and is kept in `%prog` form.

    def prefix(path):
        # Grouping key: the part of the file's basename before the first dot
        return op.basename(path).split(".")[0]

    parser = OptionParser(merge.__doc__)
    parser.set_outdir(outdir="outdir")
    opts, folders = parser.parse_args(args)

    # At least one input folder is required
    if not folders:
        sys.exit(not parser.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    # Gather the per-folder fastq pieces into one list, ordered by the
    # grouping key so that groupby() sees each sample's files together
    pieces = list(flatten(glob("{0}/*.*.fastq".format(d)) for d in folders))
    pieces.sort(key=prefix)

    for name, members in groupby(pieces, key=prefix):
        members = list(members)
        target = op.join(outdir, "{0}.fastq".format(name))
        FileMerger(members, outfile=target).merge(checkexists=True)
def notify(args):
    """
    %prog notify "Message to be sent"

    Send a message via email/push notification.

    Email notify: Recipient email address is constructed by joining the login `username`
                  and `dnsdomainname` of the server

    Push notify: Uses available API
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    from jcvi.utils.iter import flatten

    # Push API names become acceptable values for --method.
    # NOTE(review): this extends the shared list on every call.
    valid_notif_methods.extend(available_push_api.keys())

    fromaddr, toaddr = get_email_address()

    parser = OptionParser(notify.__doc__)
    parser.add_option("--method", default="email", choices=valid_notif_methods,
                      help="Specify the mode of notification [default: %default]")
    parser.add_option("--subject", default="JCVI: job monitor",
                      help="Specify the subject of the notification message")
    parser.set_email()

    push_group = OptionGroup(parser, "Optional `push` parameters")
    push_group.add_option("--api", default="pushover",
                          choices=list(flatten(available_push_api.values())),
                          help="Specify API used to send the push notification")
    push_group.add_option("--priority", default=0, type="int",
                          help="Message priority (-1 <= p <= 2) [default: %default]")
    push_group.add_option("--timestamp", default=None, type="int",
                          dest="timestamp",
                          help="Message timestamp in unix format [default: %default]")
    parser.add_option_group(push_group)

    opts, args = parser.parse_args(args)

    if not args:
        logging.error("Please provide a brief message to be sent")
        sys.exit(not parser.print_help())

    subject = opts.subject
    # All positional arguments are joined into a single message
    message = " ".join(args).strip()

    # Anything other than "email" is dispatched as a push notification
    if opts.method != "email":
        pushnotify(subject, message, api=opts.api, priority=opts.priority,
                   timestamp=opts.timestamp)
        return

    recipient = opts.toaddr
    if not is_valid_email(recipient):
        logging.debug("Email address `{0}` is not valid!".format(recipient))
        sys.exit()

    toaddr = [recipient]  # TO address should be in a list
    send_email(fromaddr, toaddr, subject, message)
def notify(args):
    """
    %prog notify "Message to be sent"

    Send a message via email/push notification.

    Email notify: Recipient email address is constructed by joining the login `username`
                  and `dnsdomainname` of the server

    Push notify: Uses available API
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    from jcvi.utils.iter import flatten

    debug()

    # Push API names become acceptable values for --method.
    # NOTE(review): this extends the shared list on every call.
    valid_notif_methods.extend(available_push_api.keys())

    fromaddr, toaddr = get_email_address()

    parser = OptionParser(notify.__doc__)
    parser.add_option("--method", default="email", choices=valid_notif_methods,
                      help="Specify the mode of notification [default: %default]")
    parser.add_option("--subject", default="JCVI: job monitor",
                      help="Specify the subject of the notification message")
    parser.set_email()

    api_choices = list(flatten(available_push_api.values()))
    push_opts = OptionGroup(parser, "Optional `push` parameters")
    push_opts.add_option("--api", default="pushover", choices=api_choices,
                         help="Specify API used to send the push notification"
                              " [default: %default]")
    push_opts.add_option("--priority", default=0, type="int",
                         help="Message priority (-1 <= p <= 2) [default: %default]")
    push_opts.add_option("--timestamp", default=None, type="int",
                         dest="timestamp",
                         help="Message timestamp in unix format [default: %default]")
    parser.add_option_group(push_opts)

    opts, args = parser.parse_args(args)

    if not args:
        logging.error("Please provide a brief message to be sent")
        sys.exit(not parser.print_help())

    subject = opts.subject
    # All positional arguments are joined into a single message
    message = " ".join(args).strip()

    by_email = opts.method == "email"
    if by_email and not is_valid_email(opts.toaddr):
        logging.debug("Email address `{0}` is not valid!".format(opts.toaddr))
        sys.exit()

    if by_email:
        toaddr = [opts.toaddr]  # TO address should be in a list
        send_email(fromaddr, toaddr, subject, message)
    else:
        pushnotify(subject, message, api=opts.api, priority=opts.priority,
                   timestamp=opts.timestamp)
def join(args):
    """
    %prog join file1.txt file2.txt ..

    Join tabular files based on common column. --column specifies the column
    index to pivot on. Use comma to separate multiple values if the pivot column
    is different in each file. Maintain the order in the first file.
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    from jcvi.utils.iter import flatten

    p = OptionParser(join.__doc__)
    p.add_option("--column", default="0",
                 help="0-based column id, multiple values allowed [default: %default]")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Do not print header [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    nargs = len(args)

    # Need the pivot file plus at least one file to join onto it
    if nargs < 2:
        sys.exit(not p.print_help())

    # One key column per file: either an explicit comma-separated list, or a
    # single index applied to every file
    column = opts.column
    if "," in column:
        cc = [int(x) for x in column.split(",")]
    else:
        cc = [int(column)] * nargs

    assert len(cc) == nargs, "Column index number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = [DictFile(filename, keypos=keypos, valuepos=None, delimiter="\t")
             for filename, keypos in zip(args, cc)]
    otherfiles = files[1:]
    # Header repeats each file's basename once per column of that file
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    # The pivot file is opened in a `with` block so the handle is released
    # even if a row fails to process (the original never closed it)
    with open(pivotfile) as fp:
        fw = must_open(opts.outfile, "w")
        if not opts.noheader:
            print >> fw, header

        for row in fp:
            atoms = row.rstrip().split("\t")
            key = atoms[cc[0]]
            newrow = atoms
            for d in otherfiles:
                # Rows whose key is absent from `d` are padded with "na"
                newrow += d.get(key, ["na"] * d.ncols)
            print >> fw, "\t".join(newrow)
def fix_orientation(self, tour):
    """
    Test each scaffold to see if flipping it changes the longest monotonic
    chain length, and re-derive the orientations accordingly.

    `tour` is a list of (scaffold, orientation) pairs. For every linkage
    group in `self.linkage_groups` and every scaffold, the marker series of
    the scaffold is evaluated as-is and reversed, in the context of the
    series of all scaffolds before and after it. Non-zero differences in the
    first element of the `lms()` results are collected per scaffold and
    combined with `self.weighted_mean()`; the sign of that value becomes the
    scaffold's orientation.

    Returns a new list of (scaffold, orientation) pairs in the same scaffold
    order as `tour`.
    """
    orientations = dict(tour)  # old configuration here
    evidence = defaultdict(list)  # scaffold => [(score difference, mapname)]
    scaffolds, _ = zip(*tour)

    for mlg in self.linkage_groups:
        lg, mapname = mlg.lg, mlg.mapname
        for scaffold, _ in tour:
            pos = scaffolds.index(scaffold)
            # Series of everything before / after this scaffold, each in its
            # current orientation
            before = [self.get_series(lg, x, xo) for x, xo in tour[:pos]]
            after = [self.get_series(lg, x, xo) for x, xo in tour[pos + 1:]]
            before = list(flatten(before))
            after = list(flatten(after))
            middle = self.get_series(lg, scaffold)

            forward = lms(before + middle + after)
            flipped = lms(before + middle[::-1] + after)
            delta = forward[0] - flipped[0]
            # Only keep linkage groups where flipping makes a difference
            if delta:
                evidence[scaffold].append((delta, mapname))

    # reset orientation
    fixed = 0
    for scaffold, votes in evidence.items():
        updated = np.sign(self.weighted_mean(votes))
        if updated != orientations[scaffold]:
            orientations[scaffold] = updated
            fixed += 1

    tour = [(x, orientations[x]) for x in scaffolds]
    logging.debug("Fixed orientations for {0} scaffolds.".format(fixed))
    return tour
def make_attributes(s, gff3=True):
    """
    Parse the attributes column `s` into a mapping of key => list of values.

    In GFF3, the last column is typically:
    ID=cds00002;Parent=mRNA00002;

    In GFF2, the last column is typically:
    Gene 22240.t000374; Note "Carbonic anhydrase"

    With `gff3` set, the string is handed to parse_qs(). Otherwise it is
    split on "; ", each piece is split at its first space into key and
    value, and double quotes are removed from the value. In both cases every
    value is finally split on commas, so each key maps to a flat list.
    """
    if not gff3:
        d = DefaultOrderedDict(list)
        for field in s.split("; "):
            tag, value = field.strip().split(' ', 1)
            d[tag].append(value.replace('"', ''))
    else:
        d = parse_qs(s)

    # Expand comma-separated values into individual list items
    for tag, values in d.items():
        pieces = [v.split(",") for v in values]
        d[tag] = list(flatten(pieces))

    return d
def waitpid(args):
    """
    %prog waitpid PID ::: "./command_to_run param1 param2 ...."

    Given a PID, this script will wait for the PID to finish running and
    then perform a desired action (notify user and/or execute a new command)

    Specify "--notify=METHOD` to send the user a notification after waiting for PID
    Specify `--grid` option to send the new process to the grid after waiting for PID
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    import shlex
    from jcvi.utils.iter import flatten

    # Individual push APIs are accepted as --notify values.
    # NOTE(review): this extends the shared list on every call.
    valid_notif_methods.extend(list(flatten(available_push_api.values())))

    parser = OptionParser(waitpid.__doc__)
    parser.add_option("--notify", default="email", choices=valid_notif_methods,
                      help="Specify type of notification to be sent after waiting")
    parser.add_option("--interval", default=120, type="int",
                      help="Specify PID polling interval in seconds")
    parser.add_option("--message",
                      help="Specify notification message [default: %default]")
    parser.set_email()
    parser.set_grid()

    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # Everything after ":::" is the follow-up command; what precedes it is
    # the PID
    sep = ":::"
    cmd = None
    if sep in args:
        cut = args.index(sep)
        cmd = " ".join(args[cut + 1:]).strip()
        args = args[:cut]

    pid = int(" ".join(args).strip())

    if not pid_exists(pid):
        logging.debug("Process with PID {0} does not exist".format(pid))
        sys.exit()

    if opts.message:
        msg = opts.message
    else:
        # If notification message not specified by user, just get
        # the name of the running command and use it as the message
        from subprocess import check_output

        get_origcmd = "ps -p {0} -o cmd h".format(pid)
        msg = check_output(shlex.split(get_origcmd)).strip()
    _waitpid(pid, interval=opts.interval)

    if opts.notify:
        # Build an argument list for notify(): message first, then options
        notifycmd = ["[{0}] `{1}`".format(gethostname(), msg)]
        if opts.notify == "email":
            notifycmd.append('--email={0}'.format(opts.email))
        else:
            notifycmd.extend(["--method={0}".format("push"),
                              "--api={0}".format(opts.notify)])
        notify(notifycmd)

    if cmd is not None:
        # Run in the background unless the command is sent to the grid
        bg = not opts.grid
        sh(cmd, grid=opts.grid, background=bg)
def join(args):
    """
    %prog join file1.txt(pivotfile) file2.txt ..

    Join tabular-like files based on common column.
    --column specifies the column index to pivot on.
      Use comma to separate multiple values if the pivot column is different
      in each file. Maintain the order in the first file.
    --sep specifies the column separators, default to tab.
      Use comma to separate multiple values if the column separator is different
      in each file.
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    from jcvi.utils.iter import flatten

    parser = OptionParser(join.__doc__)
    parser.add_option("--column", default="0",
                      help="0-based column id, multiple values allowed [default: %default]")
    parser.set_sep(multiple=True)
    parser.add_option("--noheader", default=False, action="store_true",
                      help="Do not print header [default: %default]")
    parser.add_option("--na", default="na",
                      help="Value for unjoined data [default: %default]")
    parser.add_option("--keysep", default=",",
                      help="specify separator joining multiple elements in the key column"
                           " of the pivot file [default: %default]")
    parser.set_outfile()

    opts, args = parser.parse_args(args)
    nargs = len(args)

    # Need the pivot file plus at least one file to join onto it
    if nargs < 2:
        sys.exit(not parser.print_help())

    keysep = opts.keysep
    na = opts.na

    # One key column per file: explicit comma-separated list, or one index
    # shared by all files
    column_spec = opts.column
    if "," in column_spec:
        columns = [int(x) for x in column_spec.split(",")]
    else:
        columns = [int(column_spec)] * nargs
    assert len(columns) == nargs, "Column index number != File number"

    # Same convention for the per-file column separators
    sep_spec = opts.sep
    seps = sep_spec.split(",") if "," in sep_spec else [sep_spec] * nargs
    assert len(seps) == nargs, "column separator number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = [DictFile(filename, keypos=keypos, valuepos=None, delimiter=delim)
             for filename, keypos, delim in zip(args, columns, seps)]
    otherfiles = files[1:]
    # Header repeats each file's basename once per column of that file
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    fp = must_open(pivotfile)
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print >> fw, header

    for line in fp:
        fields = line.rstrip().split(seps[0])
        # The pivot key may hold several keys joined by `keysep`
        pivot_keys = fields[columns[0]].split(keysep)
        for table in otherfiles:
            # One looked-up row per key (padded with `na` when missing), then
            # merged column-wise with `keysep`
            hits = [table.get(k, [na] * table.ncols) for k in pivot_keys]
            fields += [keysep.join(col) for col in zip(*hits)]
        print >> fw, "\t".join(fields)
def waitpid(args):
    """
    %prog waitpid PID ::: "./command_to_run param1 param2 ...."

    Given a PID, this script will wait for the PID to finish running and
    then perform a desired action (notify user and/or execute a new command)

    Specify "--notify=METHOD` to send the user a notification after waiting for PID
    Specify `--grid` option to send the new process to the grid after waiting for PID
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    import shlex
    from time import sleep
    from jcvi.utils.iter import flatten

    # Individual push APIs are accepted as --notify values.
    # NOTE(review): this extends the shared list on every call.
    valid_notif_methods.extend(list(flatten(available_push_api.values())))

    parser = OptionParser(waitpid.__doc__)
    parser.add_option("--notify", default=None, choices=valid_notif_methods,
                      help="Specify type of notification to be sent after waiting")
    parser.add_option("--interval", default=120, type="int",
                      help="Specify PID polling interval in seconds")
    parser.add_option("--message",
                      help="Specify notification message [default: %default]")
    parser.set_email()
    parser.set_grid()

    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # Everything after ":::" is the follow-up command; what precedes it is
    # the PID
    sep = ":::"
    cmd = None
    if sep in args:
        cut = args.index(sep)
        cmd = " ".join(args[cut + 1:]).strip()
        args = args[:cut]

    pid = int(" ".join(args).strip())

    if not is_running(pid):
        logging.debug("Process with PID {0} does not exist".format(pid))
        sys.exit()

    if opts.message:
        msg = opts.message
    else:
        # If notification message not specified by user, just get
        # the name of the running command and use it as the message
        from subprocess import check_output

        get_origcmd = "ps -p {0} -o cmd h".format(pid)
        msg = check_output(shlex.split(get_origcmd)).strip()

    # Poll until the process is gone
    while is_running(pid):
        sleep(opts.interval)

    if opts.notify:
        # Build an argument list for notify(): message first, then options
        notifycmd = ["[completed] {0}: `{1}`".format(gethostname(), msg)]
        if opts.notify == "email":
            notifycmd.append('--email="{0}"'.format(opts.email))
        else:
            notifycmd.extend(["--method={0}".format("push"),
                              "--api={0}".format(opts.notify)])
        notify(notifycmd)

    if cmd is not None:
        # Run in the background unless the command is sent to the grid
        bg = not opts.grid
        sh(cmd, grid=opts.grid, background=bg)
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    # Writes `<agpfile>` with ".agp" replaced by ".masked.agp" and returns
    # that path; with --log also writes `<bedfile>` with ".bed" replaced by
    # ".masklog".
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Exit non-zero on a usage error, like the other commands in this
        # file (previously this exited with status 0)
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    fwlog = open(logfile, "w") if opts.log else None

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        _, agp_line = simple_agp[component]
        obj_name = agp_line.object
        component_span = agp_line.component_span
        orientation = agp_line.orientation
        if opts.log:
            print >> fwlog, agp_line

        # NOTE(review): this only asserts `component_beg`; `component_end`
        # is the assertion message, not a second condition.
        assert agp_line.component_beg, agp_line.component_end
        arange = agp_line.component_beg, agp_line.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for interval in intervals:
            iv = range_intersect(arange, (interval.start, interval.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = agp_line.component_beg - 1, agp_line.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()
        # Any other orientation value is written out as '+'
        if orientation not in ('+', '-'):
            orientation = '+'

        sum_of_spans = 0
        # assign complements as sequence components: even-numbered pairs are
        # the kept sequence ('D' lines), odd-numbered pairs are the masked
        # intervals ('N' gap lines)
        for idx, (lo, hi) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                lo, hi = hi, lo

            # `//` keeps the suffix an integer regardless of division mode
            oid = obj_name + "_{0}".format(idx // 2) if opts.split else obj_name
            aline = [oid, 0, 0, 0]
            if idx % 2 == 0:
                cspan = hi - lo - 1
                aline += ['D', component, lo + 1, hi - 1, orientation]
                is_gap = False
            else:
                cspan = hi - lo + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            # With --split, gap lines are dropped from the replacement lines
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        # Kept sequence plus gaps must add up to the original component
        assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # The log is complete once every component is processed; release it
    # (previously the handle was never closed)
    if fwlog is not None:
        fwlog.close()

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a

    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def join(args):
    """
    %prog join file1.txt(pivotfile) file2.txt ..

    Join tabular-like files based on common column.
    --column specifies the column index to pivot on.
      Use comma to separate multiple values if the pivot column is different
      in each file. Maintain the order in the first file.
    --sep specifies the column separators, default to tab.
      Use comma to separate multiple values if the column separator is different
      in each file.
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    from jcvi.utils.iter import flatten

    def per_file(spec, convert, count):
        # Expand an option value into one entry per file: a comma-separated
        # value gives one entry per item, otherwise the single value is
        # repeated `count` times
        if "," in spec:
            return [convert(x) for x in spec.split(",")]
        return [convert(spec)] * count

    parser = OptionParser(join.__doc__)
    parser.add_option(
        "--column", default="0",
        help="0-based column id, multiple values allowed [default: %default]")
    parser.set_sep(multiple=True)
    parser.add_option(
        "--noheader", default=False, action="store_true",
        help="Do not print header [default: %default]")
    parser.add_option(
        "--na", default="na",
        help="Value for unjoined data [default: %default]")
    parser.add_option(
        "--keysep", default=",",
        help="specify separator joining multiple elements in the key column"
             " of the pivot file [default: %default]")
    parser.set_outfile()

    opts, args = parser.parse_args(args)
    nargs = len(args)
    keysep = opts.keysep

    # Need the pivot file plus at least one file to join onto it
    if nargs < 2:
        sys.exit(not parser.print_help())

    na = opts.na

    key_columns = per_file(opts.column, int, nargs)
    assert len(key_columns) == nargs, "Column index number != File number"

    delimiters = per_file(opts.sep, lambda x: x, nargs)
    assert len(delimiters) == nargs, "column separator number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = []
    for filename, keypos, delim in zip(args, key_columns, delimiters):
        files.append(DictFile(filename, keypos=keypos, valuepos=None,
                              delimiter=delim))
    otherfiles = files[1:]
    # Header repeats each file's basename once per column of that file
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    fp = must_open(pivotfile)
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print >> fw, header

    pivot_delim, pivot_col = delimiters[0], key_columns[0]
    for line in fp:
        out = line.rstrip().split(pivot_delim)
        # The pivot key may hold several keys joined by `keysep`
        lookup_keys = out[pivot_col].split(keysep)
        for table in otherfiles:
            found = []
            for k in lookup_keys:
                # Keys absent from `table` are padded with `na`
                found.append(table.get(k, [na] * table.ncols))
            # Merge the looked-up rows column-wise with `keysep`
            out += [keysep.join(col) for col in zip(*found)]
        print >> fw, "\t".join(out)
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.
    """
    # NOTE: the docstring above is handed to OptionParser, so it is part of
    # the program's runtime text.
    # Writes `<agpfile>` with ".agp" replaced by ".masked.agp" and returns
    # that path; with --log also writes `<bedfile>` with ".bed" replaced by
    # ".masklog".
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option(
        "--log", default=False, action="store_true",
        help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Exit non-zero on a usage error, like the other commands in this
        # file (previously this exited with status 0)
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    fwlog = open(logfile, "w") if opts.log else None

    for component, intervals in bed.sub_beds():
        if opts.log:
            print >> fwlog, "\n".join(str(x) for x in intervals)
        _, agp_line = simple_agp[component]
        obj_name = agp_line.object
        component_span = agp_line.component_span
        orientation = agp_line.orientation
        if opts.log:
            print >> fwlog, agp_line

        # NOTE(review): this only asserts `component_beg`; `component_end`
        # is the assertion message, not a second condition.
        assert agp_line.component_beg, agp_line.component_end
        arange = agp_line.component_beg, agp_line.component_end

        # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
        ivs = []
        for interval in intervals:
            iv = range_intersect(arange, (interval.start, interval.end))
            if iv is not None:
                ivs.append(iv)

        # Sort the ends of `ivs` as well as the arange
        arange = agp_line.component_beg - 1, agp_line.component_end + 1
        endpoints = sorted(flatten(ivs + [arange]))
        # reverse if component on negative strand
        if orientation == '-':
            endpoints.reverse()
        # Any other orientation value is written out as '+'
        if orientation not in ('+', '-'):
            orientation = '+'

        sum_of_spans = 0
        # assign complements as sequence components: even-numbered pairs are
        # the kept sequence ('D' lines), odd-numbered pairs are the masked
        # intervals ('N' gap lines)
        for idx, (lo, hi) in enumerate(pairwise(endpoints)):
            if orientation == '-':
                lo, hi = hi, lo

            # `//` keeps the suffix an integer regardless of division mode
            oid = obj_name + "_{0}".format(idx // 2) if opts.split else obj_name
            aline = [oid, 0, 0, 0]
            if idx % 2 == 0:
                cspan = hi - lo - 1
                aline += ['D', component, lo + 1, hi - 1, orientation]
                is_gap = False
            else:
                cspan = hi - lo + 1
                aline += ["N", cspan, "fragment", "yes"]
                is_gap = True
            if cspan <= 0:
                continue

            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            # With --split, gap lines are dropped from the replacement lines
            if not (opts.split and is_gap):
                agp_fixes[component].append(aline)

            if opts.log:
                print >> fwlog, aline

        # NOTE: the span consistency check is deliberately left disabled, as
        # in the original code:
        #assert component_span == sum_of_spans
        if opts.log:
            print >> fwlog

    # The log is complete once every component is processed; release it
    # (previously the handle was never closed)
    if fwlog is not None:
        fwlog.close()

    # Finally write the masked agp
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            print >> fw, "\n".join(agp_fixes[a.component_id])
        else:
            print >> fw, a

    fw.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile