Ejemplo n.º 1
0
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim.
    parser = OptionParser(merge.__doc__)
    parser.add_option("--outdir",
                      default="outdir",
                      help="Output final reads in [default: %default]")
    opts, folders = parser.parse_args(args)

    # At least one input folder is required.
    if not folders:
        sys.exit(not parser.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # The sample name is the part of the file name before the first dot.
        return op.basename(path).split(".")[0]

    # Gather every file matching `*.*.fastq` in the given folders. Sorting by
    # sample name first is required so that groupby() sees each sample as one
    # contiguous run.
    fastqs = sorted(
        flatten(glob("{0}/*.*.fastq".format(folder)) for folder in folders),
        key=sample_of)
    for sample, members in groupby(fastqs, key=sample_of):
        members = list(members)
        target = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(members, outfile=target).merge(checkexists=True)
Ejemplo n.º 2
0
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    # NOTE: the docstring above doubles as the command-line usage text and is
    # therefore left untouched.
    parser = OptionParser(merge.__doc__)
    parser.set_outdir(outdir="outdir")
    opts, folders = parser.parse_args(args)

    # Bail out with the help text when no folder was supplied.
    if not folders:
        sys.exit(not parser.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    def sample_name(path):
        # Leading dot-separated token of the file name identifies the sample.
        return op.basename(path).split(".")[0]

    # All `*.*.fastq` files across the folders, ordered by sample so that
    # groupby() below yields exactly one group per sample.
    fastqs = list(flatten(glob("{0}/*.*.fastq".format(d)) for d in folders))
    fastqs.sort(key=sample_name)

    for sample, members in groupby(fastqs, key=sample_name):
        members = list(members)
        target = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(members, outfile=target).merge(checkexists=True)
Ejemplo n.º 3
0
def notify(args):
    """
    %prog notify "Message to be sent"

    Send a message via email/push notification.

    Email notify: Recipient email address is constructed by joining the login `username`
    and `dnsdomainname` of the server

    Push notify: Uses available API
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are given as comments.
    #
    # args: list of command-line tokens. Positional tokens are joined with
    #       single spaces to form the message body.
    # Exits through sys.exit() when no message is supplied or when the
    # recipient address fails is_valid_email().
    from jcvi.utils.iter import flatten

    # Register the push methods on the shared module-level list, but only
    # once: a blind extend() appended the same names again on every call, so
    # the list (and the `choices` shown in error messages) kept growing.
    for method in available_push_api.keys():
        if method not in valid_notif_methods:
            valid_notif_methods.append(method)

    # Only the sender is taken from here; the recipient comes from the options.
    fromaddr, _default_toaddr = get_email_address()

    p = OptionParser(notify.__doc__)
    p.add_option("--method",
                 default="email",
                 choices=valid_notif_methods,
                 help="Specify the mode of notification [default: %default]")
    p.add_option("--subject",
                 default="JCVI: job monitor",
                 help="Specify the subject of the notification message")

    p.set_email()

    g1 = OptionGroup(p, "Optional `push` parameters")
    g1.add_option("--api",
                  default="pushover",
                  choices=list(flatten(available_push_api.values())),
                  help="Specify API used to send the push notification")
    g1.add_option("--priority",
                  default=0,
                  type="int",
                  help="Message priority (-1 <= p <= 2) [default: %default]")
    g1.add_option("--timestamp",
                  default=None,
                  type="int",
                  dest="timestamp",
                  help="Message timestamp in unix format [default: %default]")
    p.add_option_group(g1)

    opts, args = p.parse_args(args)

    if not args:
        logging.error("Please provide a brief message to be sent")
        sys.exit(not p.print_help())

    subject = opts.subject
    message = " ".join(args).strip()

    if opts.method == "email":
        if not is_valid_email(opts.toaddr):
            logging.debug("Email address `{0}` is not valid!".format(
                opts.toaddr))
            # NOTE(review): exits with status 0 although nothing was sent.
            sys.exit()
        toaddr = [opts.toaddr]  # TO address should be in a list
        send_email(fromaddr, toaddr, subject, message)
    else:
        pushnotify(subject, message, api=opts.api, priority=opts.priority,
                   timestamp=opts.timestamp)
Ejemplo n.º 4
0
Archivo: base.py Proyecto: rrane/jcvi
def notify(args):
    """
    %prog notify "Message to be sent"

    Send a message via email/push notification.

    Email notify: Recipient email address is constructed by joining the login `username`
    and `dnsdomainname` of the server

    Push notify: Uses available API
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are given as comments.
    #
    # args: list of command-line tokens. Positional tokens are joined with
    #       single spaces to form the message body.
    # Exits through sys.exit() when no message is supplied or when the
    # recipient address fails is_valid_email().
    from jcvi.utils.iter import flatten

    debug()

    # Add each push method to the shared module-level list at most once.
    # The previous unconditional extend() re-appended the same names on every
    # call, so the list of valid choices grew with duplicates.
    for method in available_push_api.keys():
        if method not in valid_notif_methods:
            valid_notif_methods.append(method)

    # Only the sender is taken from here; the recipient comes from the options.
    fromaddr, _default_toaddr = get_email_address()

    p = OptionParser(notify.__doc__)
    p.add_option("--method", default="email", choices=valid_notif_methods,
                 help="Specify the mode of notification [default: %default]")
    p.add_option("--subject", default="JCVI: job monitor",
                 help="Specify the subject of the notification message")

    p.set_email()

    g1 = OptionGroup(p, "Optional `push` parameters")
    g1.add_option("--api", default="pushover",
                  choices=list(flatten(available_push_api.values())),
                  help="Specify API used to send the push notification"
                       " [default: %default]")
    g1.add_option("--priority", default=0, type="int",
                  help="Message priority (-1 <= p <= 2) [default: %default]")
    g1.add_option("--timestamp", default=None, type="int",
                  dest="timestamp",
                  help="Message timestamp in unix format [default: %default]")
    p.add_option_group(g1)

    opts, args = p.parse_args(args)

    if not args:
        logging.error("Please provide a brief message to be sent")
        sys.exit(not p.print_help())

    subject = opts.subject
    message = " ".join(args).strip()

    if opts.method == "email":
        if not is_valid_email(opts.toaddr):
            logging.debug("Email address `{0}` is not valid!".format(opts.toaddr))
            # NOTE(review): exits with status 0 although nothing was sent.
            sys.exit()
        toaddr = [opts.toaddr]   # TO address should be in a list
        send_email(fromaddr, toaddr, subject, message)
    else:
        pushnotify(subject, message, api=opts.api, priority=opts.priority,
                   timestamp=opts.timestamp)
Ejemplo n.º 5
0
Archivo: base.py Proyecto: bennyyu/jcvi
def join(args):
    """
    %prog join file1.txt file2.txt ..

    Join tabular files based on common column. --column specifies the column
    index to pivot on. Use comma to separate multiple values if the pivot column
    is different in each file. Maintain the order in the first file.
    """
    # NOTE: the docstring above is used as the command-line usage text and is
    # kept verbatim.
    #
    # args: list of command-line tokens; at least two file names are required.
    # Output goes to the handle returned by must_open(opts.outfile, "w"): one
    # optional header line, then one line per row of the first (pivot) file
    # with the matching rows of the other files appended ("na" when absent).
    from jcvi.utils.iter import flatten

    p = OptionParser(join.__doc__)
    p.add_option("--column", default="0",
                 help="0-based column id, multiple values allowed [default: %default]")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Do not print header [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    nargs = len(args)

    if nargs < 2:
        sys.exit(not p.print_help())

    # One key column per file: either given explicitly as a comma-separated
    # list, or a single value reused for every file.
    c = opts.column
    if "," in c:
        cc = [int(x) for x in c.split(",")]
    else:
        cc = [int(c)] * nargs

    assert len(cc) == nargs, "Column index number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = [DictFile(f, keypos=keycol, valuepos=None, delimiter="\t")
             for f, keycol in zip(args, cc)]
    otherfiles = files[1:]
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    # `with` guarantees the pivot file handle is released (it used to leak).
    with open(pivotfile) as fp:
        # The output handle is left open on purpose: what must_open() returns
        # is not visible here (NOTE(review): may be a shared stream).
        fw = must_open(opts.outfile, "w")
        # fw.write() is used instead of the Python 2-only `print >> fw`
        # statement, which raises TypeError under Python 3.
        if not opts.noheader:
            fw.write(header + "\n")

        for row in fp:
            atoms = row.rstrip().split("\t")
            key = atoms[cc[0]]
            for d in otherfiles:
                atoms += d.get(key, ["na"] * d.ncols)
            fw.write("\t".join(atoms) + "\n")
Ejemplo n.º 6
0
def join(args):
    """
    %prog join file1.txt file2.txt ..

    Join tabular files based on common column. --column specifies the column
    index to pivot on. Use comma to separate multiple values if the pivot column
    is different in each file. Maintain the order in the first file.
    """
    # NOTE: the docstring above is used as the command-line usage text and is
    # kept verbatim.
    #
    # args: list of command-line tokens; at least two file names are required.
    # Output goes to the handle returned by must_open(opts.outfile, "w"): one
    # optional header line, then one line per row of the first (pivot) file
    # with the matching rows of the other files appended ("na" when absent).
    from jcvi.utils.iter import flatten

    p = OptionParser(join.__doc__)
    p.add_option("--column", default="0",
                 help="0-based column id, multiple values allowed [default: %default]")
    p.add_option("--noheader", default=False, action="store_true",
                 help="Do not print header [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    nargs = len(args)

    if nargs < 2:
        sys.exit(not p.print_help())

    # Resolve the key column of every file: a comma-separated list gives one
    # index per file, a single value applies to all of them.
    c = opts.column
    if "," in c:
        cc = [int(x) for x in c.split(",")]
    else:
        cc = [int(c)] * nargs

    assert len(cc) == nargs, "Column index number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = [DictFile(f, keypos=keycol, valuepos=None, delimiter="\t")
             for f, keycol in zip(args, cc)]
    otherfiles = files[1:]
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    # The pivot file is now closed deterministically by the `with` block;
    # previously its handle was never released.
    with open(pivotfile) as fp:
        # The output handle is not closed here because what must_open()
        # returns is not visible (NOTE(review): may be a shared stream).
        fw = must_open(opts.outfile, "w")
        # Plain write() calls replace `print >> fw`, which only works as a
        # Python 2 statement and raises TypeError under Python 3.
        if not opts.noheader:
            fw.write(header + "\n")

        for row in fp:
            atoms = row.rstrip().split("\t")
            key = atoms[cc[0]]
            for d in otherfiles:
                atoms += d.get(key, ["na"] * d.ncols)
            fw.write("\t".join(atoms) + "\n")
Ejemplo n.º 7
0
    def fix_orientation(self, tour):
        """
        Re-evaluate the orientation of every scaffold in `tour`.

        For each linkage group, each scaffold's series is tried as-is and
        reversed while all other scaffolds are held fixed, and both
        arrangements are scored with `lms` (the longest monotonic chain
        length, going by the original description). Non-zero score
        differences are collected per scaffold together with the map name,
        combined through `self.weighted_mean`, and the sign of that value
        becomes the scaffold's orientation.

        `tour` is a sequence of (scaffold, orientation) pairs. A new list of
        pairs in the same scaffold order is returned; scaffolds without any
        non-zero difference keep their current orientation.
        """
        current = dict(tour)  # orientations before any correction
        votes = defaultdict(list)
        scaffolds, _ = zip(*tour)

        for mlg in self.linkage_groups:
            lg = mlg.lg
            mapname = mlg.mapname
            for scaffold, _ in tour:
                pos = scaffolds.index(scaffold)
                # Series of everything before and after the tested scaffold.
                before = [self.get_series(lg, x, xo) for x, xo in tour[:pos]]
                after = [self.get_series(lg, x, xo) for x, xo in tour[pos + 1:]]
                before = list(flatten(before))
                after = list(flatten(after))
                middle = self.get_series(lg, scaffold)

                forward = lms(before + middle + after)
                flipped = lms(before + middle[::-1] + after)
                gain = forward[0] - flipped[0]
                if gain:
                    # Only informative (non-zero) differences get a vote.
                    votes[scaffold].append((gain, mapname))

        fixed = 0
        for scaffold, evidence in votes.items():
            wanted = np.sign(self.weighted_mean(evidence))
            if wanted != current[scaffold]:
                current[scaffold] = wanted
                fixed += 1

        tour = [(x, current[x]) for x in scaffolds]
        logging.debug("Fixed orientations for {0} scaffolds.".format(fixed))
        return tour
Ejemplo n.º 8
0
    def fix_orientation(self, tour):
        """
        Decide, scaffold by scaffold, whether flipping improves the tour.

        Within every linkage group the scaffold under test is scored twice
        with `lms` -- once with its series as returned by `get_series` and
        once reversed -- while the series of the scaffolds before and after
        it stay unchanged. Each non-zero difference (forward minus reversed)
        is recorded for the scaffold along with the map name. The recorded
        values are then reduced with `self.weighted_mean`, and its sign
        replaces the stored orientation when the two differ.

        `tour` is a sequence of (scaffold, orientation) pairs; the return
        value is a new list of pairs in the same scaffold order.
        """
        orientation_of = dict(tour)  # starting configuration
        evidence_by_scaffold = defaultdict(list)
        scaffolds, _ = zip(*tour)

        for mlg in self.linkage_groups:
            lg = mlg.lg
            mapname = mlg.mapname
            for scaffold, _ in tour:
                at = scaffolds.index(scaffold)
                left = [self.get_series(lg, x, xo) for x, xo in tour[:at]]
                right = [self.get_series(lg, x, xo) for x, xo in tour[at + 1:]]
                left, right = list(flatten(left)), list(flatten(right))
                own = self.get_series(lg, scaffold)

                as_is = lms(left + own + right)
                reversed_ = lms(left + own[::-1] + right)
                delta = as_is[0] - reversed_[0]
                if delta:
                    # Ties carry no information and are skipped.
                    evidence_by_scaffold[scaffold].append((delta, mapname))

        fixed = 0
        for scaffold, evidence in evidence_by_scaffold.items():
            mean = self.weighted_mean(evidence)
            before = orientation_of[scaffold]
            after = np.sign(mean)
            if after != before:
                orientation_of[scaffold] = after
                fixed += 1

        tour = [(x, orientation_of[x]) for x in scaffolds]
        logging.debug("Fixed orientations for {0} scaffolds.".format(fixed))
        return tour
Ejemplo n.º 9
0
def make_attributes(s, gff3=True):
    """
    Parse the attribute column of a GFF line into a mapping of key -> values.

    With `gff3` true the string is handed to `parse_qs`; a typical GFF3
    attribute column looks like:
    ID=cds00002;Parent=mRNA00002;

    Otherwise the GFF2 layout is assumed, e.g.:
    Gene 22240.t000374; Note "Carbonic anhydrase"
    Fields are split on "; ", the first space separates key from value, and
    double quotes are removed from the value.

    In both cases every value is finally split on "," so that each key maps
    to a flat list of strings.
    """
    if gff3:
        attrs = parse_qs(s)
    else:
        attrs = DefaultOrderedDict(list)
        for field in s.split("; "):
            tag, value = field.strip().split(' ', 1)
            attrs[tag].append(value.replace('"', ''))

    # Expand comma-separated values into individual list entries.
    for tag, values in attrs.items():
        attrs[tag] = list(flatten([v.split(",") for v in values]))

    return attrs
Ejemplo n.º 10
0
def make_attributes(s, gff3=True):
    """
    Turn the last column of a GFF record into a dict-like of value lists.

    GFF3 style (`gff3` true), delegated to `parse_qs`:
    ID=cds00002;Parent=mRNA00002;

    GFF2 style (`gff3` false), parsed by hand:
    Gene 22240.t000374; Note "Carbonic anhydrase"
    Each "; "-separated field is stripped, split once on a space into key and
    value, and the value has its double quotes dropped.

    Whatever the style, comma-separated values are split apart afterwards, so
    the result maps each key to a flat list of strings.
    """
    if not gff3:
        parsed = DefaultOrderedDict(list)
        for chunk in s.split("; "):
            name, text = chunk.strip().split(' ', 1)
            text = text.replace('"', '')
            parsed[name].append(text)
    else:
        parsed = parse_qs(s)

    # One entry per comma-separated item.
    for name, texts in parsed.items():
        pieces = [t.split(",") for t in texts]
        parsed[name] = list(flatten(pieces))

    return parsed
Ejemplo n.º 11
0
def waitpid(args):
    """
    %prog waitpid PID ::: "./command_to_run param1 param2 ...."

    Given a PID, this script will wait for the PID to finish running and
    then perform a desired action (notify user and/or execute a new command)

    Specify "--notify=METHOD` to send the user a notification after waiting for PID
    Specify `--grid` option to send the new process to the grid after waiting for PID
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are given as comments.
    #
    # args: list of command-line tokens: the PID, optionally followed by
    #       ":::" and the command to run once the PID is gone.
    # Exits through sys.exit() when no argument is given or when the PID does
    # not exist.
    import shlex
    from jcvi.utils.iter import flatten

    # Make the push API names valid --notify choices. Each name is added to
    # the shared module-level list only once; the former unconditional
    # extend() appended duplicates on every call.
    for method in flatten(available_push_api.values()):
        if method not in valid_notif_methods:
            valid_notif_methods.append(method)

    p = OptionParser(waitpid.__doc__)
    p.add_option("--notify",
                 default="email",
                 choices=valid_notif_methods,
                 help="Specify type of notification to be sent after waiting")
    p.add_option("--interval",
                 default=120,
                 type="int",
                 help="Specify PID polling interval in seconds")
    p.add_option("--message",
                 help="Specify notification message [default: %default]")
    p.set_email()
    p.set_grid()
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    # Everything after ":::" is the follow-up command; what precedes it is
    # the PID.
    sep = ":::"
    cmd = None
    if sep in args:
        sepidx = args.index(sep)
        cmd = " ".join(args[sepidx + 1:]).strip()
        args = args[:sepidx]

    pid = int(" ".join(args).strip())

    if not pid_exists(pid):
        logging.debug("Process with PID {0} does not exist".format(pid))
        sys.exit()

    if opts.message:
        msg = opts.message
    else:
        # If notification message not specified by user, just get
        # the name of the running command and use it as the message.
        # This must happen before waiting, while the process still exists.
        from subprocess import check_output

        get_origcmd = "ps -p {0} -o cmd h".format(pid)
        msg = check_output(shlex.split(get_origcmd)).strip()
    _waitpid(pid, interval=opts.interval)

    if opts.notify:
        notifycmd = ["[{0}] `{1}`".format(gethostname(), msg)]
        if opts.notify != "email":
            notifycmd.append("--method={0}".format("push"))
            notifycmd.append("--api={0}".format(opts.notify))
        else:
            notifycmd.append('--email={0}'.format(opts.email))
        notify(notifycmd)

    if cmd is not None:
        # Run in the background unless the command is sent to the grid.
        bg = not opts.grid
        sh(cmd, grid=opts.grid, background=bg)
Ejemplo n.º 12
0
def join(args):
    """
    %prog join file1.txt(pivotfile) file2.txt ..

    Join tabular-like files based on common column.
    --column specifies the column index to pivot on.
      Use comma to separate multiple values if the pivot column is different
      in each file. Maintain the order in the first file.
    --sep specifies the column separators, default to tab.
      Use comma to separate multiple values if the column separator is different
      in each file.
    """
    # NOTE: the docstring above is used as the command-line usage text and is
    # kept verbatim.
    #
    # args: list of command-line tokens; at least two file names are required.
    # Output goes to the handle returned by must_open(opts.outfile, "w"): one
    # optional header line, then one tab-joined line per row of the pivot
    # file with the matching values of the other files appended.
    from jcvi.utils.iter import flatten

    p = OptionParser(join.__doc__)
    p.add_option("--column", default="0",
                 help="0-based column id, multiple values allowed [default: %default]")
    p.set_sep(multiple=True)
    p.add_option("--noheader", default=False, action="store_true",
                 help="Do not print header [default: %default]")
    p.add_option("--na", default="na",
                 help="Value for unjoined data [default: %default]")
    p.add_option("--keysep", default=",",
                 help="specify separator joining multiple elements in the key column"
                 + " of the pivot file [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    nargs = len(args)

    keysep = opts.keysep

    if nargs < 2:
        sys.exit(not p.print_help())

    na = opts.na

    # Key column per file: explicit comma-separated list, or one value for all.
    c = opts.column
    if "," in c:
        cc = [int(x) for x in c.split(",")]
    else:
        cc = [int(c)] * nargs

    assert len(cc) == nargs, "Column index number != File number"

    # Column separator per file, resolved the same way as the key columns.
    s = opts.sep
    if "," in s:
        ss = s.split(",")
    else:
        ss = [s] * nargs

    assert len(ss) == nargs, "column separator number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = [DictFile(f, keypos=keycol, valuepos=None, delimiter=delim)
             for f, keycol, delim in zip(args, cc, ss)]
    otherfiles = files[1:]
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    fp = must_open(pivotfile)
    fw = must_open(opts.outfile, "w")
    # fw.write() replaces the Python 2-only `print >> fw` statement, which
    # raises TypeError under Python 3.
    if not opts.noheader:
        fw.write(header + "\n")

    for row in fp:
        atoms = row.rstrip().split(ss[0])
        # The pivot key may hold several sub-keys joined by `keysep`;
        # str.split() already yields [key] when the separator is absent.
        subkeys = atoms[cc[0]].split(keysep)
        for d in otherfiles:
            drows = [d.get(subkey, [na] * d.ncols) for subkey in subkeys]
            # Column-wise merge: the values found for each sub-key are
            # joined with `keysep` within every output column.
            atoms += [keysep.join(x) for x in zip(*drows)]
        fw.write("\t".join(atoms) + "\n")
Ejemplo n.º 13
0
def waitpid(args):
    """
    %prog waitpid PID ::: "./command_to_run param1 param2 ...."

    Given a PID, this script will wait for the PID to finish running and
    then perform a desired action (notify user and/or execute a new command)

    Specify "--notify=METHOD` to send the user a notification after waiting for PID
    Specify `--grid` option to send the new process to the grid after waiting for PID
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are given as comments.
    #
    # args: list of command-line tokens: the PID, optionally followed by
    #       ":::" and the command to run once the PID is gone.
    # Exits through sys.exit() when no argument is given or when the PID is
    # not running.
    import shlex
    from time import sleep
    from jcvi.utils.iter import flatten

    # Make the push API names valid --notify choices. Each name is added to
    # the shared module-level list only once; the former unconditional
    # extend() appended duplicates on every call.
    for method in flatten(available_push_api.values()):
        if method not in valid_notif_methods:
            valid_notif_methods.append(method)

    p = OptionParser(waitpid.__doc__)
    p.add_option("--notify", default=None, choices=valid_notif_methods,
                 help="Specify type of notification to be sent after waiting")
    p.add_option("--interval", default=120, type="int",
                 help="Specify PID polling interval in seconds")
    p.add_option("--message",
                 help="Specify notification message [default: %default]")
    p.set_email()
    p.set_grid()
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    # Everything after ":::" is the follow-up command; what precedes it is
    # the PID.
    sep = ":::"
    cmd = None
    if sep in args:
        sepidx = args.index(sep)
        cmd = " ".join(args[sepidx + 1:]).strip()
        args = args[:sepidx]

    pid = int(" ".join(args).strip())

    if not is_running(pid):
        logging.debug("Process with PID {0} does not exist".format(pid))
        sys.exit()

    if opts.message:
        msg = opts.message
    else:
        # If notification message not specified by user, just get
        # the name of the running command and use it as the message.
        # This must happen before waiting, while the process still exists.
        from subprocess import check_output

        get_origcmd = "ps -p {0} -o cmd h".format(pid)
        msg = check_output(shlex.split(get_origcmd)).strip()

    # Poll until the process is gone.
    while is_running(pid):
        sleep(opts.interval)

    if opts.notify:
        notifycmd = ["[completed] {0}: `{1}`".format(gethostname(), msg)]
        if opts.notify != "email":
            notifycmd.append("--method={0}".format("push"))
            notifycmd.append("--api={0}".format(opts.notify))
        else:
            # No shell is involved when the list is handed to notify(), so
            # quotes around the value would become part of the address
            # itself; pass the bare value.
            notifycmd.append('--email={0}'.format(opts.email))
        notify(notifycmd)

    if cmd is not None:
        # Run in the background unless the command is sent to the grid.
        bg = not opts.grid
        sh(cmd, grid=opts.grid, background=bg)
Ejemplo n.º 14
0
Archivo: agp.py Proyecto: bennyyu/jcvi
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in componets to gaps.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are given as comments.
    #
    # args: list of command-line tokens; exactly two are expected, the AGP
    #       file and the BED file holding the ranges to mask.
    # Returns the name of the masked AGP file that was written
    # (".agp" replaced by ".masked.agp" in the input name).
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not` makes a usage error exit with a non-zero status, matching
        # the other commands.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    fwlog = open(logfile, "w") if opts.log else None

    def log(text=""):
        # Write one line to the verbose log, if logging was requested.
        if fwlog is not None:
            fwlog.write(str(text) + "\n")

    try:
        for component, intervals in bed.sub_beds():
            if fwlog is not None:
                log("\n".join(str(x) for x in intervals))
            _, agpline = simple_agp[component]
            obj = agpline.object
            component_span = agpline.component_span
            orientation = agpline.orientation
            log(agpline)

            # NOTE(review): this only asserts component_beg; component_end
            # is the assertion message, not a second condition.
            assert agpline.component_beg, agpline.component_end
            arange = agpline.component_beg, agpline.component_end

            # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
            ivs = []
            for interval in intervals:
                iv = range_intersect(arange, (interval.start, interval.end))
                if iv is not None:
                    ivs.append(iv)

            # Sort the ends of `ivs` as well as the arange
            arange = agpline.component_beg - 1, agpline.component_end + 1
            endpoints = sorted(flatten(ivs + [arange]))
            # reverse if component on negative strand
            on_minus = orientation == '-'
            if on_minus:
                endpoints.reverse()
            elif orientation != '+':
                # Any other orientation value is written out as '+'.
                orientation = '+'

            sum_of_spans = 0
            # assign complements as sequence components: even-numbered
            # pairs are kept sequence, odd-numbered pairs are masked ranges
            for idx, (beg, end) in enumerate(pairwise(endpoints)):
                if on_minus:
                    beg, end = end, beg

                # Floor division keeps the suffix an integer on Python 3 too.
                oid = obj + "_{0}".format(idx // 2) if opts.split else obj
                aline = [oid, 0, 0, 0]
                if idx % 2 == 0:
                    cspan = end - beg - 1
                    aline += ['D', component, beg + 1, end - 1, orientation]
                    is_gap = False
                else:
                    cspan = end - beg + 1
                    aline += ["N", cspan, "fragment", "yes"]
                    is_gap = True
                if cspan <= 0:
                    continue

                sum_of_spans += cspan
                aline = "\t".join(str(x) for x in aline)
                # In split mode the gap lines are left out of the output.
                if not (opts.split and is_gap):
                    agp_fixes[component].append(aline)

                log(aline)

            assert component_span == sum_of_spans
            log()
    finally:
        # The log handle used to be left open; always release it.
        if fwlog is not None:
            fwlog.close()

    # Finally write the masked agp. write() replaces the Python 2-only
    # `print >> fw` statement, which raises TypeError under Python 3.
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
        else:
            fw.write(str(a) + "\n")

    fw.close()
    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
Ejemplo n.º 15
0
def join(args):
    """
    %prog join file1.txt(pivotfile) file2.txt ..

    Join tabular-like files based on common column.
    --column specifies the column index to pivot on.
      Use comma to separate multiple values if the pivot column is different
      in each file. Maintain the order in the first file.
    --sep specifies the column separators, default to tab.
      Use comma to separate multiple values if the column separator is different
      in each file.
    """
    # NOTE: the docstring above is used as the command-line usage text and is
    # kept verbatim.
    #
    # args: list of command-line tokens; at least two file names are required.
    # Output goes to the handle returned by must_open(opts.outfile, "w"): one
    # optional header line, then one tab-joined line per row of the pivot
    # file with the matching values of the other files appended.
    from jcvi.utils.iter import flatten

    p = OptionParser(join.__doc__)
    p.add_option(
        "--column",
        default="0",
        help="0-based column id, multiple values allowed [default: %default]")
    p.set_sep(multiple=True)
    p.add_option("--noheader",
                 default=False,
                 action="store_true",
                 help="Do not print header [default: %default]")
    p.add_option("--na",
                 default="na",
                 help="Value for unjoined data [default: %default]")
    p.add_option(
        "--keysep",
        default=",",
        help="specify separator joining multiple elements in the key column" +
        " of the pivot file [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    nargs = len(args)

    keysep = opts.keysep

    if nargs < 2:
        sys.exit(not p.print_help())

    na = opts.na

    # Key column per file: explicit comma-separated list, or one value for all.
    c = opts.column
    if "," in c:
        cc = [int(x) for x in c.split(",")]
    else:
        cc = [int(c)] * nargs

    assert len(cc) == nargs, "Column index number != File number"

    # Column separator per file, resolved the same way as the key columns.
    s = opts.sep
    if "," in s:
        ss = s.split(",")
    else:
        ss = [s] * nargs

    assert len(ss) == nargs, "column separator number != File number"

    # Maintain the first file line order, and combine other files into it
    pivotfile = args[0]
    files = [DictFile(f, keypos=keycol, valuepos=None, delimiter=delim)
             for f, keycol, delim in zip(args, cc, ss)]
    otherfiles = files[1:]
    header = "\t".join(flatten([op.basename(x.filename)] * x.ncols
                               for x in files))

    fp = must_open(pivotfile)
    fw = must_open(opts.outfile, "w")
    # Plain write() calls replace `print >> fw`, which only works as a
    # Python 2 statement and raises TypeError under Python 3.
    if not opts.noheader:
        fw.write(header + "\n")

    for row in fp:
        atoms = row.rstrip().split(ss[0])
        # The pivot key may hold several sub-keys joined by `keysep`;
        # str.split() already yields [key] when the separator is absent.
        subkeys = atoms[cc[0]].split(keysep)
        for d in otherfiles:
            drows = [d.get(subkey, [na] * d.ncols) for subkey in subkeys]
            # Column-wise merge: the values found for each sub-key are
            # joined with `keysep` within every output column.
            atoms += [keysep.join(x) for x in zip(*drows)]
        fw.write("\t".join(atoms) + "\n")
Ejemplo n.º 16
0
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in components to gaps.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes are given as comments.
    #
    # args: list of command-line tokens; exactly two are expected, the AGP
    #       file and the BED file holding the ranges to mask.
    # Returns the name of the masked AGP file that was written
    # (".agp" replaced by ".masked.agp" in the input name).
    p = OptionParser(mask.__doc__)
    p.add_option("--split",
                 default=False,
                 action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option(
        "--log",
        default=False,
        action="store_true",
        help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not` makes a usage error exit with a non-zero status, matching
        # the other commands.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    fwlog = open(logfile, "w") if opts.log else None

    def log(text=""):
        # Write one line to the verbose log, if logging was requested.
        if fwlog is not None:
            fwlog.write(str(text) + "\n")

    try:
        for component, intervals in bed.sub_beds():
            if fwlog is not None:
                log("\n".join(str(x) for x in intervals))
            _, agpline = simple_agp[component]
            obj = agpline.object
            component_span = agpline.component_span
            orientation = agpline.orientation
            log(agpline)

            # NOTE(review): this only asserts component_beg; component_end
            # is the assertion message, not a second condition.
            assert agpline.component_beg, agpline.component_end
            arange = agpline.component_beg, agpline.component_end

            # Make sure `ivs` contain DISJOINT ranges, and located within `arange`
            ivs = []
            for interval in intervals:
                iv = range_intersect(arange, (interval.start, interval.end))
                if iv is not None:
                    ivs.append(iv)

            # Sort the ends of `ivs` as well as the arange
            arange = agpline.component_beg - 1, agpline.component_end + 1
            endpoints = sorted(flatten(ivs + [arange]))
            # reverse if component on negative strand
            on_minus = orientation == '-'
            if on_minus:
                endpoints.reverse()
            elif orientation != '+':
                # Any other orientation value is written out as '+'.
                orientation = '+'

            sum_of_spans = 0
            # assign complements as sequence components: even-numbered
            # pairs are kept sequence, odd-numbered pairs are masked ranges
            for idx, (beg, end) in enumerate(pairwise(endpoints)):
                if on_minus:
                    beg, end = end, beg

                # Floor division keeps the suffix an integer on Python 3 too.
                oid = obj + "_{0}".format(idx // 2) if opts.split else obj
                aline = [oid, 0, 0, 0]
                if idx % 2 == 0:
                    cspan = end - beg - 1
                    aline += ['D', component, beg + 1, end - 1, orientation]
                    is_gap = False
                else:
                    cspan = end - beg + 1
                    aline += ["N", cspan, "fragment", "yes"]
                    is_gap = True
                if cspan <= 0:
                    continue

                sum_of_spans += cspan
                aline = "\t".join(str(x) for x in aline)
                # In split mode the gap lines are left out of the output.
                if not (opts.split and is_gap):
                    agp_fixes[component].append(aline)

                log(aline)

            # NOTE: the span-consistency check is disabled in this version.
            #assert component_span == sum_of_spans
            log()
    finally:
        # The log handle used to be left open; always release it.
        if fwlog is not None:
            fwlog.close()

    # Finally write the masked agp. write() replaces the Python 2-only
    # `print >> fw` statement, which raises TypeError under Python 3.
    for a in agp:
        if not a.is_gap and a.component_id in agp_fixes:
            fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
        else:
            fw.write(str(a) + "\n")

    fw.close()
    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile