Ejemplo n.º 1
0
def group(args):
    """
    %prog group tabfile > tabfile.grouped

    Given a tab-delimited file, either group all elements within the file or
    group the elements in the value column(s) based on the key (groupby) column

    For example, convert this | into this
    ---------------------------------------
    a   2    3    4           | a,2,3,4,5,6
    a   5    6                | b,7,8
    b   7    8                | c,9,10,11
    c   9                     |
    c  10   11                |

    If grouping by a particular column,
    convert this              | into this:
    ---------------------------------------------
    a   2    3    4           | a   2,5   3,6   4
    a   5    6                | b   7     8
    b   7    8                | c   9,10  11
    c   9                     |
    c  10   11                |

    By default, it uniqifies all the grouped elements
    """
    # NOTE: the docstring above is handed to OptionParser below (presumably as
    # the usage text), so its wording is runtime output and is kept verbatim.
    #
    # args    -- list of command-line tokens: options plus exactly one tabfile.
    # returns -- None; grouped rows are printed to stdout.
    # exits   -- via sys.exit() when the argument count is wrong, and with
    #            status 1 when --groupby does not address a column of a row.
    from collections import OrderedDict

    from jcvi.utils.grouper import Grouper

    p = OptionParser(group.__doc__)
    # Presumably registers the column-separator option read back as
    # `opts.sep` -- confirm against the project's OptionParser.
    p.set_sep()
    p.add_option("--groupby",
                 default=None,
                 type="int",
                 help="Default column to groupby")
    p.add_option("--groupsep",
                 default=",",
                 help="Separator to join the grouped elements")
    p.add_option(
        "--nouniq",
        default=False,
        action="store_true",
        help="Do not uniqify the grouped elements",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (tabfile, ) = args
    sep = opts.sep
    groupby = opts.groupby
    groupsep = opts.groupsep
    uniq = not opts.nouniq

    # Widest row seen so far; output rows are padded out to this many columns.
    ncols = 0
    # Column mode: key -> {column index -> bucket of values}, in first-seen
    # order.  Whole-file mode: the project's Grouper (presumably merges every
    # atom of a row into one group -- confirm against jcvi.utils.grouper).
    grouper = OrderedDict() if groupby is not None else Grouper()

    # NOTE(review): the handle is not closed here because must_open may hand
    # back a shared stream such as stdin -- confirm before adding `with`.
    fp = must_open(tabfile)
    for row in fp:
        atoms = row.rstrip().split(sep)
        if groupby is None:
            grouper.join(*atoms)
            continue

        # Validate against *this* row: checking only the widest row seen so
        # far let a later, shorter row fall through to an IndexError.
        if not 0 <= groupby < len(atoms):
            logging.error(
                "groupby col index `{0}` is out of range".format(groupby))
            # Non-zero status so that callers can detect the failure.
            sys.exit(1)

        ncols = max(ncols, len(atoms))
        columns = grouper.setdefault(atoms[groupby], {})
        for col in range(ncols):
            if col == groupby:
                continue
            bucket = columns.get(col)
            if bucket is None:
                # An OrderedDict used as an insertion-ordered set keeps the
                # uniquified output reproducible from run to run (a plain set
                # iterates in hash order).  Buckets are created even for
                # columns this row lacks, so they are emitted as empty fields.
                bucket = columns[col] = OrderedDict() if uniq else []
            if col >= len(atoms):
                continue
            # A cell may itself hold several values joined by groupsep.
            # str.split rejects an empty separator, so treat the cell as a
            # single value in that case.
            parts = atoms[col].split(groupsep) if groupsep else [atoms[col]]
            if uniq:
                for part in parts:
                    bucket[part] = None
            else:
                bucket.extend(parts)

    if groupby is None:
        for members in grouper:
            print(groupsep.join(members))
        return

    for key, columns in grouper.items():
        line = []
        for col in range(ncols):
            if col == groupby:
                line.append(key)
            elif col in columns:
                line.append(groupsep.join(columns[col]))
            else:
                # The table grew wider after this key was last seen.
                line.append("na")
        print(sep.join(line))
Ejemplo n.º 2
0
def group(args):
    """
    %prog group tabfile > tabfile.grouped

    Given a tab-delimited file, either group all elements within the file or
    group the elements in the value column(s) based on the key (groupby) column

    For example, convert this | into this
    ---------------------------------------
    a	2    3    4           | a,2,3,4,5,6
    a	5    6                | b,7,8
    b	7    8                | c,9,10,11
    c	9                     |
    c 	10   11               |

    If grouping by a particular column,
    convert this              | into this:
    ---------------------------------------------
    a	2    3    4           | a	2,5   3,6   4
    a	5    6                | b	7     8
    b	7    8                | c	9,10  11
    c	9                     |
    c 	10   11               |

    By default, it uniqifies all the grouped elements
    """
    # NOTE: the docstring above is handed to OptionParser below (presumably as
    # the usage text), so its wording is runtime output and is kept verbatim.
    #
    # args    -- list of command-line tokens: options plus exactly one tabfile.
    # returns -- None; grouped rows are printed to stdout.
    # exits   -- via sys.exit() when the argument count is wrong, and with
    #            status 1 when --groupby does not address a column of a row.
    from collections import OrderedDict

    from jcvi.utils.grouper import Grouper

    p = OptionParser(group.__doc__)
    # Presumably registers the column-separator option read back as
    # `opts.sep` -- confirm against the project's OptionParser.
    p.set_sep()
    p.add_option("--groupby", default=None, type='int',
                 help="Default column to groupby [default: %default]")
    p.add_option("--groupsep", default=',',
                 help="Separator to join the grouped elements [default: `%default`]")
    p.add_option("--nouniq", default=False, action="store_true",
                 help="Do not uniqify the grouped elements [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    tabfile, = args
    sep = opts.sep
    groupby = opts.groupby
    groupsep = opts.groupsep
    uniq = not opts.nouniq

    # Widest row seen so far; output rows are padded out to this many columns.
    ncols = 0
    # Column mode: key -> {column index -> bucket of values}, in first-seen
    # order.  Whole-file mode: the project's Grouper (presumably merges every
    # atom of a row into one group -- confirm against jcvi.utils.grouper).
    grouper = OrderedDict() if groupby is not None else Grouper()

    # NOTE(review): the handle is not closed here because must_open may hand
    # back a shared stream such as stdin -- confirm before adding `with`.
    fp = must_open(tabfile)
    for row in fp:
        atoms = row.rstrip().split(sep)
        if groupby is None:
            grouper.join(*atoms)
            continue

        # Validate against *this* row: checking only the widest row seen so
        # far let a later, shorter row fall through to an IndexError.
        if not 0 <= groupby < len(atoms):
            logging.error("groupby col index `{0}` is out of range".format(groupby))
            # Non-zero status so that callers can detect the failure.
            sys.exit(1)

        ncols = max(ncols, len(atoms))
        columns = grouper.setdefault(atoms[groupby], {})
        # `range` (not `xrange`) so the block runs on Python 2 and Python 3.
        for col in range(ncols):
            if col == groupby:
                continue
            bucket = columns.get(col)
            if bucket is None:
                # An OrderedDict used as an insertion-ordered set keeps the
                # uniquified output reproducible from run to run (a plain set
                # iterates in hash order).  Buckets are created even for
                # columns this row lacks, so they are emitted as empty fields.
                bucket = columns[col] = OrderedDict() if uniq else []
            if col >= len(atoms):
                continue
            # A cell may itself hold several values joined by groupsep.
            # str.split rejects an empty separator, so treat the cell as a
            # single value in that case.
            parts = atoms[col].split(groupsep) if groupsep else [atoms[col]]
            if uniq:
                for part in parts:
                    bucket[part] = None
            else:
                bucket.extend(parts)

    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    if groupby is None:
        for members in grouper:
            print(groupsep.join(members))
        return

    for key, columns in grouper.items():
        line = []
        for col in range(ncols):
            if col == groupby:
                line.append(key)
            elif col in columns:
                line.append(groupsep.join(columns[col]))
            else:
                # The table grew wider after this key was last seen.
                line.append("na")
        print(sep.join(line))