def group(args):
    """
    %prog group tabfile > tabfile.grouped

    Given a tab-delimited file, either group all elements within the file or
    group the elements in the value column(s) based on the key (groupby) column

    For example, convert this | into this
    ---------------------------------------
    a  2   3   4              | a,2,3,4,5,6
    a  5   6                  | b,7,8
    b  7   8                  | c,9,10,11
    c  9                      |
    c  10  11                 |

    If grouping by a particular column,
    convert this              | into this:
    ---------------------------------------------
    a  2   3   4              | a  2,5   3,6  4
    a  5   6                  | b  7     8
    b  7   8                  | c  9,10  11
    c  9                      |
    c  10  11                 |

    By default, it uniqifies all the grouped elements
    """
    # NOTE: the docstring above is handed to OptionParser below, so it is
    # user-facing text; implementation notes live in comments instead.
    #
    # args: list of command-line tokens (options plus exactly one tabfile).
    # Output: grouped rows are printed to stdout; nothing is returned.
    # Exits via sys.exit() when the argument count is wrong or when the
    # --groupby index does not exist in a row.
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper

    p = OptionParser(group.__doc__)
    p.set_sep()
    p.add_option("--groupby", default=None, type="int", help="Default column to groupby")
    p.add_option("--groupsep", default=",", help="Separator to join the grouped elements")
    p.add_option(
        "--nouniq",
        default=False,
        action="store_true",
        help="Do not uniqify the grouped elements",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (tabfile,) = args
    sep = opts.sep
    groupby = opts.groupby
    groupsep = opts.groupsep
    nouniq = opts.nouniq

    # `cols` tracks the column indices of the widest row seen so far.
    cols = []
    # Keyed mode uses a nested mapping key -> column -> collected values;
    # otherwise every element on a row is joined into one group.
    grouper = AutoVivification() if groupby is not None else Grouper()
    fp = must_open(tabfile)
    for row in fp:
        atoms = row.rstrip().split(sep)
        if groupby is None:
            grouper.join(*atoms)
            continue

        if len(cols) < len(atoms):
            cols = list(range(len(atoms)))
        # Validate against THIS row's width: a row narrower than the key
        # column would otherwise raise IndexError on atoms[groupby] below.
        if not 0 <= groupby < len(atoms):
            logging.error("groupby col index `{0}` is out of range".format(groupby))
            # Non-zero status so callers and shells can detect the failure.
            sys.exit(1)

        key = atoms[groupby]
        for col in cols:
            if col == groupby:
                continue
            # Missing (auto-created) or still-empty slot: install the
            # container type that matches the uniqueness setting.
            if not grouper[key][col]:
                grouper[key][col] = [] if nouniq else set()
            if col >= len(atoms):
                # Short row: the slot exists but gets no value from it.
                continue
            # A cell may itself hold several groupsep-joined values; split()
            # yields a single-element list when the separator is absent.
            members = atoms[col].split(groupsep)
            if nouniq:
                grouper[key][col].extend(members)
            else:
                grouper[key][col].update(members)

    for key in grouper:
        if groupby is None:
            print(groupsep.join(key))
            continue

        values = grouper[key]
        line = []
        for col in cols:
            if col == groupby:
                line.append(key)
            elif col in values:
                # NOTE: in uniq mode this joins a set, so element order
                # within a cell is not guaranteed.
                line.append(groupsep.join(values[col]))
            else:
                # Column appeared only after this key's rows were read.
                line.append("na")
        print(sep.join(line))
def group(args):
    """
    %prog group tabfile > tabfile.grouped

    Given a tab-delimited file, either group all elements within the file or
    group the elements in the value column(s) based on the key (groupby) column

    For example, convert this | into this
    ---------------------------------------
    a  2   3   4              | a,2,3,4,5,6
    a  5   6                  | b,7,8
    b  7   8                  | c,9,10,11
    c  9                      |
    c  10  11                 |

    If grouping by a particular column,
    convert this              | into this:
    ---------------------------------------------
    a  2   3   4              | a  2,5   3,6  4
    a  5   6                  | b  7     8
    b  7   8                  | c  9,10  11
    c  9                      |
    c  10  11                 |

    By default, it uniqifies all the grouped elements
    """
    # NOTE: the docstring above doubles as the OptionParser usage text, so
    # its wording is left alone; implementation notes are kept in comments.
    #
    # args: list of command-line tokens (options plus exactly one tabfile).
    # Output: grouped rows are printed to stdout; nothing is returned.
    # Exits via sys.exit() on a wrong argument count or when the --groupby
    # index does not exist in a row.
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper

    p = OptionParser(group.__doc__)
    p.set_sep()
    p.add_option("--groupby", default=None, type='int',
                 help="Default column to groupby [default: %default]")
    p.add_option("--groupsep", default=',',
                 help="Separator to join the grouped elements [default: `%default`]")
    p.add_option("--nouniq", default=False, action="store_true",
                 help="Do not uniqify the grouped elements [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    tabfile, = args
    sep = opts.sep
    groupby = opts.groupby
    groupsep = opts.groupsep
    keyed = groupby is not None

    # Column indices of the widest row encountered so far.
    cols = []
    # Keyed mode: nested mapping key -> column -> collected values.
    # Unkeyed mode: all elements of a row are joined into one group.
    grouper = AutoVivification() if keyed else Grouper()
    fp = must_open(tabfile)
    for row in fp:
        atoms = row.rstrip().split(sep)
        if not keyed:
            grouper.join(*atoms)
            continue

        width = len(atoms)
        if len(cols) < width:
            # range() instead of xrange() so this runs on Python 2 and 3.
            cols = list(range(width))
        # Check the key column against the current row, not just the widest
        # row seen: a short row would otherwise crash on atoms[groupby].
        if groupby < 0 or groupby >= width:
            logging.error("groupby col index `{0}` is out of range".format(groupby))
            # Report failure to the caller instead of exiting with status 0.
            sys.exit(1)

        key = atoms[groupby]
        for col in cols:
            if col == groupby:
                continue
            # Auto-created or still-empty slot: install the container that
            # matches the uniqueness setting.
            if not grouper[key][col]:
                grouper[key][col] = [] if opts.nouniq else set()
            if col < width:
                # Cells may already contain groupsep-joined values; split()
                # returns the whole cell when the separator is absent.
                pieces = atoms[col].split(groupsep)
                if opts.nouniq:
                    grouper[key][col].extend(pieces)
                else:
                    grouper[key][col].update(pieces)

    for key in grouper:
        if keyed:
            per_col = grouper[key]
            line = []
            for col in cols:
                if col == groupby:
                    line.append(key)
                elif col in per_col:
                    # NOTE: joining a set (uniq mode) gives no guaranteed
                    # element order within a cell.
                    line.append(groupsep.join(per_col[col]))
                else:
                    # Column showed up only after this key's rows were read.
                    line.append("na")
            # print() call form is valid on both Python 2 and Python 3.
            print(sep.join(line))
        else:
            print(groupsep.join(key))