#!/usr/bin/env python
'''
cut.py -- cut specified columns from a TSV-formatted file.

Assumes the file contains a header line containing column names, of which
the columns to be cut are a subset.

Modify the columns variable below.
'''

import sys

from datastore.table import Reader

# Every command-line argument is treated as the path of one input file.
transcripts = sys.argv[1:]

# Names of the columns to emit, in output order.  The string is split on
# any whitespace, so it can be wrapped freely across lines.
columns = '''id subject session row time line key
    p_utts_orig p_utts p_form p_lrb p_obj p_gloss p_orient p_mspd
    c_utts_orig c_utts c_form c_lrb c_obj c_gloss c_orient c_mspd
    context'''.split()


def pprint(values):
    '''Write *values* to stdout as a single tab-separated line.

    values -- an iterable of strings (str.join rejects anything else).
    '''
    # Parenthesised single-argument form: behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print("\t".join(values))


# Header row first, then the selected columns of every input file.
pprint(columns)

for t in transcripts:
    T = Reader(t)
    # NOTE(review): presumably values() yields one sequence of strings per
    # data row, in the order the column names are given -- confirm against
    # datastore.table.Reader.
    for row in T.values(*columns):
        pprint(row)
(19,86), (20,90)]) speech = Reader('speech.xls') ses = Reader('ses.xls') subjects = defaultdict(dict) ses_cols = 'SUBJ SEX EDU INC RACE ETHN'.split() for row in ses: subjects[row['SUBJ']] = row visits = defaultdict(dict) columns = 'subject session speaker word_types'.split() for subj, sess, spkr, wt in speech.values(*columns): age = sess_map[int(sess)] if not visits.has_key((subj, sess)): visits[subj, sess] = {'SUBJ': subj, 'SESS': sess, 'AGE': age, 'CWT': '', 'PWT': ''} if spkr == "child": visits[subj, sess]['CWT'] = wt else: visits[subj, sess]['PWT'] = wt viz_cols = 'SESS AGE PWT CWT'.split() print "\t".join(ses_cols + viz_cols) for id, data in visits.items(): subj, sess = id
from collections import defaultdict

# NOTE(review): Reader is not imported in the visible part of this script;
# presumably it comes from an import above this point -- confirm.

# Index the rows of ses.xls by the value of their SUBJ column.
ses = Reader('ses.xls')
subjects = defaultdict(dict)
ses_cols = 'SUBJ SEX EDU INC RACE ETHN'.split()
for row in ses:
    subjects[row['SUBJ']] = row

# Build one record per subject from outcomes.tsv.  Each record starts with
# every VOCB*/READ* slot set to the empty string and is filled in as rows
# for that subject are encountered.
outcomes = Reader('outcomes.tsv')
out = defaultdict(dict)
columns = 'SUBJ SESS VOCAB READ_WJ READ_GM'.split()
for subj, sess, voc, rwj, rgm in outcomes.values(*columns):
    # `in` replaces the deprecated dict.has_key(); like has_key, it does
    # not trigger the defaultdict factory.
    if subj not in out:
        out[subj] = {
            'SUBJ': subj,
            'VOCB1': '',
            'VOCB2': '',
            'VOCB3': '',
            'VOCB4': '',
            'READ1': '',
            'READ2': '',
            'READ3': '',
            'READ4': '',
            'READ5': '',
        }
    # sess is compared as a string; a non-empty VOCAB value from
    # session "5" fills the first vocabulary slot.
    if sess == "5" and voc:
        out[subj]['VOCB1'] = voc
from datastore.table import Reader

r = Reader('ses.tsv')


def pprint(args):
    '''Write *args* to stdout as a single tab-separated line.

    args -- an iterable of strings (str.join rejects anything else).
    '''
    # Parenthesised single-argument form: behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print("\t".join(args))


# Emit a header with the upper-case output names, then one line per row of
# ses.tsv using the corresponding source column names.
pprint('SUBJ SEX EDU INC RACE ETHN'.split())
for v in r.values('id', 'sex', 'edu', 'income', 'race', 'ethn'):
    pprint(v)