def importGroups(fn):
    """Read a tab-delimited groups file into a group-name -> sample-names mapping.

    Every non-empty line is passed through ``ExpressionBuilder.cleanUpLine`` and
    must then split on tabs into exactly three fields: sample filename, group
    number and group name.  The group number is read but not used.

    Args:
        fn: Path of the groups file to read.

    Returns:
        A mapping whose keys are group names and whose values are lists of the
        sample filenames assigned to that group, in file order.  An ordered
        dictionary is used when one is available (so groups keep the order in
        which they first appear); otherwise a plain ``dict`` is returned.

    Raises:
        ValueError: If a non-empty line does not split into exactly three
            tab-separated fields.
    """
    try:
        group_db = collections.OrderedDict()
    except Exception:
        # Best-effort fallbacks for interpreters where collections.OrderedDict
        # is unavailable: the external ``ordereddict`` module, then a plain dict.
        try:
            import ordereddict
            group_db = ordereddict.OrderedDict()
        except Exception:
            group_db = {}

    # 'rU' keeps the original universal-newline read mode; the ``with`` block
    # guarantees the handle is closed even when a line fails to parse.
    with open(fn, 'rU') as group_file:
        for line in group_file:
            data = ExpressionBuilder.cleanUpLine(line)
            if not data:
                # Blank lines (e.g. a trailing newline) carry no assignment.
                continue
            sample_filename, group_number, group_name = data.split('\t')
            group_db.setdefault(group_name, []).append(sample_filename)
    return group_db
def _orderColumnsByGroup(header, group_db, filename, groups_dir):
    """Translate each group's sample names into column positions of the header.

    Args:
        header: All tab-split fields of the expression file's header row; the
            first field is the row-identifier label, the rest are sample names.
        group_db: Mapping of group name -> list of sample names.
        filename: Expression file path (used only in diagnostics).
        groups_dir: Groups file path (used only in diagnostics).

    Returns:
        A tuple ``(ordered_samples, index_lists)``: the sample names
        concatenated in group order, and one list per group holding the
        zero-based positions of that group's samples within ``header[1:]``.

    Raises:
        ValueError: If a grouped sample does not occur in the header.
    """
    sample_list = header[1:]
    # One pass over the header; the first occurrence wins, as list.index() would.
    column_of = {}
    for position, sample in enumerate(sample_list):
        column_of.setdefault(sample, position)

    ordered_samples = []
    index_lists = []
    for group in group_db:
        group_samples = group_db[group]
        missing = [sample for sample in group_samples if sample not in column_of]
        if missing:
            print('missing: %s' % (missing,))
            print(header)
            print('%s %s' % (filename, groups_dir))
            raise ValueError(
                'Unknown Error!!! Skipping cluster input file build '
                '(check column and row formats for conflicts)')
        index_lists.append([column_of[sample] for sample in group_samples])
        ordered_samples += group_samples
    return ordered_samples, index_lists


def _centerOnGroupMean(values):
    """Return ``values`` with the group average subtracted from each entry.

    Entries that cannot be converted to a number are excluded from the average
    and emitted as ``''`` so the result always has ``len(values)`` items and
    stays aligned with the header columns.
    """
    try:
        avg = statistics.avg(values)
    except Exception:
        # Averaging failed (e.g. a non-numeric placeholder is present): average
        # only the convertible entries but keep a slot for every original one.
        converted = []
        for value in values:
            try:
                converted.append(float(value))
            except (TypeError, ValueError):
                converted.append(None)
        usable = [value for value in converted if value is not None]
        try:
            avg = statistics.avg(usable)
        except Exception:
            avg = usable[0] if usable else 0
        values = converted

    centered = []
    for value in values:
        try:
            centered.append(value - avg)
        except Exception:
            centered.append('')
    return centered


def performGroupNormalization(filename,export_dir,platform):
    """Write an expression file with every value centered on its group's mean.

    The groups file path is derived from ``export_dir`` by replacing ``'exp.'``
    with ``'batch.'`` and is read with ``importGroups``.  Leading lines that
    start with ``#`` are skipped; the first remaining line is the header.  The
    output header lists the samples in group order, and every following row
    holds, for each group in turn, that group's values minus the group average.
    Values are converted to log2 first when
    ``ExpressionBuilder.checkExpressionFileFormat`` reports ``'non-log'`` data
    and either it requests conversion or ``platform`` is ``'RNASeq'``.

    Args:
        filename: Path of the input expression file (tab-delimited, first
            column is the row identifier).
        export_dir: Path of the output file.
        platform: Platform name; only compared against ``'RNASeq'``.

    Returns:
        None.  The result is written through ``export.ExportFile(export_dir)``.

    Raises:
        ValueError: If a grouped sample is absent from the header, or a data
            row has fewer values than the grouped columns require.  (These
            conditions previously aborted through a NameError.)
    """
    expressionDataFormat,increment,convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename)
    groups_dir = export_dir.replace('exp.','batch.')
    fn = unique.filepath(filename)
    group_db = importGroups(groups_dir)
    # Loop-invariant: decide once whether rows need a log2 conversion.
    log_transform = expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq')
    index_lists = None  # stays None until the header row has been processed

    export_data = export.ExportFile(export_dir)
    try:
        with open(fn,'rU') as expression_file:
            for line in expression_file:
                data = ExpressionBuilder.cleanUpLine(line)
                if not data:
                    continue  # blank line: nothing to parse
                if index_lists is None:
                    if data.startswith('#'):
                        continue  # comment lines ahead of the header
                    header = data.split('\t')
                    ordered_samples, index_lists = _orderColumnsByGroup(
                        header, group_db, filename, groups_dir)
                    ### output the new sample order (group file order)
                    export_data.write('\t'.join([header[0]]+ordered_samples)+'\n')
                    continue

                t = data.split('\t')
                gene = t[0]
                raw_values = t[1:]
                try:
                    if log_transform:
                        ### Convert to log2 RPKM values - or counts
                        all_values = [math.log(float(x)+increment,2) for x in raw_values]
                    else:
                        all_values = [float(x) for x in raw_values]
                except (TypeError, ValueError, ArithmeticError):
                    # NOTE(review): this fallback is also taken for data that is
                    # not being log-converted; if logTransformWithNAs applies a
                    # log transform (as its name suggests) such rows are handled
                    # differently from fully numeric rows - confirm intent.
                    all_values = ExpressionBuilder.logTransformWithNAs(raw_values,increment)

                gene_log_folds = []
                for sample_index_list in index_lists:
                    try:
                        values = [all_values[index] for index in sample_index_list]
                    except (IndexError, TypeError):
                        raise ValueError(
                            'Row %r has %d values but the group columns %s were requested'
                            % (gene, len(all_values), sample_index_list))
                    ### Calculate values relative to the mean of this group's samples
                    gene_log_folds += _centerOnGroupMean(values)
                export_data.write('\t'.join([gene]+[str(x) for x in gene_log_folds])+'\n')
    finally:
        # Release the output handle even when parsing aborts part-way through.
        export_data.close()