def combine_files(file_list, annotation_file=None, polarity_list=None,
                  print_time=False, quality_control=False, return_id='entrez',
                  normalization=None):
    """Parse feature extraction files and merge them into one data set.

    This function combines multiple technical replicates at the RNA level
    into a single experiment.  Typically multiple replicates will be used
    when dye-swapping is employed.  When return_id is 'entrez' the combined
    values are additionally collapsed to entrez gene ids.

    file_list: A list of feature extraction file names, or a single file
    name.  A list is only provided if the values are to be combined across
    all files.

    annotation_file: The agilent annotation file corresponding to the
    feature extraction files in file_list.  Required when return_id is
    'entrez'.

    polarity_list: None, a single integer, or a list of integers (1 or -1)
    indicating the polarity of each experiment.  1 indicates that the red
    channel is divided by the green channel and -1 indicates to divide the
    green channel by the red channel.  None means 1 for every file; a single
    integer is applied to every file.

    print_time: Boolean.  If True, timing information (and, for exactly two
    files with log ratios, the correlation between them) is printed.

    quality_control: Boolean.  Passed through to parse_file.

    return_id: String.  'entrez', 'accession' or 'probe'.  If 'entrez' then
    an annotation file must be supplied to convert from the accession ids on
    the chip to the entrez ids.  Any value other than 'entrez' or 'probe'
    is parsed as 'accession'.

    normalization: None or 'lowess'.  Passed through to parse_file.

    Returns a dict keyed by the requested id.  Each value is a
    defaultdict(list) of the data fields after collapse_fields has been
    applied to it.

    Raises Exception if return_id is 'entrez' and no annotation file is
    given, and ValueError if polarity_list and file_list differ in length.

    """
    return_id_lower = return_id.lower()
    if print_time:
        start_time = time()
    if return_id_lower == 'probe':
        parse_file_return_id = 'probe'
    else:
        parse_file_return_id = 'accession'
    if return_id_lower == 'entrez':
        if annotation_file is None:
            raise Exception("An annotation file must be supplied if you want " +
                            "entrez ids")
        accession_to_entrez = parse_annotation(annotation_file)
        if print_time:
            print('%s %f' % ('annotation time', time() - start_time))
            start_time = time()

    # A single file name must be wrapped, not iterated character by
    # character.  The explicit string test keeps this correct on Python 3,
    # where str defines __iter__.
    if isinstance(file_list, (str, bytes)) or \
            not hasattr(file_list, '__iter__'):
        file_list = [file_list]
    else:
        file_list = list(file_list)

    if polarity_list is None:
        polarity_list = [1] * len(file_list)
    elif hasattr(polarity_list, '__iter__'):
        polarity_list = list(polarity_list)
    else:
        # A single polarity applies to every file.
        polarity_list = [polarity_list] * len(file_list)
    if len(polarity_list) != len(file_list):
        # zip() would otherwise silently drop the unmatched files.
        raise ValueError("polarity_list has %d entries but file_list has %d"
                         % (len(polarity_list), len(file_list)))

    # Extract the values to load into the database
    parsed_files = [parse_file(the_file, the_polarity,
                               quality_control=quality_control,
                               normalization=normalization,
                               return_id=parse_file_return_id)
                    for the_file, the_polarity in zip(file_list,
                                                      polarity_list)]
    if print_time:
        print('%s %f' % ('parse time', time() - start_time))
        start_time = time()

    # If quality filtering is on then some genes may not exist in every
    # file, so take the union of the ids over all of the files.
    the_keys = set()
    for parsed_file in parsed_files:
        the_keys.update(parsed_file)
    if return_id_lower == 'entrez':
        # Only look at items that are in the annotation file
        the_keys = the_keys.intersection(accession_to_entrez)

    # Every entry carries the same fields, so read the field names from the
    # first entry of the first file that has any entries.
    the_data_fields = []
    for parsed_file in parsed_files:
        if parsed_file:
            the_data_fields = list(next(iter(parsed_file.values())).keys())
            break

    # Merge the results for each field across files into a list
    combined_data = {}
    for the_key in the_keys:
        tmp_dict = combined_data[the_key] = defaultdict(list)
        for parsed_file in parsed_files:
            # If quality filtering is on then some genes may not exist in
            # every file.
            if the_key not in parsed_file:
                continue
            the_entry = parsed_file[the_key]
            for the_field in the_data_fields:
                tmp_dict[the_field].append(the_entry[the_field])

    if print_time:
        if len(file_list) == 2 and 'log_ratio' in the_data_fields:
            paired_ratios = [v['log_ratio'] for v in combined_data.values()
                             if len(v['log_ratio']) == 2]
            # A correlation needs at least two paired observations.
            if len(paired_ratios) > 1:
                tmp_array = array(paired_ratios)
                the_correlation = pearsonr(tmp_array[:, 0], tmp_array[:, 1])
                print('Correlation for combined channels: %1.2f'
                      % round(the_correlation[0], 2))
        print('%s %f' % ('combine time', time() - start_time))
        start_time = time()

    # Collapse the lists for each field for each probe.
    for the_dict in combined_data.values():
        collapse_fields(the_dict)
    if print_time:
        print('%s %f' % ('collapse time', time() - start_time))
        start_time = time()

    if return_id_lower == 'entrez':
        # Now need to collapse to entrez gene ids; several accession ids may
        # map to the same entrez id.
        entrez_data = dict((accession_to_entrez[the_key], defaultdict(list))
                           for the_key in the_keys)
        for the_id, the_dict in combined_data.items():
            tmp_dict = entrez_data[accession_to_entrez[the_id]]
            for k, v in the_dict.items():
                tmp_dict[k].append(v)
        # Now collapse the instances where the ids have repeats.
        for the_dict in entrez_data.values():
            collapse_fields(the_dict)
        if print_time:
            print('%s %f' % ('collapse to entrez and combine time',
                             time() - start_time))
        combined_data = entrez_data

    return combined_data
def parse_file(in_file, polarity=1, quality_control=True,
               return_id='accession', normalization=None,
               lowess_parameter=0.33, log_base=10, single_channel=False,
               single_field='gProcessedSignal'):
    """Extract data from feature extraction >=9.5 files.

    Reads the tab-delimited table that follows the 'FEATURES' header row,
    optionally drops spots that fail the quality flags, and groups the
    numeric fields by return_id.

    in_file: String.  Name of input file.

    polarity: 1 or -1.  Indicates whether to do red over green (1) or
    green over red (-1).  The LogRatio column is multiplied by the polarity.
    For -1 the two channels are also swapped, so that intensity_1 is the red
    channel and intensity_2 the green one.

    quality_control: Boolean.  If True, only rows that are found, not
    saturated, not non-uniformity outliers and not manually flagged are
    kept.

    return_id: 'accession' or 'probe'

    normalization: 'lowess' or None

    lowess_parameter: Float.  Smoothing parameter for lowess normalization

    log_base: Currently unused; lowess normalization always uses log10.

    single_channel: Boolean.  If True, only single_field is read and it is
    returned as 'intensity_1'.

    single_field: String.  Column to read for single channel arrays.  Its
    first letter selects which channel's quality flags are checked.

    Returns a dict mapping each id to a defaultdict(list) of the fields
    after collapse_fields has been applied to it.

    Raises ValueError if the file has no 'FEATURES' header row or a
    required column is missing, and Exception for an unknown return_id or
    for lowess normalization of a single channel array.

    TODO: Add in a parameter to exclude based on coefficient of variation.

    """
    channel_prefixes = ['r', 'g']
    if single_channel:
        channel_prefixes = [x for x in channel_prefixes
                            if single_field.startswith(x)]

    with open(in_file) as in_file_handle:
        # Skip the preamble; the column names are on the 'FEATURES' row.
        the_header = None
        for the_line in in_file_handle:
            the_fields = the_line.rstrip('\r\n').split('\t')
            if the_fields[0] == 'FEATURES':
                the_header = the_fields
                break
        if the_header is None:
            # Reaching the end of the file without a header used to spin
            # forever on empty reads.
            raise ValueError("No 'FEATURES' header row found in %s" % in_file)
        # Parse the rows.  Skip the non-data spots and blank lines.
        the_data = [x.rstrip('\r\n').split('\t') for x in in_file_handle
                    if x.strip() and 'GE_BrightCorner' not in x
                    and 'DarkCorner' not in x]

    if return_id.lower() == 'accession':
        gene_index = the_header.index('SystematicName')
    elif return_id.lower() == 'probe':
        gene_index = the_header.index('ProbeName')
    else:
        raise Exception("return_id must be 'accession' or 'probe' not '%s'"%return_id)

    if quality_control:
        # Get the column indices for the quality control statistics and
        # assign the value that must be met to pass the test.
        quality_control_indices = {}
        for the_prefix in channel_prefixes:
            # 1 means the spot is found
            the_column = the_prefix + 'IsFound'
            if the_column in the_header:
                quality_control_indices[the_header.index(the_column)] = '1'
        for the_suffix in ['IsSaturated', 'IsFeatNonUnifOL']:
            # 0 means the spot is not saturated and not a nonuniform outlier
            for the_prefix in channel_prefixes:
                the_column = the_prefix + the_suffix
                if the_column in the_header:
                    quality_control_indices[the_header.index(the_column)] = '0'
        # 0 means it wasn't flagged by the user.
        quality_control_indices[the_header.index('IsManualFlag')] = '0'
        quality_checks = list(quality_control_indices.items())
        # Use the filtered data from here on out
        the_data = [the_row for the_row in the_data
                    if all(the_row[the_index] == passing_flag
                           for the_index, passing_flag in quality_checks)]

    if single_channel:
        # Single channel experiments don't have ratios or polarities
        column_to_index = {'intensity_1': the_header.index(single_field)}
    else:
        column_to_index = {'log_ratio': the_header.index('LogRatio'),
                           'log_error': the_header.index('LogRatioError'),
                           'p_value': the_header.index('PValueLogRatio'),
                           'intensity_1': the_header.index('gProcessedSignal'),
                           'intensity_2': the_header.index('rProcessedSignal'),
                           # The last two are in case lowess normalization
                           # needs to be performed.
                           'background_subtracted_1': the_header.index('gBGSubSignal'),
                           'background_subtracted_2': the_header.index('rBGSubSignal')}
        log_ratio_index = column_to_index['log_ratio']
        for the_row in the_data:
            the_row[log_ratio_index] = polarity * float(the_row[log_ratio_index])
        if polarity == -1:
            # Change the polarity if requested.  Doing so here makes it
            # easier to run the calculations downstream.
            column_to_index.update({'intensity_1': the_header.index('rProcessedSignal'),
                                    'intensity_2': the_header.index('gProcessedSignal'),
                                    'background_subtracted_1': the_header.index('rBGSubSignal'),
                                    'background_subtracted_2': the_header.index('gBGSubSignal')})
        # These need to be popped because they're only used during lowess
        # normalization
        channel_2_index = column_to_index.pop('background_subtracted_2')
        channel_1_index = column_to_index.pop('background_subtracted_1')

    # Apply lowess normalization to double channel data if requested.
    if normalization is not None and normalization.lower() == 'lowess':
        if single_channel:
            raise Exception('Lowess normalization does not work with single channel arrays')
        # NOTE(review): the realignment of the fit below addresses one cause
        # of wrong results, but the normalization has not been re-validated,
        # so the warning is kept.
        warn('Lowess Normalization looks off, please correct')
        # Polarity is already adjusted above
        log_ratio_index = column_to_index['log_ratio']
        # Now that we're not using the processed values we need to change
        # the intensity indices to use the unprocessed intensities.
        column_to_index['intensity_1'] = channel_1_index
        column_to_index['intensity_2'] = channel_2_index
        channel_2_signal = []
        channel_1_signal = []
        data_to_normalize = []
        # We can't take log ratios of 0 or divide by 0
        for the_row in the_data:
            channel_1_value = float(the_row[channel_1_index])
            channel_2_value = float(the_row[channel_2_index])
            if channel_1_value > 0. and channel_2_value > 0.:
                data_to_normalize.append(the_row)
                channel_1_signal.append(channel_1_value)
                channel_2_signal.append(channel_2_value)
        the_data = data_to_normalize
        if the_data:
            # Now perform the lowess normalization
            channel_2_signal = array(channel_2_signal)
            channel_1_signal = array(channel_1_signal)
            a = log10(channel_2_signal * channel_1_signal) / 2.
            m = log10(channel_2_signal / channel_1_signal)
            lowess_fit = array(r.lowess(a, m, lowess_parameter)).T
            # R's lowess returns the fitted values ordered by ascending x,
            # so put them back into row order before subtracting.
            sort_order = a.argsort(kind='mergesort')
            smoothed = m.copy()
            smoothed[sort_order] = lowess_fit[:, 1]
            m = m - smoothed
            # Now update the_data list to use the normalized values
            for the_row, the_log_ratio in zip(the_data, list(m)):
                the_row[log_ratio_index] = float(the_log_ratio)

    # Make a dictionary to return the requested values
    gene_dict = {}
    field_indices = list(column_to_index.items())
    for the_row in the_data:
        the_id = the_row[gene_index]
        if the_id not in gene_dict:
            gene_dict[the_id] = defaultdict(list)
        the_dict = gene_dict[the_id]
        for k, v in field_indices:
            the_dict[k].append(float(the_row[v]))
    # Now combine the items.  SPEED THIS UP
    for the_dict in gene_dict.values():
        collapse_fields(the_dict)
    return gene_dict