def fetch(self, data=None, headers=None):
  """Fetch http file from network.

  Args:
    data: {str:*} of data to be sent via HTTP
    headers: {str:str} of additional request HTTP headers
  Returns:
    [*str] of file-pointer-like HTTP stream.
  """
  # Fetch request.
  if self.type == "http":
    rsp = self._fetch_http(data, headers)
  elif self.type == "ftp":
    rsp = self._fetch_ftp()
  else:
    Log.warning("Unknown type, cannot fetch %s for %s." % (self.url, self))
    return None
  self.status = 200
  # Convert header keys into all lower case.
  self.headers = {}
  for key, value in dict(rsp.info()).items():
    self.headers[key.lower()] = value
  self.url_rsp = rsp.geturl()
  return rsp
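# A minimal, standalone sketch of the HTTP branch used by fetch() above,
# assuming Python 2's urllib2 (the actual `_fetch_http` is defined elsewhere
# and may add retries, timeouts, or auth). `_example_fetch_http` is a
# hypothetical name used here to avoid shadowing the real helper.
import urllib
import urllib2

def _example_fetch_http(url, data=None, headers=None):
  """Return a file-like response supporting info() and geturl()."""
  # urllib2 issues a POST when body data is provided, else a GET.
  body = urllib.urlencode(data) if data else None
  request = urllib2.Request(url, body, headers or {})
  return urllib2.urlopen(request)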
def close(self):
  """Close any open file pointers; close and finalize the cache file."""
  # Ignore repeated calls to close().
  if self.closed:
    Log.info("Redundant call to close(); ignored for %s." % self)
    return
  else:
    Log.info("Closing %s..." % self)

  # Handle finalize requests to complete the download to the buffer.
  if self.finalize:
    if not self.completed and self.cache:
      Log.info("Finalizing download of %s." % self)
      # Read the remaining buffer unconditionally. Use the iterator if
      # reporting so that progress is still logged.
      if self.report:
        while True:
          try:
            self.next()
          except StopIteration:
            break
      else:
        self.read()
      # If not closed in the previous read(), try another read(). This
      # closes self since the previous read flushed the buffer.
      if not self.closed:
        self.read()
      if not self.closed:
        Log.warning("Close sequence not completed as expected for %s." % self)
    # Exit: prior reads in the finalize process already closed self.
    return

  # self.buffer.close() causes bugs with FTP. Python sockets clean up after
  # themselves in garbage collection, so simply remove the reference to buffer.
  # self.buffer.close()
  self.buffer = None
  self.fp_out.close()

  if self.completed:
    Log.info("Download complete. %d bytes read." % (self.bytes_read))
    # Finalize cache.
    if self.cache:
      os.rename(self.tmp_filepath, self.dest_filepath)
      Log.info("Cache finalized as '%s'." % (self.dest_filepath))
  else:
    Log.info("Download closed before completion. %d bytes read." % \
             (self.bytes_read))
    # Flush cache.
    if self.cache:
      os.remove(self.tmp_filepath)
      Log.info("Incomplete cache '%s' deleted." % (self.tmp_filepath))

  # Flag self as closed to prevent redundant .close() calls.
  self.closed = True
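# Hedged usage sketch of the download lifecycle implied by fetch(), read(),
# and close() above. `HTTPFile` and its constructor flags are assumed names
# for illustration; the real class and its cache/finalize/report options are
# defined elsewhere in this module.
#
#   f = HTTPFile("http://example.com/data.txt", cache=True, finalize=True)
#   rsp = f.fetch()
#   if rsp is not None:
#     for chunk in f:   # __iter__()/next() drive read() with reporting
#       process(chunk)
#     f.close()         # finalizes the cache file, or flushes it if partial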
def __init__(self, gse, merge_cols=True, percentile=.75):
  """Initialize filter. Requires populated gse.

  Args:
    gse: GSE instance associated with row_iter
    merge_cols: bool of whether to merge columns when possible
    percentile: float 0<x<=1, top fraction of rows by std to keep
  """
  # 1. Require that GSE is populated and is of the correct type.
  # ==========
  if not gse.populated:
    raise geo.NotPopulatedError, "%s must be populated to filter rows." % gse
  if gse.type != "eQTL":
    raise geo.StudyTypeMismatch, "%s must be type 'eQTL', not '%s'." % \
      (gse, gse.type)

  # 2. Set attributes.
  # ==========
  self.gse = gse
  self.col_titles = self.gse.col_titles[:]
  self.col_map = None
  self.rows_filtered = []
  self.rows_per_gene = {}
  self.row_stats = {}
  self.merge_cols = merge_cols
  self.percentile = percentile

  # 3. Get column map for column merging.
  # ==========
  n_samples = len(self.gse.samples)
  n_uniques = len(self.gse.subject_gsms)
  # If there are more samples than unique subjects, create a column map.
  if self.merge_cols and n_samples > n_uniques:
    self.col_map = self._make_col_map()
    rx_str = self.gse.parameters['rx_gsm_subject_str']
    Log.info(("Created column merge map for %s (%d samples to %d subjects)" + \
              " with rx '%s'") % \
             (self.gse, n_samples, n_uniques, rx_str))
    # Verify that the column merge map is reasonable
    # (num uniques + 1 for the ID column).
    if len(self.col_map) != n_uniques + 1:
      Log.warning("Column merge map has %d classes, expected %d in %s." % \
                  (len(self.col_map), n_uniques + 1, self))
  else:
    # No column merging scheme can exist. Do not create a col_map.
    rx_str = self.gse.parameters['rx_gsm_subject_str']
    Log.info("No column merge map created for %s using rx '%s'. " \
             "merge_cols flag is %s." % \
             (self.gse, rx_str, self.merge_cols))
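# Hedged sketch of the column-map shape assumed by the "n_uniques + 1" check
# above: one class of original column indices per unique subject, plus one
# for the ID column. The real `_make_col_map` groups GSM sample columns by
# subject using the 'rx_gsm_subject_str' regex; this toy example only
# illustrates the expected structure, not the real algorithm.
#
#   col_titles = ["ID_REF", "GSM1_rep1", "GSM1_rep2", "GSM2_rep1"]
#   # Two sample columns belong to subject 1, one to subject 2:
#   col_map = [[0], [1, 2], [3]]   # len(col_map) == n_uniques + 1 == 3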
def get_rows(self):
  """Return filtered row iterator.

  TODO: Clean this up; it may be best to break this into multiple filters.
    Fix to return [str].

  Returns:
    *[str] of filtered rows of data split by columns
  """
  Log.info("Initiated filter %s for rows of %s" % (self, self.gse))
  if self.col_map:
    Log.info("self.col_map exists. Merge %d to %d columns for %s" % \
             (len(self.col_titles), len(self.col_map), self))
  else:
    Log.info("No col_map. Will not merge %d columns for %s." % \
             (len(self.col_titles), self))

  # 0. Determine best gene name column in case GENE_SYMBOL does not exist.
  # ==========
  gene_symbol_name = None
  # Traverse column names in preferred order.
  for name in geo.GPL.EQTL_GENE_NAME_LIST:
    # Skip columns without assignments.
    if self.gse.platform.special_cols[name] is None:
      continue
    # Choose the first column that has an acceptable assignment.
    actual_column_name = self.gse.platform.special_cols[name]
    gene_symbol_name = name
    break
  # Verify that a column was chosen to identify the row.
  if gene_symbol_name:
    Log.info("Selected column '%s=>%s' to best represent gene name for %s." % \
             (gene_symbol_name, actual_column_name, self.gse.platform))
  else:
    raise MalformedFilterError, "Cannot select gene symbol column from %s" % \
      (self.gse.platform)

  # 1. Update column titles, accounting for merged columns.
  # ==========
  if self.col_map:
    self.col_titles = self._merge_cols(self.col_titles, merge_titles)
  # Insert generated column titles (AFTER merging columns).
  # self.col_titles[0] should always be "ID_REF".
  col_titles_prefix = ["ID_REF", gene_symbol_name, "NUM_VALUES", "MEAN", "STD"]
  self.col_titles = col_titles_prefix + self.col_titles[1:]
  Log.info("Added %s, NUM_VALUES, MEAN, STD to col titles for %s." % \
           (gene_symbol_name, self))

  # Open new temporary file. XXX RENAME
  filepath = temp_file_name("%s.rowmerge" % self.gse.id)
  fp_out = open(filepath, "w")

  # 2. @DATAPASS 1: Merge columns, add gene symbol, filter non-genes.
  # ==========
  Log.info(("Started filter 1 in %s for %s: find and add gene, merge cols. " + \
            "(This may take a while.)") % (self, self.gse))
  num_rows = 0
  for row in self.gse.get_rows():
    # TODO: Add status reporting to console.
    num_rows += 1
    # Determine gene symbol for this row. Filter if no gene symbol exists.
    row_id = row[0]  # Row ID should always be the first entry in a row.
    gene_sym = self.gse.platform.get_column(row_id, gene_symbol_name)
    if not gene_sym:
      self.rows_filtered.append(row_id)
      continue  # Skip this row.
    else:
      self.rows_per_gene.setdefault(gene_sym, set()).add(row_id)
    # Merge columns using column mapping of series matrix columns.
    # Also, transform row values into floats (or None).
    if self.col_map:
      # XXX _merge_cols is slow, perhaps due to float conversions.
      row = self._merge_cols(row, merge_floats)
    else:
      row = map(get_float, row)
    # Compute mean and standard deviation of all non-ID columns.
    # Check for None specifically since a valid value could be 0.
    filtered_row = filter(lambda x: x is not None, row[1:])
    std = calc_std(filtered_row)
    mean = calc_mean(filtered_row)
    num_values = len(filtered_row)
    # Store row statistics.
    self.row_stats[row_id] = \
      {'num_values': num_values, 'mean': mean, 'std': std}
    # Insert (gene_sym, num_values, mean, std) after the ID column.
    row = [row_id, gene_sym, num_values, mean, std] + row[1:]
    # Write row to temporary file.
    # TODO: I may want to compress rows by pickling them; pickling a list of
    # floats uses 2/3 the space and takes 1/2 the compute time.
fp_out.write("\t".join(map(str, row))) fp_out.write("\n") fp_out.close() # Log results of filter pass 1 # ========== n = len(self.rows_filtered) n_gene_rows = num_rows-n mean_rows_per_gene = float(num_rows-n)/len(self.rows_per_gene) if num_rows != self.gse.est_num_row: Log.warning("Num rows read(%d) not num rows expected(%d) for %s" % \ (num_rows, self.gse.est_num_row, self)) Log.info(("Filter 1 complete for %s. " + \ "%d of %d (%.2f%%) rows removed for no gene symbol. %d rows remain.") % \ (self, n, num_rows, (n/float(num_rows))*100, n_gene_rows)) Log.info("Number of unique genes: %d, %.1f mean num rows per gene." % \ (len(self.rows_per_gene), mean_rows_per_gene)) # 3: Choose representative genes from self.row_stats and self.rows_per_gene # ========== # select all rows for a gene. If a gene selected_row_ids = [] for gene, row_ids in self.rows_per_gene.items(): # If only a single row for this gene exists, choose it. if len(row_ids) == 1: best_row_id = row_ids.pop() # Else, choose row with the highest mean value. else: s = sorted(row_ids, key=lambda x: self.row_stats[x]['mean']) best_row_id = s[-1] # Add this row_id to the accepted list selected_row_ids.append(best_row_id) n_single_gene_rows = len(selected_row_ids) Log.info("Selected %d of %d rows for %d genes by maximum row mean." % \ (n_single_gene_rows, n_gene_rows, len(self.rows_per_gene))) # Sort row_ids by row standard deviation in decreasing order. selected_row_ids.sort(key=lambda x: self.row_stats[x]['std'], reverse=True) # Select top percentile by std. Convert type to set for easier membership tests. x = int(len(selected_row_ids)*self.percentile) selected_row_ids = set(selected_row_ids[:x]) threshold_num_rows = len(selected_row_ids) assert(x == threshold_num_rows) Log.info("Selected top %d%% of rows (%d of %d) by standard deviation." % (self.percentile*100, threshold_num_rows, n_single_gene_rows)) # FINAL PASS: YIELD FILTERED LINES # =========== # Open temporary file generated in first pass. fp = open(filepath, "r") # Yield (modified) column titles. yield self.col_titles[:] # For each line, only yield if the row_id is in the selected_row_ids list. num_yielded_rows = 0 for line in fp: row = line.strip().split("\t") row_id = row[0] if row_id in selected_row_ids: num_yielded_rows += 1 yield row # All lines yielded. Check number of lines yielded with expected value. if num_yielded_rows != threshold_num_rows: Log.warning("%d yielded rows != %d expected number of rows." % \ (num_yielded_rows, threshold_num_rows)) else: Log.info("Filter complete. yielded %d rows." % (num_yielded_rows))
def __iter__(self):
  """Call at start of iter read loops."""
  if self.completed or self.closed:
    Log.warning("Iterator opened on closed or completed %s" % self)
  return self
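# Hedged end-to-end usage sketch for the row filter above. `EQTLFilter` is
# an assumed class name for illustration (only __init__ and get_rows are
# shown in this file); `gse` must be a populated, type-'eQTL' GSE instance.
#
#   filt = EQTLFilter(gse, merge_cols=True, percentile=.75)
#   rows = filt.get_rows()      # generator
#   col_titles = rows.next()    # first yield is the (modified) title row
#   for row in rows:            # remaining yields are the selected rows
#     row_id, gene_sym = row[0], row[1]
#     num_values, mean, std = row[2], row[3], row[4]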