def aggregate_rsem(fnames):
    """Collect expected read counts and gene lengths from RSEM result files.

    Each file is read as a tab-separated table with a header row. Columns are
    selected by name (``gene_id``, ``length``, ``expected_count``), not by
    position. Gene IDs are passed through ``rna.before('.')`` (presumably
    this strips the Ensembl version suffix -- confirm against that helper)
    and become the row index. The sample name for each column is the file's
    basename passed through the same helper.

    Parameters
    ----------
    fnames : iterable of str
        Paths to the RSEM output tables.

    Returns
    -------
    sample_counts : DataFrame
        Row index is the gene ID, one column per sample. Counts missing
        within a file are filled with 0.
    tx_lengths : Series
        Gene length averaged across the input files, matched to genes by
        gene ID (not by row position). Empty if `fnames` is empty.

    Raises
    ------
    RuntimeError
        If the input files do not all have the same number of rows.
    """
    length_colname = 'length'  # or: 'effective_length'
    prev_row_count = None
    sample_cols = {}
    length_cols = []
    for fname in fnames:
        # NB: read_table(index_col=...) is handled independently of
        # converters=/dtype=, so the converter would not be applied to an
        # index column; read gene_id as a regular column and index afterwards.
        # https://github.com/pandas-dev/pandas/issues/9435
        d = pd.read_table(
            fname,
            usecols=['gene_id', length_colname, 'expected_count'],
            converters={'gene_id': rna.before('.')},
        ).set_index('gene_id')
        if prev_row_count is None:
            prev_row_count = len(d)
        elif len(d) != prev_row_count:
            raise RuntimeError(
                "Number of rows in each input file is not equal")
        sample_id = rna.before(".")(os.path.basename(fname))
        sample_cols[sample_id] = d['expected_count'].fillna(0)
        length_cols.append(d[length_colname])

    sample_counts = pd.DataFrame(sample_cols)
    if not length_cols:
        # No input files: return an empty result instead of failing.
        tx_lengths = pd.Series(index=sample_counts.index, dtype=float)
    else:
        # Build the length table the same way as sample_counts so that rows
        # are aligned on gene ID. Stacking the raw arrays would silently
        # average lengths of different genes if the files are ordered
        # differently. Keys are positions so every file contributes a column.
        length_table = pd.DataFrame(dict(enumerate(length_cols)))
        # skipna=False: a missing length in any file yields NaN for that
        # gene, as a plain array mean would.
        tx_lengths = length_table.mean(axis=1, skipna=False)
    return sample_counts, tx_lengths
def aggregate_gene_counts(filenames):
    """Combine per-sample gene count tables into a single DataFrame.

    Each file is read as a header-less, tab-separated table with two columns
    (gene ID, count). Rows whose gene ID starts with an underscore are
    dropped; these look like summary counters such as ``__no_feature`` or
    ``__ambiguous`` (presumably htseq-count output -- confirm). The remaining
    gene IDs are passed through ``rna.before('.')`` and become the row index.
    The sample name for each column is the file's basename passed through
    the same helper.

    Parameters
    ----------
    filenames : iterable of str
        Paths to the per-sample count tables.

    Returns
    -------
    sample_counts : DataFrame
        Row index is the gene ID, one column per sample. Counts missing
        within a file are filled with 0.

    Raises
    ------
    RuntimeError
        If the input files do not all have the same number of gene rows.
    """
    prev_row_count = None
    sample_cols = {}
    for fname in filenames:
        # Do not use read_table(comment="_") to skip the summary rows: pandas
        # cuts every line at the first comment character, so any gene ID that
        # contains an underscore would lose its count column. Read the raw
        # IDs as strings and filter on the prefix instead.
        raw = pd.read_table(fname,
                            header=None,
                            names=["gene_id", "expected_count"],
                            converters={"gene_id": str})
        is_summary_row = raw["gene_id"].str.startswith("_")
        strip_suffix = rna.before(".")
        d = (raw.loc[~is_summary_row]
             .assign(gene_id=lambda t: t["gene_id"].map(strip_suffix))
             .set_index("gene_id"))
        if prev_row_count is None:
            prev_row_count = len(d)
        elif len(d) != prev_row_count:
            raise RuntimeError(
                "Number of rows in each input file is not equal")
        sample_id = rna.before(".")(os.path.basename(fname))
        sample_cols[sample_id] = d["expected_count"].fillna(0)
    sample_counts = pd.DataFrame(sample_cols)
    return sample_counts
def load_tcga_table(fname, shared_key):
    """Read a tab-separated TCGA table and index it by a unique gene key.

    Rows whose `shared_key` field is empty are discarded and the remaining
    keys are converted to integers. When several rows share the same key,
    only one is retained: rows are ranked by the length of their
    ``Hugo_Symbol`` string and then by ``before('|')`` applied to it
    (presumably the part of the symbol preceding the pipe -- confirm against
    that helper), and the first-ranked row of each key is kept. In effect the
    shortest, then alphabetically first, HUGO name wins.

    The result is indexed by `shared_key` with both rows and columns sorted,
    and a one-line summary is printed to standard error.
    """
    raw = pd.read_table(fname, dtype={shared_key: str}, na_filter=False)
    has_key = raw[shared_key] != ''
    keyed = raw[has_key].astype({shared_key: int})

    before_pipe = before('|')

    def name_rank(symbol):
        # Shorter names first; ties broken by the text before the pipe.
        return (len(symbol), before_pipe(symbol))

    ranking = keyed['Hugo_Symbol'].apply(name_rank).argsort()
    ranked = keyed.iloc[ranking]
    # drop_duplicates keeps the first occurrence, i.e. the best-ranked name.
    unique_rows = ranked.drop_duplicates(subset=shared_key)
    indexed = unique_rows.set_index(shared_key)
    table = indexed.sort_index(axis=0).sort_index(axis=1)
    print("Loaded", fname, "with shape:", table.shape, file=sys.stderr)
    return table
def load_tcga_table(fname, shared_key):
    """Load a tab-separated TCGA table, one row per unique gene key.

    Rows with an empty `shared_key` value are dropped, and the key column is
    cast to int. If a key occurs on more than one row (e.g. one Entrez gene
    ID listed under several HUGO names), a single row is kept: candidates
    are ordered by the length of ``Hugo_Symbol`` and then by the result of
    ``before('|')`` on it (presumably the text ahead of the pipe -- confirm
    against that helper), and the first one wins. This keeps the shortest
    and then alphabetically first HUGO name.

    Returns the table indexed by `shared_key`, with the index and the
    columns both sorted. The file name and resulting shape are logged at
    INFO level.
    """
    loaded = pd.read_csv(fname, sep='\t', dtype={shared_key: str},
                         na_filter=False)
    nonblank = loaded[shared_key].ne('')
    loaded = loaded[nonblank].astype({shared_key: int})

    strip_after_pipe = before('|')
    symbol_keys = loaded['Hugo_Symbol'].apply(
        lambda name: (len(name), strip_after_pipe(name)))
    positions = symbol_keys.argsort()

    # After reordering, the preferred name of each gene comes first, so
    # drop_duplicates (keep-first) retains exactly that row.
    deduped = loaded.iloc[positions].drop_duplicates(subset=shared_key)
    table = deduped.set_index(shared_key)
    table = table.sort_index(axis=0)
    table = table.sort_index(axis=1)
    logging.info("Loaded %s with shape: %s", fname, table.shape)
    return table