Exemple #1
0
def aggregate_rsem(fnames):
    """Pull the expected read counts and gene lengths from RSEM files.

    Each file is read as a tab-separated table with a header line. The
    columns named ``gene_id``, ``length`` and ``expected_count`` are selected
    by name (not by position). Gene IDs are passed through
    ``rna.before('.')`` -- presumably this keeps the text before the first
    '.', i.e. drops the Ensembl version suffix; confirm against ``rna``.

    Args:
        fnames : iterable of str
            Paths to RSEM output files, one per sample. The sample ID is the
            file's basename passed through ``rna.before('.')``; if two files
            yield the same sample ID, the later one replaces the earlier
            one's counts.

    Returns:
        sample_counts : DataFrame
            Row index is gene ID, column index is sample ID. Missing
            expected counts within a file are replaced by 0.
        tx_lengths : Series
            Gene lengths averaged across all input files, indexed like
            `sample_counts`. A gene that lacks a length in any file gets NaN.

    Raises:
        ValueError
            If `fnames` is empty.
        RuntimeError
            If the input files do not all have the same number of rows.
    """
    length_colname = 'length'  # or: 'effective_length'
    before_dot = rna.before('.')
    prev_row_count = None
    sample_cols = {}
    length_cols = []
    for fname in fnames:
        # NB: read_table(index_col=_) works independently of converters= and
        #   dtype=, so gene_id is converted as an ordinary column first and
        #   only then made the index.
        #   https://github.com/pandas-dev/pandas/issues/9435
        d = pd.read_table(
            fname,
            usecols=['gene_id', length_colname, 'expected_count'],
            converters={'gene_id': before_dot},
        ).set_index('gene_id')
        if prev_row_count is None:
            prev_row_count = len(d)
        elif len(d) != prev_row_count:
            raise RuntimeError(
                "Number of rows in each input file is not equal")
        sample_id = before_dot(os.path.basename(fname))
        sample_cols[sample_id] = d.expected_count.fillna(0)
        length_cols.append(d[length_colname])
    if not length_cols:
        raise ValueError("No RSEM input files were given")
    sample_counts = pd.DataFrame(sample_cols)
    # Align the per-file lengths on gene ID, exactly as the counts above are
    # aligned, before averaging. Stacking the raw columns by row position
    # would attach lengths to the wrong genes whenever two files list the
    # same genes in a different order.
    length_table = pd.DataFrame(dict(enumerate(length_cols)),
                                index=sample_counts.index)
    # skipna=False keeps NaN propagation identical to a plain array mean.
    tx_lengths = length_table.mean(axis=1, skipna=False)
    return sample_counts, tx_lengths
Exemple #2
0
def aggregate_gene_counts(filenames):
    """Combine per-sample gene count files into a single table.

    Each file is read as a tab-separated table without a header line; its
    two columns are taken as gene ID and count. Gene IDs are passed through
    ``rna.before('.')`` -- presumably this keeps the text before the first
    '.' and leaves IDs without a '.' unchanged; confirm against ``rna``.
    Rows whose ID starts with "__" (summary rows such as "__no_feature",
    "__ambiguous", "__too_low_aQual", "__not_aligned" and
    "__alignment_not_unique") are discarded.

    Args:
        filenames : iterable of str
            Paths to the count files, one per sample. The sample ID is the
            file's basename passed through ``rna.before('.')``.

    Returns:
        sample_counts : DataFrame
            Row index is gene ID, column index is sample ID. Missing counts
            within a file are replaced by 0.

    Raises:
        RuntimeError
            If the files do not all have the same number of gene rows.
    """
    before_dot = rna.before(".")
    prev_row_count = None
    sample_cols = {}
    for fname in filenames:
        d = pd.read_table(
            fname,
            header=None,
            names=["gene_id", "expected_count"],
            converters={"gene_id": before_dot},
        ).set_index("gene_id")
        # Drop the "__*" summary rows explicitly. Using comment="_" for this
        # would also cut every other line at its first underscore, so any
        # gene ID containing "_" would lose its count and end up as 0.
        d = d[~d.index.str.startswith("__")]
        if prev_row_count is None:
            prev_row_count = len(d)
        elif len(d) != prev_row_count:
            raise RuntimeError(
                "Number of rows in each input file is not equal")
        sample_id = before_dot(os.path.basename(fname))
        sample_cols[sample_id] = d.expected_count.fillna(0)
    sample_counts = pd.DataFrame(sample_cols)
    return sample_counts
def load_tcga_table(fname, shared_key):
    """Read a TCGA expression/CNV table, keeping one row per gene key.

    The file is read as tab-separated text. Rows whose `shared_key` column
    (e.g. Entrez_Gene_Id) is empty are dropped and the remaining keys are
    converted to integers. When several rows share one key (several HUGO
    names for one gene), only the row ranked first is kept: rows are ranked
    by the length of their Hugo_Symbol, shortest first, and then by the
    value ``before('|')`` returns for it -- presumably the text preceding
    the first '|'; confirm against the definition of ``before``.

    The result is indexed by `shared_key`, with rows and columns sorted,
    and its shape is reported on standard error.
    """
    raw = pd.read_table(fname, dtype={shared_key: str}, na_filter=False)
    has_key = raw[shared_key] != ''
    keyed = raw[has_key].astype({shared_key: int})
    strip_alias = before('|')

    def rank(symbol):
        # Shorter names first; ties broken by the part selected by before('|')
        return (len(symbol), strip_alias(symbol))

    order = keyed['Hugo_Symbol'].apply(rank).argsort()
    # drop_duplicates keeps the first occurrence, i.e. the best-ranked name
    deduped = keyed.iloc[order].drop_duplicates(subset=shared_key)
    table = deduped.set_index(shared_key).sort_index(axis=0).sort_index(axis=1)
    print("Loaded", fname, "with shape:", table.shape, file=sys.stderr)
    return table
def load_tcga_table(fname, shared_key):
    """Load a TCGA expression/CNV table with unique gene keys.

    Rows with an empty `shared_key` value (e.g. Entrez_Gene_Id) are removed
    and the key column is cast to int. If a key occurs on several rows
    (multiple HUGO names for the same gene), the row with the shortest
    Hugo_Symbol is kept; ties are broken by the value ``before('|')`` returns
    for the symbol -- presumably the part ahead of the first '|'; confirm
    against the definition of ``before``.

    Returns the table indexed by `shared_key`, with both rows and columns
    sorted. The loaded shape is logged at INFO level.
    """
    table = pd.read_csv(fname, sep='\t', dtype={shared_key: str},
                        na_filter=False)
    table = table[table[shared_key] != '']
    table = table.astype({shared_key: int})

    before_pipe = before('|')
    name_keys = table['Hugo_Symbol'].apply(
        lambda hugo: (len(hugo), before_pipe(hugo)))

    # Put the preferred name first so drop_duplicates (keep first) retains it
    table = table.iloc[name_keys.argsort()]
    table = table.drop_duplicates(subset=shared_key)
    table = table.set_index(shared_key)
    table = table.sort_index(axis=0)
    table = table.sort_index(axis=1)
    logging.info("Loaded %s with shape: %s", fname, table.shape)
    return table