Python beforeの例

プログラミング言語: Python

名前空間/パッケージ名: cnvlib.rna

メソッド/関数: before

hotexamples.comのコード掲載数: 4

Python before - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのcnvlib.rna.beforeの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: import_rsem.py プロジェクト: zitsen/cnvkit

def aggregate_rsem(fnames):
    """Collect expected read counts and gene lengths from RSEM output files.

    Each file is parsed as a tab-delimited table with a header row; only the
    'gene_id', 'length' and 'expected_count' columns are read, selected by
    name. Gene IDs are passed through ``rna.before('.')`` while parsing
    (presumably trimming a version suffix -- confirm against ``rna.before``),
    and the same helper is applied to each file's base name to obtain the
    sample ID.

    Parameters:
        fnames : iterable of str
            Paths to the RSEM result files.

    Returns:
        sample_counts : DataFrame
            Row index is the gene ID, one column per sample ID; missing
            expected counts are replaced with 0.
        tx_lengths : Series
            Mean of the 'length' column across all input files, indexed
            like `sample_counts`.

    Raises:
        RuntimeError
            If the input files do not all contain the same number of rows.
    """
    length_field = 'length'  # or: 'effective_length'
    expected_rows = None
    counts_by_sample = {}
    lengths_per_file = []
    for path in fnames:
        # The gene ID column goes through a converter and is only promoted
        # to the index afterwards, rather than via read_table(index_col=...);
        # see https://github.com/pandas-dev/pandas/issues/9435
        frame = pd.read_table(
            path,
            usecols=['gene_id', length_field, 'expected_count'],
            converters={'gene_id': rna.before('.')},
        )
        frame = frame.set_index('gene_id')

        # Every file must have as many rows as the first one read.
        n_rows = len(frame)
        if expected_rows is None:
            expected_rows = n_rows
        if n_rows != expected_rows:
            raise RuntimeError(
                "Number of rows in each input file is not equal")

        trim_at_dot = rna.before(".")
        sample_name = trim_at_dot(os.path.basename(path))
        counts_by_sample[sample_name] = frame['expected_count'].fillna(0)
        lengths_per_file.append(frame[length_field])

    sample_counts = pd.DataFrame(counts_by_sample)
    # Lengths are stacked positionally (one row per file) and averaged
    # column-wise, then labelled with the index of the counts table.
    mean_lengths = np.vstack(lengths_per_file).mean(axis=0)
    tx_lengths = pd.Series(mean_lengths, index=sample_counts.index)
    return sample_counts, tx_lengths

コード例 #2

ファイルを表示

ファイル: import_gene_counts.py プロジェクト: zitsen/cnvkit

def aggregate_gene_counts(filenames):
    """Combine per-sample gene count files into a single table.

    Each file is parsed as a header-less, tab-delimited table whose two
    columns are named 'gene_id' and 'expected_count'. Gene IDs are passed
    through ``rna.before(".")`` while parsing, and the same helper is applied
    to each file's base name to obtain the sample ID.

    Parameters:
        filenames : iterable of str
            Paths to the gene count files.

    Returns:
        sample_counts : DataFrame
            Row index is the gene ID, one column per sample ID; missing
            counts are replaced with 0.

    Raises:
        RuntimeError
            If the input files do not all contain the same number of rows.
    """
    expected_rows = None
    counts_by_sample = {}
    for path in filenames:
        # comment="_" makes the parser ignore everything from the first
        # underscore on a line. Presumably this stands in for explicitly
        # dropping summary rows such as "__no_feature", "__ambiguous",
        # "__too_low_aQual", "__not_aligned" and "__alignment_not_unique".
        # NOTE(review): it would also truncate any gene ID that contains an
        # underscore -- confirm this is acceptable for the expected inputs.
        frame = pd.read_table(
            path,
            header=None,
            comment="_",
            names=["gene_id", "expected_count"],
            converters={"gene_id": rna.before(".")},
        )
        frame = frame.set_index("gene_id")

        # Every file must have as many rows as the first one read.
        n_rows = len(frame)
        if expected_rows is None:
            expected_rows = n_rows
        if n_rows != expected_rows:
            raise RuntimeError(
                "Number of rows in each input file is not equal")

        trim_at_dot = rna.before(".")
        sample_name = trim_at_dot(os.path.basename(path))
        counts_by_sample[sample_name] = frame["expected_count"].fillna(0)

    return pd.DataFrame(counts_by_sample)

コード例 #3

ファイルを表示

ファイル: cnv_expression_correlate.py プロジェクト: zitsen/cnvkit

def load_tcga_table(fname, shared_key):
    """Load a TCGA expression/CNV table with one row per `shared_key` value.

    The file is read as tab-delimited text with the `shared_key` column kept
    as strings and no NA detection, so blank keys stay as empty strings. Rows
    whose key is empty are discarded and the remaining keys are converted to
    integers.

    When several rows share a key (e.g. multiple HUGO names for a single
    Entrez_Gene_Id), rows are ranked by the length of 'Hugo_Symbol' and then
    by the value ``before('|')`` returns for it (presumably the text preceding
    the pipe character -- confirm against ``before``); only the first-ranked
    row is kept, i.e. the shortest and then alphabetically first name.

    Parameters:
        fname : str
            Path to the tab-delimited TCGA table.
        shared_key : str
            Name of the column that must end up unique, e.g. the
            Entrez_Gene_Id column.

    Returns:
        DataFrame
            Indexed by `shared_key`, with both rows and columns sorted by
            label.
    """
    raw = pd.read_table(fname, dtype={shared_key: str}, na_filter=False)
    has_key = raw[shared_key] != ''
    table = raw[has_key].astype({shared_key: int})
    before_pipe = before('|')

    def name_rank(symbol):
        # Shorter names first; ties broken by the pipe-trimmed name.
        return (len(symbol), before_pipe(symbol))

    sort_order = table['Hugo_Symbol'].apply(name_rank).argsort()
    ranked = table.iloc[sort_order]
    # drop_duplicates keeps the first occurrence, i.e. the best-ranked name.
    unique_rows = ranked.drop_duplicates(subset=shared_key)
    table = unique_rows.set_index(shared_key)
    table = table.sort_index(axis=0)
    table = table.sort_index(axis=1)
    print("Loaded", fname, "with shape:", table.shape, file=sys.stderr)
    return table

コード例 #4

ファイルを表示

ファイル: cnv_expression_correlate.py プロジェクト: etal/cnvkit

def load_tcga_table(fname, shared_key):
    """Load a TCGA expression/CNV table with one row per `shared_key` value.

    The file is read as tab-separated text with the `shared_key` column kept
    as strings and no NA detection, so blank keys stay as empty strings. Rows
    whose key is empty are discarded and the remaining keys are converted to
    integers.

    When several rows share a key (e.g. multiple HUGO names for a single
    Entrez_Gene_Id), rows are ranked by the length of 'Hugo_Symbol' and then
    by the value ``before('|')`` returns for it (presumably the text preceding
    the pipe character -- confirm against ``before``); only the first-ranked
    row is kept, i.e. the shortest and then alphabetically first name.

    Parameters:
        fname : str
            Path to the tab-separated TCGA table.
        shared_key : str
            Name of the column that must end up unique, e.g. the
            Entrez_Gene_Id column.

    Returns:
        DataFrame
            Indexed by `shared_key`, with both rows and columns sorted by
            label. The file name and resulting shape are logged at INFO
            level.
    """
    raw = pd.read_csv(fname, sep='\t', dtype={shared_key: str}, na_filter=False)
    keyed_rows = raw[raw[shared_key] != '']
    table = keyed_rows.astype({shared_key: int})
    before_pipe = before('|')

    def name_rank(symbol):
        # Shorter names first; ties broken by the pipe-trimmed name.
        return (len(symbol), before_pipe(symbol))

    hugo_names = table['Hugo_Symbol']
    sort_order = hugo_names.apply(name_rank).argsort()
    ranked = table.iloc[sort_order]
    # drop_duplicates keeps the first occurrence, i.e. the best-ranked name.
    deduplicated = ranked.drop_duplicates(subset=shared_key)
    indexed = deduplicated.set_index(shared_key)
    table = indexed.sort_index(axis=0).sort_index(axis=1)
    logging.info("Loaded %s with shape: %s", fname, table.shape)
    return table