Code example #1
File: annotation.py Project: muluayele999/OpenOmics
    def load_dataframe(self, file_resources):
        source_df = dd.read_table(file_resources["NONCODEv5_source"],
                                  header=None)
        source_df.columns = ["NONCODE Transcript ID", "name type", "Gene ID"]

        transcript2gene_df = dd.read_table(
            file_resources["NONCODEv5_Transcript2Gene"], header=None)
        transcript2gene_df.columns = [
            "NONCODE Transcript ID", "NONCODE Gene ID"
        ]

        self.noncode_func_df = dd.read_table(
            file_resources["NONCODEv5_human.func"], header=None)
        self.noncode_func_df.columns = ["NONCODE Gene ID", "GO terms"]
        self.noncode_func_df.set_index("NONCODE Gene ID", inplace=True)

        # Convert to NONCODE transcript ID for the functional annotation data
        self.noncode_func_df[
            "NONCODE Transcript ID"] = self.noncode_func_df.index.map(
                pd.Series(
                    transcript2gene_df['NONCODE Transcript ID'].values,
                    index=transcript2gene_df['NONCODE Gene ID']).to_dict())

        # Convert NONCODE transcript ID to gene names
        source_gene_names_df = source_df[source_df["name type"] ==
                                         "NAME"].copy()

        self.noncode_func_df["Gene Name"] = self.noncode_func_df[
            "NONCODE Transcript ID"].map(
                pd.Series(source_gene_names_df['Gene ID'].values,
                          index=source_gene_names_df['NONCODE Transcript ID']).
                to_dict())
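The two-step ID translation above (gene ID to transcript ID to gene name) comes down to building a lookup dict from one frame and mapping it onto another with `.map()`. A minimal, self-contained sketch of that pattern; the IDs below are invented for illustration and are not real NONCODE records:

import pandas as pd

# Hypothetical lookup table: gene ID -> transcript ID
transcript2gene = pd.DataFrame({
    "NONCODE Gene ID": ["GENE_A", "GENE_B"],
    "NONCODE Transcript ID": ["TX_1", "TX_2"],
})

# Functional annotations indexed by gene ID
func = pd.DataFrame({"GO terms": ["GO:0001234", "GO:0005678"]},
                    index=["GENE_A", "GENE_B"])

# Build a dict keyed by gene ID and map it over the index,
# mirroring the pd.Series(...).to_dict() trick in the example above
lookup = pd.Series(transcript2gene["NONCODE Transcript ID"].values,
                   index=transcript2gene["NONCODE Gene ID"]).to_dict()
func["NONCODE Transcript ID"] = func.index.map(lookup)
print(func)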
Code example #2
def write_probes(probes_summary_filename, probes_dir, n_workers):
    cluster = LocalCluster(n_workers=n_workers,
                           threads_per_worker=1,
                           memory_limit="16GB")
    client = Client(cluster)
    num_record = len([s for s in SeqIO.parse(input_fasta_filename, 'fasta')])
    pacbio_sequence_list = pd.DataFrame(index=np.arange(num_record),
                                        columns=['SEQID', 'SEQUENCE'])
    pacbio_sequence_list['SEQID'] = [
        s.id for s in SeqIO.parse(input_fasta_filename, 'fasta')
    ]
    pacbio_sequence_list['SEQUENCE'] = [
        s.seq for s in SeqIO.parse(input_fasta_filename, 'fasta')
    ]
    return_code_list = []
    for i in range(num_record):
        return_code = write_blast_pacbio_sequence(
            pacbio_sequence_list.iloc[i, :], temp_dir)
        return_code_list.append(return_code)
    return_code_total = dask.delayed(sum)(return_code_list)
    result = return_code_total.compute()
    probes_blast_results = dd.read_table('{}/*.blast.out'.format(temp_dir),
                                         delim_whitespace=True,
                                         header=None,
                                         dtype={13: str})
    probes_blast = probes_blast_results.compute()
    probes_blast_results_filename = os.path.basename(input_fasta_filename)
    probes_blast_results.to_csv('{}/{}.blast.out'.format(
        temp_dir, probes_blast_results_filename),
                                index=None,
                                header=None,
                                sep=' ')
    client.close()
    cluster.close()
    return (0)
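The `dask.delayed(sum)(return_code_list)` call above builds a lazy reduction over the per-record return codes. A stripped-down sketch of that delayed-sum pattern; `process` here is a stand-in for `write_blast_pacbio_sequence` and is not part of the original project:

import dask

@dask.delayed
def process(record):
    # stand-in task: pretend each record is handled successfully
    return 0

return_codes = [process(i) for i in range(10)]
total = dask.delayed(sum)(return_codes)   # lazy sum over lazy results
print(total.compute())                    # 0 if every task returned 0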
Code example #3
File: annotation.py Project: arfon/OpenOmics
    def load_dataframe(self, file_resources, npartitions=None):
        """
        Args:
            file_resources:
            npartitions:
        """
        go_terms = pd.read_table(
            file_resources["rnacentral_rfam_annotations.tsv"],
            low_memory=True,
            header=None,
            names=["RNAcentral id", "GO terms", "Rfams"])
        go_terms["RNAcentral id"] = go_terms["RNAcentral id"].str.split(
            "_", expand=True, n=2)[0]

        gene_ids = []
        for file in file_resources:
            if "database_mappings" in file:
                if npartitions:
                    id_mapping = dd.read_table(file_resources[file],
                                               header=None,
                                               names=[
                                                   "RNAcentral id", "database",
                                                   "external id", "species",
                                                   "RNA type", "gene symbol"
                                               ])
                else:
                    id_mapping = pd.read_table(file_resources[file],
                                               low_memory=True,
                                               header=None,
                                               names=[
                                                   "RNAcentral id", "database",
                                                   "external id", "species",
                                                   "RNA type", "gene symbol"
                                               ])

                gene_ids.append(id_mapping)

        if npartitions:
            gene_ids = dd.concat(gene_ids, join="inner")
        else:
            gene_ids = pd.concat(gene_ids, join="inner")

        gene_ids["species"] = gene_ids["species"].astype("O")
        if self.species is not None:
            gene_ids = gene_ids[gene_ids["species"] == self.species]

        lnc_go_terms = go_terms[go_terms["RNAcentral id"].isin(
            gene_ids["RNAcentral id"])].groupby("RNAcentral id")[
                "GO terms"].apply(lambda x: "|".join(x.unique()))
        lnc_rfams = go_terms[go_terms["RNAcentral id"].isin(
            gene_ids["RNAcentral id"])].groupby(
                "RNAcentral id")["Rfams"].apply(lambda x: "|".join(x.unique()))

        gene_ids["GO terms"] = gene_ids["RNAcentral id"].map(lnc_go_terms)
        gene_ids["Rfams"] = gene_ids["RNAcentral id"].map(lnc_rfams)
        gene_ids = gene_ids[gene_ids["GO terms"].notnull()
                            | gene_ids["Rfams"].notnull()]

        return gene_ids
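The GO-term collapse above groups by "RNAcentral id", joins the unique terms with "|", and maps the result back onto the gene table. A toy sketch of just that aggregation with synthetic IDs and terms:

import pandas as pd

go_terms = pd.DataFrame({
    "RNAcentral id": ["URS0001", "URS0001", "URS0002"],
    "GO terms": ["GO:0003677", "GO:0005634", "GO:0003723"],
})
gene_ids = pd.DataFrame({"RNAcentral id": ["URS0001", "URS0002"]})

# collapse multiple GO terms per ID into one "|"-joined string
collapsed = (go_terms.groupby("RNAcentral id")["GO terms"]
             .apply(lambda x: "|".join(x.unique())))

# map the collapsed terms back onto the gene table by ID
gene_ids["GO terms"] = gene_ids["RNAcentral id"].map(collapsed)
print(gene_ids)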
Code example #4
File: read_gtf.py Project: arfon/OpenOmics
def parse_gtf_dask(filepath_or_buffer, npartitions=None, compression=None, features=None):
    """
    Args:
        filepath_or_buffer (str or buffer object):
        npartitions (int): Number of partitions for the dask dataframe. Default None.
        compression (str): Compression type to be passed into dask.dataframe.read_table(). Default None.
        features (set or None): Drop entries which aren't one of these features
    """
    if features is not None:
        features = set(features)

    def parse_frame(s):
        if s == ".":
            return 0
        else:
            return int(s)

    # GTF columns:
    # 1) seqname: str ("1", "X", "chrX", etc...)
    # 2) source : str
    #      Different versions of GTF use second column as of:
    #      (a) gene biotype
    #      (b) transcript biotype
    #      (c) the annotation source
    #      See: https://www.biostars.org/p/120306/#120321
    # 3) feature : str ("gene", "transcript", &c)
    # 4) start : int
    # 5) end : int
    # 6) score : float or "."
    # 7) strand : "+", "-", or "."
    # 8) frame : 0, 1, 2 or "."
    # 9) attribute : key-value pairs separated by semicolons
    # (see more complete description in docstring at top of file)

    # Uses Dask
    logging.debug("dask.dataframe.read_table, file={}, compression={}".format(filepath_or_buffer, compression))
    dataframe = dd.read_table(
        filepath_or_buffer,
        sep="\t",
        compression=compression,
        blocksize=None,
        comment="#",
        names=REQUIRED_COLUMNS,
        skipinitialspace=True,
        skip_blank_lines=True,
        error_bad_lines=True,
        warn_bad_lines=True,
        # chunksize=chunksize,
        engine="c",
        dtype={
            "start": np.int64,
            "end": np.int64,
            "score": np.float32,
            "seqname": str,
        },
        na_values=".",
        converters={"frame": parse_frame})

    return dataframe
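Stripped of the project-specific options, the core of `parse_gtf_dask` is a tab-separated read with a fixed column list. A minimal sketch against a tiny throw-away GTF file; the column list is written out here because `REQUIRED_COLUMNS` is defined elsewhere in the project:

import dask.dataframe as dd

gtf_columns = ["seqname", "source", "feature", "start", "end",
               "score", "strand", "frame", "attribute"]

# write a two-line toy GTF file (one comment, one feature)
with open("toy.gtf", "w") as f:
    f.write("# toy GTF\n")
    f.write('1\thavana\tgene\t11869\t14409\t.\t+\t.\tgene_id "ENSG00000223972";\n')

df = dd.read_table("toy.gtf", sep="\t", comment="#",
                   names=gtf_columns, na_values=".", blocksize=None)
print(df.compute())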
Code example #5
def handle_file(filepath: str):
    df = dd.read_table(filepath, sep='\x01')

    print(f'Usual length: {usual_length}')
    print(f'Num lines: {num_lines}')
    print(f'Num tweet ids: {len(tweet_id_set)}')
    print(f'Num engaging users: {len(engaging_user_id_set)}')
    print(f'Num engaged users: {len(engaged_with_user_id_set)}')
Code example #6
File: annotation.py Project: arfon/OpenOmics
    def load_dataframe(self, file_resources, npartitions=None):
        """
        Args:
            file_resources:
            npartitions:
        """
        if npartitions:
            df = dd.read_table(file_resources["proteinatlas.tsv"])
        else:
            df = pd.read_table(file_resources["proteinatlas.tsv"])

        return df
Code example #7
def read_data_with_cond(data_file_str,reduce_memory=False,cond_and_str=None,output_path_pre=None,sep='\t'): 
    import re     
    print('----------------begin------------------')    
    try:
        if sep=='\t':
            data = dd.read_table(data_file_str,low_memory=False,dtype={'uid': 'object'}).compute()
        if sep==',':
            data = dd.read_csv(data_file_str,low_memory=False,dtype={'uid': 'object'}).compute()  
    except:
        if sep=='\t':
            data = pd.read_table(data_file_str,low_memory=False,dtype={'uid': 'object'})
        if sep==',':
            data = pd.read_csv(data_file_str,low_memory=False,dtype={'uid': 'object'})     
    print('--initial')
    print(data.info())    
    if reduce_memory:
        print('--reduce_memory')
        data = reduce_data_memory(data)
        print(data.info())

    if cond_and_str:
        print('--cond')
        cnt=1
        for cond in cond_and_str.split(','):
            pattern = re.compile(r'^.*>=.*$')
            if pattern.match(cond):
                f,n = cond.split('>=')[0],int(cond.split('>=')[1])
                data = data[data[f]>=n]
                print('shape of data after cond',cnt,':',data.shape)
            pattern = re.compile(r'^.*==.*$')
            if pattern.match(cond):
                f,n = cond.split('==')[0],int(cond.split('==')[1])
                data = data[data[f]==n]
                print('shape of data after cond',cnt,':',data.shape)                
            pattern = re.compile(r'^.*isnull.*$')
            if pattern.match(cond):
                f = cond.split('.')[0]
                data = data[data[f].isnull()]
                print('shape of data after cond',cnt,':',data.shape)
            cnt+=1
        print(data.info())
    print('------------conclusion---------------')
    print('shape of dataset:',data.shape)  
    print('-------------outputs------------------')
    if output_path_pre:    
        columns = pd.DataFrame(data.dtypes)
        columns = columns.reset_index()
        columns.columns = ['feature_name','dtypes']
        columns.to_csv(output_path_pre+'columns.csv',index=False,header=True)
        print('column names and dtypes have been downloaded to ',output_path_pre+'columns.csv')        
    return data
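The `cond_and_str` argument above is a tiny comma-separated filter language ('col>=n', 'col==n', 'col.isnull'). A self-contained sketch of the same filtering loop on an in-memory frame; the frame and the conditions are invented for illustration:

import re
import pandas as pd

data = pd.DataFrame({"age": [10, 25, None], "flag": [1, 1, 0]})

for cond in "age>=18,flag==1".split(","):
    if re.match(r'^.*>=.*$', cond):
        f, n = cond.split('>=')[0], int(cond.split('>=')[1])
        data = data[data[f] >= n]
    elif re.match(r'^.*==.*$', cond):
        f, n = cond.split('==')[0], int(cond.split('==')[1])
        data = data[data[f] == n]
    elif re.match(r'^.*isnull.*$', cond):
        data = data[data[cond.split('.')[0]].isnull()]

print(data)   # keeps the single row with age >= 18 and flag == 1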
Code example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table_txt_gz')
    parser.add_argument('--size_cutoff', default=3.8e9)

    args = parser.parse_args()

    # 50 MB block size
    df = dd.read_table(args.table_txt_gz, blocksize=50000000,
                       sep='\t', header=None, encoding='latin-1')

    base, ext = os.path.splitext(args.table_txt_gz)

    df.to_csv(base + '_part*.txt', header=False, index=False, sep='\t')
    return
Code example #9
def load_file_in_staging(file_path, table_name):
    """Load content of file inside *_stg tables."""
    tmp_df = dd.read_table(file_path,
                           header=0,
                           sep=';',
                           dtype='str',
                           encoding='latin-1')
    logger.info('Uploading {} into {}.'.format(file_path, table_name))
    conn = f'sqlite:///{db_staging_file}'
    tmp_df.to_sql(table_name,
                  conn,
                  index=False,
                  if_exists='replace',
                  chunksize=200000)
    logger.info('Imported {} rows.'.format(tmp_df.shape[0]))
    del tmp_df
    return
Code example #10
def get_gc(label_file, genome_file):
    """Calculates gc content for all viable entries in an input dataframe.

    Arguments:
        label_file {dataframe} -- [Dataframe containing genomic regions labeled
                                   as positive(1) or negative(0)]
        genome_file {str} -- [Path to a refrence genome in FASTA format]

    Returns:
        [dataframe] -- [Dataframe containing viable entries and their respective gc content]
    """
    bed_df = pybedtools.BedTool(label_file)
    bed_gc = bed_df.nuc(genome_file)
    gc_df = dd.read_table(bed_gc.fn)
    gc_df = gc_df.loc[:, gc_df.columns.str.contains("usercol|gc|num_N")]
    colnames = generate_colnames(gc_df)
    gc_df.columns = colnames
    gc_df = gc_df.loc[gc_df.num_N == 0].drop("num_N", axis=1)
    gc_df["gc"] = gc_df["gc"].astype("float32") * 100
    return gc_df
Code example #11
File: functions.py Project: LuisFalva/ophilea
    def dask_read(option, file_path):

        # Python map for file type pattern
        file_type = {
            'parquet': file_path + '/*.parquet',
            'csv': file_path + '/*.csv',
            'json': file_path + '/*.json',
            'text': file_path + '/*.txt'
        }

        # Define reader type by pattern mapping; store callables so that only
        # the selected reader actually touches the filesystem
        file_pattern = file_type[option]
        dask_reader = {
            'parquet': lambda: dask_df.read_parquet(file_pattern, engine='pyarrow'),
            'csv': lambda: dask_df.read_csv(file_pattern),
            'json': lambda: dask_df.read_json(file_pattern),
            'text': lambda: dask_df.read_table(file_pattern)
        }

        return dask_reader[option]()
Code example #12
File: parse_blast.py Project: wckdouglas/cfNA
def read_sample(SAMPLE_FOLDER):
    '''
    read blast output
    '''

    col_names = [
        'qseqid', 'qlen', 'sseqid', 'slen', 'pident', 'length', 'mismatch',
        'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue'
    ]
    tRF_tab = SAMPLE_FOLDER + '/blast.tRF.tsv'
    samplename = os.path.basename(SAMPLE_FOLDER)
    print('Reading %s' % samplename)
    tRF_df = dd.read_table(tRF_tab, names = col_names, )\
        .repartition(npartitions=THREADS)  \
        .query('slen == qlen ') \
        .groupby('qseqid')\
        .apply(lambda d: d.nlargest(1, 'pident'))\
        .compute(workers=THREADS, scheduler='threads')\
        .reset_index(drop=True) \
        .assign(samplename = samplename)
    return tRF_df
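The chained dask pipeline above keeps, for each query sequence, the single best-identity full-length hit. The per-group `nlargest(1, 'pident')` step can be exercised in isolation on a plain pandas frame; the rows below are invented to match the BLAST column names used above:

import pandas as pd

hits = pd.DataFrame({
    "qseqid": ["q1", "q1", "q2"],
    "qlen":   [20, 20, 30],
    "slen":   [20, 20, 30],
    "pident": [98.0, 91.5, 100.0],
})

# keep full-length hits only, then the best-identity hit per query
best = (hits.query("slen == qlen")
            .groupby("qseqid")
            .apply(lambda d: d.nlargest(1, "pident"))
            .reset_index(drop=True))
print(best)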
Code example #13
File: data_frame.py Project: mikaylaedwards/dagster
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
Code example #14
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get('path')

    if file_type == 'csv':
        return dd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return dd.read_parquet(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'hdf':
        return dd.read_hdf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'json':
        return dd.read_json(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'sql_table':
        return dd.read_sql_table(**file_options)
    elif file_type == 'table':
        return dd.read_table(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'fwf':
        return dd.read_fwf(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'orc':
        return dd.read_orc(path, **dict_without_keys(file_options, 'path'))
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type))
Code example #15
File: process_data.py Project: bfeif/RhymeFinder
import csv

import dask.bag as db
import dask.dataframe as dd
from dask.distributed import Client

client = Client(processes=False,
                threads_per_worker=4,
                n_workers=4,
                memory_limit='2GB')


# cmu list split helper function
def cmu_list_split_helper(x, rhyme_length=3):
    splitted = x.split()
    return (splitted[0], splitted[-rhyme_length:])


# glove stuff
glove_data_file = "../data/glove/glove.6B.100d_1000lines.txt"
gloves = dd.read_table(glove_data_file,
                       sep=" ",
                       header=None,
                       quoting=csv.QUOTE_NONE)
gloves = gloves.set_index(0)
print(gloves.compute().head())

# cmu stuff
cmu_data_file = "../data/cmu/cmudict_1000lines.dict"
phone_seqs = db.read_text(cmu_data_file)
phone_seqs = phone_seqs.map(cmu_list_split_helper)
phone_seqs = phone_seqs.to_dataframe()
phone_seqs = phone_seqs.set_index(0)


# preprocess gloves and cmus together
def preprocess_df(df):
    return df
Code example #16
    target, context = int(target), int(context)
    if (target, context) in count_dict:
        count_dict[(target, context)] += 1
    else:
        count_dict[(target, context)] = 1

# dict chunk by chunk

from dask.distributed import Client
client = Client(n_workers=5,
                threads_per_worker=2,
                processes=False,
                memory_limit='2GB')
import dask.dataframe as dd

pair_data = dd.read_table(
    '/home/srawat/Documents/UMBC+Wiki/dsm_files/tuples.txt')
# rename() is not in-place; assign the result back
pair_data = pair_data.rename(columns={2822: 'target', 80: 'context'})

# Tuple to dict
import pickle
from tqdm import tqdm
count_dict = dict()
with open('/home/srawat/Documents/UMBC+Wiki/dsm_files/tuples.txt', 'r') as g:
    for line in tqdm(g):
        target, context = line.split('\t')
        target = int(target)
        context = int(context)
        if (target, context) in count_dict:
            count_dict[(target, context)] += 1
        else:
            count_dict[(target, context)] = 1
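The manual membership-check-then-increment bookkeeping above is what `collections.Counter` provides directly; a compact sketch of the same pair counting over in-memory lines (stand-ins for rows of tuples.txt):

from collections import Counter

lines = ["12\t7\n", "12\t7\n", "3\t5\n"]   # stand-in rows
count_dict = Counter()
for line in lines:
    target, context = map(int, line.split("\t"))
    count_dict[(target, context)] += 1
print(count_dict)   # Counter({(12, 7): 2, (3, 5): 1})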
Code example #17
    else:
        return remove_duplicated_white_space(raw_line[16:]).split(" ")[1].replace(":","")

def data_raw_parser(*arg):
    print(*arg)
    if  len(arg) == 2:
        return logtype_parser(arg[0],arg[1])
    elif len(arg) == 3:
        return tm_parser(arg[0],arg[1],arg[2])
    elif len(arg)>3:
        return env_parser(arg)

row_index=['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21']
parse_dates={'logtype':[6,17], 'tm':[0,1,6], 'env':[10,11,12,13,14,15,16,17,18,19,20,21]}
s=time.time()
df = dd.read_table("/data/datalake/fitslake/datalab_backup/vltlogs/raw_logs/2016/10/wcnnaco.2016-10-01.log", delim_whitespace=True, names=row_index, encoding='latin-1', skiprows=0)
df.compute()
#logtype = df.apply(logtype_parser,axis=1)
#tm = df.apply(tm_parser,axis=1)
#env = df.apply(env_parser, axis=1)
#logtype.compute()
#print(df)
#logtype.to_csv("/data/datalake/fitslake/datalab_backup/vltlogs/raw_logs/2016/10/prueba-*.csv")
print(time.time()-s)
#print(logtype.compute()[0:5])
#print(tm.compute()[0:5])
#print(env.compute()[10:20])
#print(df.compute()['6'])
#print(time.time()-s)

Code example #18
def run_pipeline(task_type: str) -> bool:
    map: Dict = task_type_map[task_type]
    in_bucket: str = map['in']
    out_bucket: str = map['out']
    cols: Dict[str, str] = map['cols']
    converters: Dict[str, Callable] = map['converters']
    dtypes: Dict[str, str] = map['dtypes']
    index_col: str = map['index']['col']
    sorted: bool = map['index']['sorted']
    row_op: Callable = map['row_op']
    diff: Dict = map['diff']
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']
    aggr_func: Callable = map['aggr_func']

    try:

        #client = Client(address='dscheduler:8786')

        s3_in_url: str = 's3://'+in_bucket+'/*.*'
        s3_options: Dict = ps.fetch_s3_options()
        #df = dd.read_table(path=s3_in_url, storage_options=s3_options)
        df = dd.read_table(urlpath='tmp/'+in_bucket+'/*.*',
                           header=0,
                           usecols=lambda x: x.upper() in list(cols.keys()),
                           skipinitialspace=True,
                           converters=converters
                           )

        # rename columns
        df = df.rename(columns=cols)
        df.compute()

        if sorted:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()
        else:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .set_index(index_col).sort().reset_index()
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()



        # map row-wise operations
        #df = df.map_partitions(lambda pdf: pdf.apply(func=row_op, axis=1), meta=dtypes)

        # diff
        if diff['compute']:
            df[diff['new_col']] = df[diff['col']].diff()

        # specific processing for transit
        if task_type == 'cl-transit':
            df = df.map_partitions(partial(remove_outliers, col='DELEXITS'), meta=dtypes)

        # drop na values
        df = df.dropna()

        # set index (assumes pre-sorted data)
        df = df.set_index(index_col, sorted=True)

        #df.compute()

        # filter
        if filter_by_key == 'weekday':
            df = df.loc[df[index_col].weekday() == filter_by_val]

        # resample using frequency and aggregate function specified
        df = compose(df.resample(resample_freq), aggr_func)

        # save in out bucket
        s3_out_url: str = 's3://' + out_bucket
        # dd.to_parquet(df=df, path=s3_out_url, storage_options=s3_options)
        dd.to_parquet(df=df, path='tmp/'+out_bucket+'/*.*')

    except Exception as err:
        print('error in run_pipeline %s' % str(err))
        raise err

    return True
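The pipeline above leans on `map_partitions` with an explicit `meta` to apply ordinary pandas operations partition by partition. A minimal, self-contained sketch of that pattern with invented column names:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"entries": [3, 5], "exits": [1, 2]})
df = dd.from_pandas(pdf, npartitions=1)

# each partition is a plain pandas DataFrame; meta declares the output schema
df = df.map_partitions(
    lambda p: p.assign(total=p["entries"] + p["exits"]),
    meta={"entries": "int64", "exits": "int64", "total": "int64"})
print(df.compute())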
Code example #19
File: E10.py Project: CamilaDuitama/RascovanProject
    colnames.drop(columns="to_drop", inplace=True)

    new_accession_list = list(
        colnames[-colnames["Run_accession"].isin(polish_samples)]
        ["Run_accession"])
    leave_out = new_accession_list
    client = Client()

    for p in partitions:

        print("Partition " + p + " started being analyzed in client:")
        print(client)

        begin_time = time.time()
        #Read partition
        partition=dd.read_table("/pasteur/sonic/scratch/public/cduitama/RascovanProject/kmMatrices/combination/large_dataset_no_polish/matrices/matrix_"\
                                +p+".txt",header=None,sep=" ",names=["Kmer"]+list(colnames["Run_accession"]))

        #Drop K-mer column
        partition_array = partition.drop(["Kmer"], axis=1)
        #convert from dask array to np array
        partition_array = partition_array.values

        #Binarization
        transformer = Binarizer().fit(partition_array)
        # fit does nothing.
        partition_b = transformer.transform(partition_array)

        print("Finished reading and binarizing partition " + str(p))

        for sink in leave_out:
Code example #20
# define arguments
parser = argparse.ArgumentParser(
    description='sum the number of variants per gene in an individual')
parser.add_argument('-v', '--variants', dest='variants', help='variant table')
parser.add_argument('-g', '--genes', dest='genes', help='genes of interest')
parser.add_argument('-o', '--out', dest='out', help='output file name')
args = parser.parse_args()

# if this isn't working for you, see https://distributed.dask.org/en/latest/setup.html
# or talk to your friendly IT professional
client = Client()
client.restart()

# read in table of variants
variants = ddf.read_table(args.variants, blocksize=50e6)  # 50 MB blocks
#print variants.head()
print('variants read in')

# define list of genes of interest
with open(args.genes) as g:
    genes_of_interest = g.read().splitlines()

# make list of EUR proband IDs
# I know nothing about this size, but consider passing in an index, which will make operations later on faster
# if it makes sense for what your trying to do
# e.g. I imagine indexing by gene might be useful
# see http://docs.dask.org/en/latest/dataframe-performance.html#use-the-index
master = ddf.read_table(
    "/scratch/ucgd/lustre/work/u0806040/data/15_Jan_19_Simons_master_ancestry_corrected_PRS.txt",
    blocksize=
Code example #21
File: E8.py Project: CamilaDuitama/RascovanProject
    #Load colnames
    colnames=pd.read_csv("kmMatrices/combination/large_dataset/combination_fof.txt", sep=" : ",\
                         header=None,names=["Run_accession","to_drop"],engine="python")
    colnames.drop(columns="to_drop",inplace=True)

    leave_out=list(colnames["Run_accession"])
    
    for p in partitions:

        print("Partition "+ p +" started being analyzed in client:")
        print(client)
        
        begin_time = datetime.datetime.now()
        #Read partition
        partition=dd.read_table("kmMatrices/combination/large_dataset/storage/matrix/partition_"+p+"/ascii_matrix"+p+".mat",\
                                  skiprows=8,header=None,sep=" ",names=["Kmer"]+list(colnames["Run_accession"]))

        #Drop K-mer column
        partition_array=partition.drop("Kmer",axis=1)
        #convert from dask array to np array
        partition_array=partition_array.values

        #Binarization
        transformer = Binarizer().fit(partition_array); # fit does nothing.
        partition_b=transformer.transform(partition_array);

        print("Finished reading and binarizing partition "+ str(p))

        for i in leave_out: 

            sinks=[i]
Code example #22
import glob
import numpy as np
import pandas as pd
import os
import dask.dataframe as dd

repeats = dd.read_csv("repeats_hg19.csv")

anno = dd.read_table("RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.CATGAC.dan.anno")

df1 = dd.merge(anno, repeats, on="chr", how="outer", suffixes=("","_repeat"))
df1.to_csv("find_repeatsTESToutput.csv", index=False)
# keep annotations whose start falls inside the repeat interval;
# the merge on "chr" above already pairs rows from the same chromosome,
# so filter on the merged frame's columns instead of the undefined `row`
df1 = df1[(df1.start >= df1.begin) & (df1.start <= df1.end)]
df1 = dd.merge(anno, df1, on=["chr"])
df1.to_csv("find_repeatsTEST2.csv", index=False)
Code example #23
File: read_gtf.py Project: gawbul/OpenOmics
def parse_gtf(filepath_or_buffer,
              npartitions=None,
              chunksize=1024 * 1024,
              features=None,
              intern_columns=["seqname", "source", "strand", "frame"],
              fix_quotes_columns=["attribute"]):
    """
    Args:
        filepath_or_buffer (str or buffer object):
        npartitions:
        chunksize (int):
        features (set or None): Drop entries which aren't one of these features
        intern_columns (list): These columns are short strings which should be
            interned
        fix_quotes_columns (list): Most commonly the 'attribute' column which
            had broken quotes on some Ensembl release GTF files.
    """
    if features is not None:
        features = set(features)

    dataframes = []

    def parse_frame(s):
        if s == ".":
            return 0
        else:
            return int(s)

    # GTF columns:
    # 1) seqname: str ("1", "X", "chrX", etc...)
    # 2) source : str
    #      Different versions of GTF use second column as of:
    #      (a) gene biotype
    #      (b) transcript biotype
    #      (c) the annotation source
    #      See: https://www.biostars.org/p/120306/#120321
    # 3) feature : str ("gene", "transcript", &c)
    # 4) start : int
    # 5) end : int
    # 6) score : float or "."
    # 7) strand : "+", "-", or "."
    # 8) frame : 0, 1, 2 or "."
    # 9) attribute : key-value pairs separated by semicolons
    # (see more complete description in docstring at top of file)
    if npartitions:
        logging.info(filepath_or_buffer)
        chunk_iterator = dd.read_table(
            filepath_or_buffer,
            sep="\t",
            comment="#",
            names=REQUIRED_COLUMNS,
            skipinitialspace=True,
            skip_blank_lines=True,
            error_bad_lines=True,
            warn_bad_lines=True,
            # chunksize=chunksize,
            engine="c",
            dtype={
                "start": np.int64,
                "end": np.int64,
                "score": np.float32,
                "seqname": str,
            },
            na_values=".",
            converters={"frame": parse_frame})
    else:
        chunk_iterator = pd.read_csv(filepath_or_buffer,
                                     sep="\t",
                                     comment="#",
                                     names=REQUIRED_COLUMNS,
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     error_bad_lines=True,
                                     warn_bad_lines=True,
                                     chunksize=chunksize,
                                     engine="c",
                                     dtype={
                                         "start": np.int64,
                                         "end": np.int64,
                                         "score": np.float32,
                                         "seqname": str,
                                     },
                                     na_values=".",
                                     converters={"frame": parse_frame})
    dataframes = []
    try:
        for df in chunk_iterator:
            for intern_column in intern_columns:
                df[intern_column] = [intern(str(s)) for s in df[intern_column]]

            # compare feature strings after interning
            if features is not None:
                df = df[df["feature"].isin(features)]

            for fix_quotes_column in fix_quotes_columns:
                # Catch mistaken semicolons by replacing "xyz;" with "xyz"
                # Required to do this since the Ensembl GTF for Ensembl
                # release 78 has mistakes such as:
                #   gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
                df[fix_quotes_column] = [
                    s.replace(';\"', '\"').replace(";-", "-")
                    for s in df[fix_quotes_column]
                ]
            dataframes.append(df)
    except Exception as e:
        raise Exception("ParsingError:" + str(e))

    if npartitions:
        df = dd.concat(dataframes)
    else:
        df = pd.concat(dataframes)

    return df
Code example #24
parser.add_argument('-o', '--out', dest = 'out', help = 'output file name')
args = parser.parse_args()

# define list of genes of interest
with open(args.genes) as g:
	genes_of_interest = g.read().splitlines()

# make list of EUR proband IDs
master = pandas.read_table("/scratch/ucgd/lustre/work/u0806040/data/15_Jan_19_Simons_master_ancestry_corrected_PRS.txt", dtype={'other_dx_axis_i': 'object', 'other_dx_axis_ii': 'object', 'other_dx_icd': 'object'})
probands = master.loc[master['family_member'] == 'p1']
eur_probands = probands.loc[probands['ancestry.prediction'] == 'EUR']
proband_ids = eur_probands['IID']

# read in table of variants
print('reading in variants')
variants = ddf.read_table(args.variants)
#print variants.head()

# filter variants
print('setting up variant filters')
# medium and and high impact
variants1 = variants[variants.impact.isin(['MED', 'HIGH'])]

# in gens of interest
variants2 = variants1[variants1.gene.isin(genes_of_interest)]

# convert back to pandas now that the data frame is small
print('computing and returing pandas data frame')
voi = variants2.compute()

# reorganize data frame so that rows are genes of interest, columns are IIDs and value are coutns of variants
Code example #25
File: OM.py Project: CamilaDuitama/RascovanProject
    sources = list(colnames["Run_accession"])
    classes = sorted(list(set(metadata["True_label"])))

    #Sort metadata according to column order in matrix DataFrame
    sorted_metadata = pd.DataFrame(columns=metadata.columns)
    for j in sources:
        sorted_metadata = pd.concat(
            [sorted_metadata, metadata[metadata["Run_accession"] == j]])
    sorted_metadata.reset_index(drop=True, inplace=True)

    #Build result dataframe
    result = pd.DataFrame(columns=classes +
                          ["Unknown", "Running time", "Sink"])

    #Load k-mer matrix of sources as dataframe
    partition=dd.read_table("matrix_100.pa.txt",header=None,sep=" ",\
                            names=["Kmer"]+list(colnames["Run_accession"]))

    #Drop K-mer column
    partition_array = partition.drop(["Kmer"], axis=1)

    #Define M' matrix of sources
    M_prime = partition_array.values
    M_prime.compute_chunk_sizes()
    #M_prime=M_prime.persist()

    print("Chunk sizes for the M_prime matrix were computed")

    #Create new vector for sink
    s_t = dd.read_table(path_sink, header=None, names=["pa"])
    s_t = s_t["pa"]
    s_t = s_t.values