def do_wheels(s, w=1):
    # Build test hash wheels: take the kmer length from one document in the
    # test_genome kmers collection, create wheel(s) with s spokes, then load them back.
    db = conn['test_genome']
    d = len(db.kmers.find_one()['s'])
    set_wheels(d, realm='test_genome', spokes=s, wheels=w, out_path='/mnt/test_')
    W = get_wheels('/mnt/test_Wheels.txt')
    return W
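# Usage sketch for do_wheels (assumptions: `conn` is a pymongo MongoClient with kmers
# loaded into the 'test_genome' database, and get_wheels comes from hyper_sequences
# alongside set_wheels -- adjust the import if it lives elsewhere). The argument
# values below are illustrative only.
from pymongo import MongoClient
from hyper_sequences import set_wheels, get_wheels

conn = MongoClient()      # local mongod holding the test_genome.kmers collection
W = do_wheels(50, w=2)    # e.g. 50 spokes, 2 wheels -> /mnt/test_Wheels.txt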
# Latent Strain Analysis - Full Procedure

# CREATING THE HASH
from fastq_reader import rand_kmers_for_wheel
rand_kmers_for_wheel('/mnt/grinder_output', 35, 2000)
from hyper_sequences import set_wheels
set_wheels(35, random_kmer_path='/mnt/grinder_output/random_kmers.fastq', wheels=1, out_path='/mnt/grinder_output/')

# HASHING THE RAW READS
# via MapReduce (large data sets)
# test locally like: $ head -n100 somefile.fastq | python fastq_read_mapper.py | sort -k1 | python read_reducer.py
# upload all fastq files to the S3 bucket data directory (e.g. s3://cleary-metagenomics/data)
# create an Elastic MapReduce job:
#   input dir:  s3n://cleary-metagenomics/data
#   output dir: s3n://cleary-metagenomics/hashed_reads
#   mapper:     s3n://cleary-metagenomics/job/fastq_read_mapper.py
#   reducer:    s3n://cleary-metagenomics/job/read_reducer.py
#   extra args: -cacheFile s3n://cleary-metagenomics/cache_files/Wheels.txt#Wheels.txt
# with 20M reads this took 2h18m running on 19 m1.large instances
# download results: s3cmd get s3://cleary-metagenomics/hashed_reads/part*
from merge_read_bins import merge_files
merge_files('/mnt2/mr_out/', '/mnt/grinder_output/', out_prefix='/mnt/grinder_output/hashed_reads/')
# merging 34 parts of ~700MB each took a couple of hours
from read_partitioning import hash_counts_from_hashq
# hashed_read_files: list of the merged hashq file paths produced above (e.g. collected with glob)
for hf in hashed_read_files:
    hash_counts_from_hashq(28, hf, out_path='/mnt/grinder_output/' + hf[hf.rfind('/')+1:hf.index('.')] + 'Kmer_Hash_Counts.txt')

# on a single node (small data sets)
# read_partitioning: create_kmer_hash_counts (should write hashq!)

# CONDITION, SVD, AND CLUSTER KMER ABUNDANCE MATRIX
from eigenhashes import *
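# The conditioning / SVD / clustering step itself lives in the eigenhashes module.
# Below is a minimal illustrative sketch of the idea using generic numpy/scipy/scikit-learn
# calls; every name here (the counts matrix path, the log1p conditioning, k, n_clusters)
# is an assumption for illustration, not the eigenhashes API.
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.cluster import KMeans

# counts: kmer-hash-bin x sample abundance matrix assembled from the *Kmer_Hash_Counts.txt files
counts = csr_matrix(np.loadtxt('/mnt/grinder_output/Kmer_Hash_Counts.txt'))  # hypothetical path

# condition: dampen scale differences with a log transform (illustrative choice)
conditioned = counts.log1p()

# truncated SVD of the conditioned abundance matrix
U, s, Vt = svds(conditioned, k=20)

# cluster kmer bins in the reduced eigen-space (scaled left singular vectors)
labels = KMeans(n_clusters=100).fit_predict(U * s)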