# Command-line driver: select one file group by rank and rebuild its FASTQ
# from intermediate output, using the name of the hashq file it maps to.
try:
    # Every long option that takes a value needs the trailing '='.  Without
    # it, "--headers DIR" produced an empty argument and the trailing-slash
    # check below failed with IndexError.
    opts, args = getopt.getopt(
        sys.argv[1:], 'hr:i:o:H:',
        ["help", "filerank=", "inputdir=", "outputdir=", "headers="])
except getopt.GetoptError:
    print(help_message)
    sys.exit(2)

for opt, arg in opts:
    if opt in ('-h', '--help'):
        print(help_message)
        sys.exit()
    elif opt in ('-r', '--filerank'):
        # shifted down by one so it can be used as a list index below
        fr = int(arg) - 1
    elif opt in ('-i', '--inputdir'):
        inputdir = arg
        if not inputdir.endswith('/'):
            inputdir += '/'
    elif opt in ('-o', '--outputdir'):
        outputdir = arg
        if not outputdir.endswith('/'):
            outputdir += '/'
    elif opt in ('-H', '--headers'):
        headerdir = arg
        if not headerdir.endswith('/'):
            headerdir += '/'

hashobject = Fastq_Reader(inputdir, outputdir)

# Distinct file groups = basename of every input file up to its last '.'.
# Sorted so that a given rank selects the same group in every process;
# set iteration order is not guaranteed.
FP = glob.glob(os.path.join(hashobject.input_path, '*.*'))
FP = sorted(set(fp[fp.rfind('/') + 1:fp.rfind('.')] for fp in FP))
file_group = FP[fr]

# The group name is itself used as an integer index into the hashq listing.
# NOTE(review): this listing is deliberately left in glob order - presumably
# it has to match the order used by the step that wrote the groups; confirm.
FP = glob.glob(os.path.join(headerdir, '*.hashq.*'))
originating_header = FP[int(file_group)]
originating_header = originating_header[
    originating_header.rfind('/') + 1:originating_header.index('.hashq')]

hashobject.fastq_from_intermediate_output(file_group, originating_header)
try: opts, args = getopt.getopt(sys.argv[1:],'hr:i:o:',["filerank=","inputdir=","outputdir="]) except: print help_message sys.exit(2) for opt, arg in opts: if opt in ('-h','--help'): print help_message sys.exit() elif opt in ('-r',"--filerank"): fr = int(arg)-1 elif opt in ('-i','--inputdir'): inputdir = arg elif opt in ('-o','--outputdir'): outputdir = arg hashobject = Fastq_Reader(inputdir,outputdir) fr = str(fr) + '/' FP = glob.glob(os.path.join(inputdir+fr,'*.fastq')) FP = [fp for fp in FP if (('.mate1.fastq' not in fp) and ('.mate2.fastq' not in fp) and ('.singleton.fastq' not in fp))] for file_prefix in FP: file_prefix = fr + file_prefix[file_prefix.rfind('/')+1:file_prefix.index('.fastq')] read_count = hashobject.sort_read_pairs(file_prefix) if read_count > 0: print file_prefix,'READ COUNT:',str(read_count) else: print file_prefix,'NO READS' FP = glob.glob(os.path.join(inputdir+fr,'*.mate1.fastq')) for fp in FP: base_fp = fp[:fp.index('1.fastq')] fix_read_pairs(base_fp) if (os.stat(base_fp+'1.fastq.tmp').st_size > .9*os.stat(base_fp+'1.fastq').st_size) and (os.stat(base_fp+'2.fastq.tmp').st_size > .9*os.stat(base_fp+'2.fastq').st_size):
import glob
import os
from fastq_reader import Fastq_Reader

help_message = 'usage example: python check_hash_collisions.py -r 2 -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'

if __name__ == "__main__":
    # Report hash collisions for the hashq file selected by -r.
    try:
        # getopt long-option names are given WITHOUT the leading "--".
        # The former "--filerank=" entry meant "--filerank" was never accepted.
        opts, args = getopt.getopt(
            sys.argv[1:], 'hr:i:o:',
            ["help", "filerank=", "inputdir=", "outputdir="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    fr = inputdir = outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-r', '--filerank'):
            # used as-is as an index into the hashq file listing
            fr = int(arg)
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if not inputdir.endswith('/'):
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if not outputdir.endswith('/'):
                outputdir += '/'

    # All three options are needed below; fail with the usage text rather
    # than a NameError/TypeError traceback when one is missing.
    if None in (fr, inputdir, outputdir):
        print(help_message)
        sys.exit(2)

    hashobject = Fastq_Reader(inputdir, outputdir)
    HashFiles = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*'))
    hashobject.infile = HashFiles[fr]

    t, n, H = hashobject.collision_report()
    print('total hashed kmers: %s' % (t,))
    print('total pairwise collisions: %s' % (n,))
    print('collision histogram: %s' % (H,))
sys.exit() elif opt in ('-r',"--filerank"): fr = int(arg)-1 elif opt in ('-i','--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-t','--tmpdir'): tmpdir = arg if tmpdir[-1] != '/': tmpdir += '/' hashobject = Fastq_Reader(inputdir,outputdir) cp = np.load(hashobject.output_path+'cluster_probs.npy') cluster_probs = dict(enumerate(cp)) Hashq_Files = glob.glob(os.path.join(hashobject.input_path,'*.hashq.*')) Hashq_Files = [fp for fp in Hashq_Files if '.tmp' not in fp] Hashq_Files.sort() infile = Hashq_Files[fr] outpart = infile[-6:-3] sample_id = infile[infile.rfind('/')+1:infile.index('.hashq')] tmpdir += str(fr) + '/' os.system('mkdir '+tmpdir) G = [open('%s%s.%s.cols.%d' % (tmpdir,sample_id,outpart,i),'w') for i in range(0,2**hashobject.hash_size,2**hashobject.hash_size/50)] f = gzip.open(infile) r_id = 0 for a in hashobject.hash_read_generator(f): for x in a[2]:
help_message = 'usage example: python assembly_summary.py -i /project/home/'

if __name__ == "__main__":
    # Set-up for the assembly/alignment summary: parse -i, load the name
    # tables and the sample list, and open the summary CSV with its header.
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hi:', ["help", "inputdir="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    inputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if not inputdir.endswith('/'):
                inputdir += '/'

    # -i is required by everything below; show usage instead of a NameError.
    if inputdir is None:
        print(help_message)
        sys.exit(2)

    # The same directory serves as both input and output path of the reader.
    partitions_dir = inputdir + 'read_partitions/'
    hashobject = Fastq_Reader(partitions_dir, partitions_dir)

    # Pickled name tables; the paths are defined elsewhere in this module.
    with open(bact_names_path) as f:
        BNames = cPickle.load(f)
    with open(vir_names_path) as f:
        VNames = cPickle.load(f)

    # Flatten every row of the library-grouping CSV into one list of ids.
    Sample_ids = []
    with open(inputdir + 'lib_estimates/samples_grouped_by_lib.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            Sample_ids.extend(row)

    # Left open on purpose: only the header row is written here.
    # NOTE(review): presumably later code appends rows and closes it - confirm.
    f_main = open(inputdir + 'assembly_alignment_summary.csv', 'w')
    writer_main = csv.writer(f_main)
    writer_main.writerow(['partition', 'N50', 'largest contig', 'total bp',
                          'scaffolds', 'top bacterial alignment',
                          'alignment length', 'top viral alignment',
                          'alignment length'])
inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-z', '--reversecomp'): do_reverse_compliment = False FP = glob.glob(os.path.join(inputdir, '*.fastq.*')) if len(FP) == 0: # single file per-sample FP = glob.glob(os.path.join(inputdir, '*.fastq')) file_prefix = FP[fr] file_split = file_prefix[file_prefix.index('.fastq') + 6:] file_prefix = file_prefix[file_prefix.rfind('/') + 1:file_prefix.index('.fastq')] hashobject = Fastq_Reader(inputdir, outputdir) f = open(hashobject.input_path + file_prefix + '.fastq' + file_split, 'r') read_type = hashobject.id_type(f) g = gzip.open( hashobject.output_path + file_prefix + '.hashq' + file_split + '.gz', 'wb') hashobject.hpfx = hashobject.hpfx + str(hashobject.kmer_size) + ',' A = [] reads_hashed = 0 while A != None: try: A, B = hashobject.generator_to_bins(hashobject.read_generator( f, max_reads=25000, verbose_ids=True), rc=do_reverse_compliment) for b in range(len(B)): reads_hashed += kmer_bins(B[b], A, hashobject.hpfx, g,
if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-z','--reversecomp'): do_reverse_compliment = False FP = glob.glob(os.path.join(inputdir,'*.fastq.*')) if len(FP) == 0: # single file per-sample FP = glob.glob(os.path.join(inputdir,'*.fastq')) file_prefix = FP[fr] file_split = file_prefix[file_prefix.index('.fastq')+6:] file_prefix = file_prefix[file_prefix.rfind('/')+1:file_prefix.index('.fastq')] hashobject = Fastq_Reader(inputdir,outputdir) f = open(hashobject.input_path+file_prefix+'.fastq'+file_split,'r') read_type = hashobject.id_type(f) g = gzip.open(hashobject.output_path+file_prefix+'.hashq'+file_split+'.gz','wb') hashobject.hpfx = hashobject.hpfx + str(hashobject.kmer_size)+',' A = [] reads_hashed = 0 while A != None: try: A,B = hashobject.generator_to_bins(hashobject.read_generator(f,max_reads=25000,verbose_ids=True),rc=do_reverse_compliment) for b in range(len(B)): reads_hashed += kmer_bins(B[b],A,hashobject.hpfx,g,read_type) except Exception,err: pass #print str(err) f.close()
args = parser.parse_args() return args # MAIN if __name__ == "__main__": args = interface() input_dir = os.path.abspath(args.IN) if not input_dir.endswith('/'): input_dir += '/' output_dir = os.path.abspath(args.OUT) if not output_dir.endswith('/'): output_dir += '/' k_size = args.KMER h_size = args.HASH hashobject = Fastq_Reader(input_dir, output_dir, new_hash=(h_size, k_size)) total_rand_kmers = k_size * h_size * 2 print('[CreateHash] Creating {0} random k-mers in total.'.format( total_rand_kmers)) # Change max_reads to variable in future hashobject.rand_kmers_for_wheel(total_rand_kmers, max_reads=10**6) print('[CreateHash] Setting hash function.') hashobject.set_wheels(wheels=1) os.remove(input_dir + 'random_kmers.fastq') with open(output_dir + 'hashParts.txt', 'w') as f: f.write('{0}\n'.format(2**h_size / 10**6 + 1))
sys.exit() elif opt in ('-r', '--filerank'): fr = int(arg) - 1 elif opt in ('-i', '--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-l', '--libdir'): libdir = arg if libdir[-1] != '/': libdir += '/' hashobject = Fastq_Reader(inputdir, outputdir) Read_Partitions = glob.glob(os.path.join(hashobject.input_path, '*.fastq')) Read_Partitions = [ fp for fp in Read_Partitions if ('.pairs.' not in fp) and ('.singleton.' not in fp) ] Read_Partitions = list( set([ fp[fp.rfind('/') + 1:fp.index('.cluster') + 8] for fp in Read_Partitions ])) Processed_Partitions = glob.glob( os.path.join(hashobject.output_path, '*.cluster_velvet/contigs.fa')) Processed_Partitions = [ fp[len(hashobject.output_path):fp.index('.cluster') + 8] for fp in Processed_Partitions
sys.exit(2) for opt, arg in opts: if opt in ('-h','--help'): print help_message sys.exit() elif opt in ('-r',"--filerank"): fr = int(arg)-1 elif opt in ('-i','--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' hashobject = Fastq_Reader(inputdir,outputdir) Hashq_Files = glob.glob(os.path.join(hashobject.input_path,'*.hashq.*')) hashobject.infile = Hashq_Files[fr] hashobject.outfile = hashobject.output_path + 'intermediate_clusters/' + str(fr) hashobject.global_weights = np.load(hashobject.output_path + 'global_weights.npy') global_weight_sum = hashobject.global_weights.sum(dtype=np.float64) Cluster_Files = glob.glob(os.path.join(hashobject.output_path,'*.cluster.npy')) Cluster_Files = [(int(cf[cf.rfind('/')+1:cf.index('.')]),cf) for cf in Cluster_Files] cluster_sizes = np.load(hashobject.output_path+'kmer_cluster_sizes.npy') total_set_size = 0 cluster_weights = [] cluster_keys = [] outpart = 0 for ci,cf in Cluster_Files: # ignore super clusters and super small clusters if cluster_sizes[ci] < 0.2*2**hashobject.hash_size:
input_dir = os.path.abspath(args.IN) if not input_dir.endswith('/'): input_dir += '/' output_dir = os.path.abspath(args.OUT) if not output_dir.endswith('/'): output_dir += '/' tmp_dir = os.path.abspath(args.TMP) if not tmp_dir.endswith('/'): tmp_dir += '/' task_rank = args.task_rank - 1 hashobject = Fastq_Reader(input_dir, output_dir) cp = np.load(hashobject.output_path + 'cluster_probs.npy') cluster_probs = dict(enumerate(cp)) Hashq_Files = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*')) Hashq_Files = [fp for fp in Hashq_Files if '.tmp' not in fp] Hashq_Files.sort() infile = Hashq_Files[task_rank] outpart = infile[-6:-3] sample_id = infile[infile.rfind('/') + 1:infile.index('.hashq')] tmp_dir += 'tmp{0}/'.format(task_rank) os.system('mkdir ' + tmp_dir) G = [ open('{0}{1}.{2}.cols.{3}'.format(tmp_dir, sample_id, outpart, i), 'w') for i in range(0, 2**hashobject.hash_size, 2**hashobject.hash_size /
sys.exit() elif opt in ('-r','--filerank'): fr = int(arg) - 1 elif opt in ('-i','--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-l','--libdir'): libdir = arg if libdir[-1] != '/': libdir += '/' hashobject = Fastq_Reader(inputdir,outputdir) Read_Partitions = glob.glob(os.path.join(hashobject.input_path,'*.fastq')) Read_Partitions = [fp for fp in Read_Partitions if ('.pairs.' not in fp) and ('.singleton.' not in fp)] Read_Partitions = list(set([fp[fp.rfind('/')+1:fp.index('.cluster')+8] for fp in Read_Partitions])) Processed_Partitions = glob.glob(os.path.join(hashobject.output_path,'*.cluster_velvet/contigs.fa')) Processed_Partitions = [fp[len(hashobject.output_path):fp.index('.cluster')+8] for fp in Processed_Partitions] rp = Read_Partitions[fr] if rp not in Processed_Partitions: f = open(libdir+'samples_grouped_by_lib.csv') reader = csv.reader(f) hashobject.sample_library = {} i = 0 for row in reader: for sample in row: hashobject.sample_library[sample] = i i += 1
output_dir += '/' task_rank = args.task_rank - 1 do_reverse_compliment = args.rev_comp FP = glob.glob(os.path.join(input_dir, '*.fastq.*')) if len(FP) == 0: # single file per-sample FP = glob.glob(os.path.join(input_dir, '*.fastq')) file_prefix = FP[task_rank] file_split = file_prefix[file_prefix.index('.fastq') + 6:] file_prefix = file_prefix[file_prefix.rfind('/') + 1:file_prefix.index('.fastq')] hashobject = Fastq_Reader(input_dir, output_dir) reads_file_name = hashobject.input_path + file_prefix + '.fastq' + file_split with Fq.open_gz(reads_file_name) as f: hashobject.quality_codes = Fq.set_quality_codes(reads_file_name) print(reads_file_name) with gzip.open( hashobject.output_path + file_prefix + '.hashq' + file_split + '.gz', 'wt') as g: IDs = [] reads_hashed = 0 print("[HashFastqReads] Starting to hash the reads.") IDs, bins = hashobject.generator_to_bins(Fq.fastq_generator(f), rc=do_reverse_compliment) print("[HashFastqReads] All k-mers hashed.")
from fastq_reader import Fastq_Reader

help_message = 'usage example: python check_hash_collisions.py -r 2 -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'

if __name__ == "__main__":
    # Print a collision report for the hashq file chosen with -r.
    try:
        # Long options must be listed without the "--" prefix; the previous
        # "--filerank=" entry made the long form of -r unusable.
        long_opts = ["help", "filerank=", "inputdir=", "outputdir="]
        opts, args = getopt.getopt(sys.argv[1:], 'hr:i:o:', long_opts)
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    fr = None
    inputdir = None
    outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-r', '--filerank'):
            # taken verbatim as the index into the file listing below
            fr = int(arg)
        elif opt in ('-i', '--inputdir'):
            inputdir = arg if arg.endswith('/') else arg + '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg if arg.endswith('/') else arg + '/'

    # Bail out with the usage text when a required option was not supplied.
    if fr is None or inputdir is None or outputdir is None:
        print(help_message)
        sys.exit(2)

    hashobject = Fastq_Reader(inputdir, outputdir)
    HashFiles = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*'))
    hashobject.infile = HashFiles[fr]

    t, n, H = hashobject.collision_report()
    print('total hashed kmers: %s' % (t,))
    print('total pairwise collisions: %s' % (n,))
    print('collision histogram: %s' % (H,))
seen_add = seen.add return [x for x in array if not (x in seen or seen_add(x))] # MAIN if __name__ == "__main__": args = interface() input_dir = os.path.abspath(args.IN) if not input_dir.endswith('/'): input_dir += '/' output_dir = os.path.abspath(args.OUT) if not output_dir.endswith('/'): output_dir += '/' task_rank = args.task_rank - 1 FP = glob.glob(os.path.join(input_dir, '*.hashq.*')) FP = [fp[fp.rfind('/') + 1:] for fp in FP] FP = list(unique([fp[:fp.index('.')] for fp in FP])) file_prefix = FP[task_rank] hashobject = Fastq_Reader(input_dir, output_dir) H = hashobject.merge_count_fractions(file_prefix) H = np.array(H, dtype=np.uint16) nz = np.nonzero(H)[0] np.save(hashobject.output_path + file_prefix + '.nonzero.npy', nz) print('Sample {0} has {1} nonzero elements and {2} total observed kmers'. format(file_prefix, len(nz), H.sum()))
except: print help_message sys.exit(2) for opt, arg in opts: if opt in ('-h', '--help'): print help_message sys.exit() elif opt in ('-i', '--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' hashobject = Fastq_Reader(outputdir, outputdir) FP = glob.glob(os.path.join(inputdir, '*.fastq.*')) FP = set([fp[fp.rfind('/') + 1:fp.index('.fastq')] for fp in FP]) LibSizes = [] for sample in FP: os.system('head -n600000 ' + inputdir + sample + '.fastq.aa > ' + outputdir + sample + '.fastq') sample_reads = hashobject.sort_read_pairs(sample) if sample_reads > 0: velvetdir = outputdir + sample + '_velvet/' os.system('mkdir ' + velvetdir) os.system( '/import/analysis/comp_bio/metagenomics/src/velvet/velveth ' + velvetdir + ' 31 -fastq -short ' + outputdir + sample + '.singleton.fastq -shortPaired ' + outputdir + sample + '.pairs.fastq')
opts, args = getopt.getopt(sys.argv[1:], 'hr:i:o:', ["filerank=", "inputdir=", "outputdir="]) except: print help_message sys.exit(2) for opt, arg in opts: if opt in ('-h', '--help'): print help_message sys.exit() elif opt in ('-r', "--filerank"): fr = int(arg) - 1 elif opt in ('-i', '--inputdir'): inputdir = arg elif opt in ('-o', '--outputdir'): outputdir = arg hashobject = Fastq_Reader(inputdir, outputdir) fr = str(fr) + '/' FP = glob.glob(os.path.join(inputdir + fr, '*.fastq')) FP = [ fp for fp in FP if (('.mate1.fastq' not in fp) and ( '.mate2.fastq' not in fp) and ('.singleton.fastq' not in fp)) ] for file_prefix in FP: file_prefix = fr + file_prefix[file_prefix.rfind('/') + 1:file_prefix.index('.fastq')] read_count = hashobject.sort_read_pairs(file_prefix) if read_count > 0: print file_prefix, 'READ COUNT:', str(read_count) else: print file_prefix, 'NO READS' FP = glob.glob(os.path.join(inputdir + fr, '*.mate1.fastq'))
sys.exit() elif opt in ('-r', "--filerank"): fr = int(arg) - 1 elif opt in ('-i', '--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o', '--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-t', '--tmpdir'): tmpdir = arg if tmpdir[-1] != '/': tmpdir += '/' hashobject = Fastq_Reader(inputdir, outputdir) cp = np.load(hashobject.output_path + 'cluster_probs.npy') cluster_probs = dict(enumerate(cp)) Hashq_Files = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*')) Hashq_Files = [fp for fp in Hashq_Files if '.tmp' not in fp] Hashq_Files.sort() infile = Hashq_Files[fr] outpart = infile[-6:-3] sample_id = infile[infile.rfind('/') + 1:infile.index('.hashq')] tmpdir += str(fr) + '/' os.system('mkdir ' + tmpdir) G = [ open('%s%s.%s.cols.%d' % (tmpdir, sample_id, outpart, i), 'w') for i in range(0, 2**hashobject.hash_size, 2**hashobject.hash_size / 50) ]
seen = set() seen_add = seen.add return [x for x in array if not (x in seen or seen_add(x))] # MAIN if __name__ == "__main__": args = interface() input_dir = os.path.abspath(args.IN) if not input_dir.endswith('/'): input_dir += '/' output_dir = os.path.abspath(args.OUT) if not output_dir.endswith('/'): output_dir += '/' task_rank = args.task_rank - 1 FP = glob.glob(os.path.join(input_dir, '*.hashq.*')) FP = [fp[fp.rfind('/') + 1:] for fp in FP] FP = list(unique([fp[:fp.index('.')] for fp in FP])) file_prefix = FP[task_rank % len(FP)] print('Merging sample ' + file_prefix) # SUPER DUMB to hardcode the fraction size file_fraction = int(task_rank / len(FP)) hashobject = Fastq_Reader(input_dir, output_dir) H = hashobject.hash_counts_from_hashq(file_prefix, multi_files_fraction=file_fraction)
from fastq_reader import Fastq_Reader

help_message = 'usage example: python merge_hashq_files.py -r 3 -i /project/home/hashed_reads/ -o /project/home/hashed_reads/'

if __name__ == "__main__":
    # Map a task rank onto (sample prefix, file fraction) and compute the
    # hash counts of that slice.
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hr:i:o:',
            ["help", "filerank=", "inputdir=", "outputdir="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    fr = inputdir = outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-r', '--filerank'):
            # shifted down by one so it can be used for indexing below
            fr = int(arg) - 1
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if not inputdir.endswith('/'):
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if not outputdir.endswith('/'):
                outputdir += '/'

    # Show usage instead of a NameError when a required option is missing.
    if None in (fr, inputdir, outputdir):
        print(help_message)
        sys.exit(2)

    # Sample prefixes = basename of each hashq file up to its first '.'.
    # Sorted so that every task derives the same rank -> prefix mapping;
    # the iteration order of a set is not guaranteed.
    FP = glob.glob(os.path.join(inputdir, '*.hashq.*'))
    FP = [fp[fp.rfind('/') + 1:] for fp in FP]
    FP = sorted(set(fp[:fp.index('.')] for fp in FP))

    # Ranks cycle through the prefixes; each full cycle moves on to the
    # next file fraction.
    file_prefix = FP[fr % len(FP)]
    # SUPER DUMB to hardcode the fraction size
    # Explicit floor division: stays an int under true division as well.
    file_fraction = fr // len(FP)

    hashobject = Fastq_Reader(inputdir, outputdir)
    H = hashobject.hash_counts_from_hashq(file_prefix, multi_files_fraction=file_fraction)
import numpy as np
from fastq_reader import Fastq_Reader

help_message = 'usage example: python merge_hashq_files.py -r 3 -i /project/home/hashed_reads/ -o /project/home/hashed_reads/'

if __name__ == "__main__":
    # Merge the count fractions of the sample selected by -r and save the
    # indices of its nonzero entries.
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hr:i:o:',
            ["help", "filerank=", "inputdir=", "outputdir="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    fr = inputdir = outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            # 'h' was accepted by the option string but silently ignored
            print(help_message)
            sys.exit()
        elif opt in ('-r', '--filerank'):
            # shifted down by one so it can be used as a list index below
            fr = int(arg) - 1
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if not inputdir.endswith('/'):
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if not outputdir.endswith('/'):
                outputdir += '/'

    # Show usage instead of a NameError when a required option is missing.
    if None in (fr, inputdir, outputdir):
        print(help_message)
        sys.exit(2)

    # Sample prefixes = basename of each hashq file up to its first '.'.
    # Sorted so the rank -> sample mapping is the same in every process.
    FP = glob.glob(os.path.join(inputdir, '*.hashq.*'))
    FP = [fp[fp.rfind('/') + 1:] for fp in FP]
    FP = sorted(set(fp[:fp.index('.')] for fp in FP))
    file_prefix = FP[fr]

    hashobject = Fastq_Reader(inputdir, outputdir)
    H = hashobject.merge_count_fractions(file_prefix)
    H = np.array(H, dtype=np.uint16)
    nz = np.nonzero(H)[0]
    np.save(hashobject.output_path + file_prefix + '.nonzero.npy', nz)
    print('sample %s has %d nonzero elements and %d total observed kmers' % (file_prefix, len(nz), H.sum()))
outputdir = arg if outputdir[-1] != '/': outputdir += '/' elif opt in ('-k', '--kmersize'): k_size = int(arg) elif opt in ('-s', '--hashsize'): h_size = int(arg) # Detect file format fastq = True suffix = 'fastq' FP = glob.glob(os.path.join(inputdir, '*.' + suffix + '.*')) if len(FP) == 0: # single file per-sample FP = glob.glob(os.path.join(inputdir, '*.' + suffix)) if len(FP) == 0: suffix = 'fa' fastq = False FP = glob.glob(os.path.join(inputdir, '*.' + suffix)) hashobject = Fastq_Reader(inputdir, outputdir, new_hash=(h_size, k_size), fastq=fastq) total_rand_kmers = k_size * h_size * 2 hashobject.rand_kmers_for_wheel(total_rand_kmers) hashobject.set_wheels(wheels=1) os.system('rm %s/random_kmers.fastq' % inputdir) f = open(outputdir + 'hashParts.txt', 'w') f.write('%d\n' % (2**h_size / 10**6 + 1)) f.close()
except: print help_message sys.exit(2) for opt, arg in opts: if opt in ('-h','--help'): print help_message sys.exit() elif opt in ('-i','--inputdir'): inputdir = arg if inputdir[-1] != '/': inputdir += '/' elif opt in ('-o','--outputdir'): outputdir = arg if outputdir[-1] != '/': outputdir += '/' hashobject = Fastq_Reader(outputdir,outputdir) FP = glob.glob(os.path.join(inputdir,'*.fastq.*')) FP = set([fp[fp.rfind('/')+1:fp.index('.fastq')] for fp in FP]) LibSizes = [] for sample in FP: os.system('head -n600000 '+inputdir+sample+'.fastq.aa > '+outputdir+sample+'.fastq') sample_reads = hashobject.sort_read_pairs(sample) if sample_reads > 0: velvetdir = outputdir+sample+'_velvet/' os.system('mkdir '+velvetdir) os.system('/import/analysis/comp_bio/metagenomics/src/velvet/velveth '+velvetdir+' 31 -fastq -short '+outputdir+sample+'.singleton.fastq -shortPaired '+outputdir+sample+'.pairs.fastq') os.system('/import/analysis/comp_bio/metagenomics/src/velvet/velvetg '+velvetdir+' -exp_cov auto | grep -ir "Paired-end library 1 has length:" > '+velvetdir+'LibLengths') libsize = parse_velvet_log(velvetdir) if libsize != None: LibSizes.append((libsize,sample)) os.system('rm -r '+outputdir+sample+'*')
help_message = "usage example: python create_hash.py -i /project/home/original_reads/ -o /project/home/hashed_reads/ -k kmer_size -s hash_size"

if __name__ == "__main__":
    # Build a new hash function for the given k-mer and hash sizes, then
    # write a single number derived from the hash size to hashParts.txt.
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hi:o:k:s:',
            ["help", "inputdir=", "outputdir=", "kmersize=", "hashsize="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    inputdir = outputdir = k_size = h_size = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if not inputdir.endswith('/'):
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if not outputdir.endswith('/'):
                outputdir += '/'
        elif opt in ('-k', '--kmersize'):
            k_size = int(arg)
        elif opt in ('-s', '--hashsize'):
            h_size = int(arg)

    # All four options are required; show usage instead of a NameError.
    if None in (inputdir, outputdir, k_size, h_size):
        print(help_message)
        sys.exit(2)

    hashobject = Fastq_Reader(inputdir, outputdir, new_hash=(h_size, k_size))
    total_rand_kmers = k_size * h_size * 2
    hashobject.rand_kmers_for_wheel(total_rand_kmers)
    hashobject.set_wheels(wheels=1)

    # Remove the temporary k-mer file without going through a shell, so a
    # path with spaces or shell metacharacters cannot break the command.
    # Still best-effort: a failed removal is reported, not fatal.
    random_kmers_path = os.path.join(inputdir, 'random_kmers.fastq')
    try:
        os.remove(random_kmers_path)
    except OSError as err:
        sys.stderr.write('could not remove %s: %s\n' % (random_kmers_path, err))

    # Explicit floor division keeps this an integer under true division too.
    with open(outputdir + 'hashParts.txt', 'w') as f:
        f.write('%d\n' % (2**h_size // 10**6 + 1))