try:
		opts, args = getopt.getopt(sys.argv[1:],'hr:i:o:H:',["filerank=","inputdir=","outputdir=","headers"])
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-r',"--filerank"):
			fr = int(arg)-1
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
		elif opt in ('-H',"--headers"):
			headerdir = arg
			if headerdir[-1] != '/':
				headerdir += '/'
	hashobject = Fastq_Reader(inputdir,outputdir)
	FP = glob.glob(os.path.join(hashobject.input_path,'*.*'))
	FP = list(set([fp[fp.rfind('/')+1:fp.rfind('.')] for fp in FP]))
	file_group = FP[fr]
	FP = glob.glob(os.path.join(headerdir,'*.hashq.*'))
	originating_header = FP[int(file_group)]
	originating_header = originating_header[originating_header.rfind('/')+1:originating_header.index('.hashq')]
	hashobject.fastq_from_intermediate_output(file_group,originating_header)
	try:
		opts, args = getopt.getopt(sys.argv[1:],'hr:i:o:',["filerank=","inputdir=","outputdir="])
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-r',"--filerank"):
			fr = int(arg)-1
		elif opt in ('-i','--inputdir'):
			inputdir = arg
		elif opt in ('-o','--outputdir'):
			outputdir = arg
	hashobject = Fastq_Reader(inputdir,outputdir)
	fr = str(fr) + '/'
	FP = glob.glob(os.path.join(inputdir+fr,'*.fastq'))
	FP = [fp for fp in FP if (('.mate1.fastq' not in fp) and ('.mate2.fastq' not in fp) and ('.singleton.fastq' not in fp))]
	for file_prefix in FP:
		file_prefix = fr + file_prefix[file_prefix.rfind('/')+1:file_prefix.index('.fastq')]
		read_count = hashobject.sort_read_pairs(file_prefix)
		if read_count > 0:
			print file_prefix,'READ COUNT:',str(read_count)
		else:
			print file_prefix,'NO READS'
	FP = glob.glob(os.path.join(inputdir+fr,'*.mate1.fastq'))
	for fp in FP:
		base_fp = fp[:fp.index('1.fastq')]
		fix_read_pairs(base_fp)
		if (os.stat(base_fp+'1.fastq.tmp').st_size > .9*os.stat(base_fp+'1.fastq').st_size) and (os.stat(base_fp+'2.fastq.tmp').st_size > .9*os.stat(base_fp+'2.fastq').st_size):
import glob,os
from fastq_reader import Fastq_Reader

help_message = 'usage example: python check_hash_collisions.py -r 2 -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'
if __name__ == "__main__":
	try:
		opts, args = getopt.getopt(sys.argv[1:],'hr:i:o:',["--filerank=","inputdir=","outputdir="])
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-r',"--filerank"):
			fr = int(arg)
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
	hashobject = Fastq_Reader(inputdir,outputdir)
	HashFiles = glob.glob(os.path.join(hashobject.input_path,'*.hashq.*'))
	hashobject.infile = HashFiles[fr]
	t,n,H = hashobject.collision_report()
	print 'total hashed kmers:',t
	print 'total pairwise collisions:',n
	print 'collision histogram:',H
			sys.exit()
		elif opt in ('-r',"--filerank"):
			fr = int(arg)-1
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
		elif opt in ('-t','--tmpdir'):
			tmpdir = arg
			if tmpdir[-1] != '/':
				tmpdir += '/'
	hashobject = Fastq_Reader(inputdir,outputdir)
	cp = np.load(hashobject.output_path+'cluster_probs.npy')
	cluster_probs = dict(enumerate(cp))
	Hashq_Files = glob.glob(os.path.join(hashobject.input_path,'*.hashq.*'))
	Hashq_Files = [fp for fp in Hashq_Files if '.tmp' not in fp]
	Hashq_Files.sort()
	infile = Hashq_Files[fr]
	outpart = infile[-6:-3]
	sample_id = infile[infile.rfind('/')+1:infile.index('.hashq')]
	tmpdir += str(fr) + '/'
	os.system('mkdir '+tmpdir)
	G = [open('%s%s.%s.cols.%d' % (tmpdir,sample_id,outpart,i),'w') for i in range(0,2**hashobject.hash_size,2**hashobject.hash_size/50)]
	f = gzip.open(infile)
	r_id = 0
	for a in hashobject.hash_read_generator(f):
		for x in a[2]:
Example #5
0
help_message = 'usage example: python assembly_summary.py -i /project/home/'
if __name__ == "__main__":
	try:
		opts, args = getopt.getopt(sys.argv[1:],'hi:',["inputdir="])
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
	# Both input and output live under <inputdir>/read_partitions/.
	hashobject = Fastq_Reader(inputdir+'read_partitions/',inputdir+'read_partitions/')
	# NOTE(review): bact_names_path and vir_names_path are not defined in this
	# fragment — presumably module-level constants set elsewhere; confirm.
	f = open(bact_names_path)
	BNames = cPickle.load(f)
	f.close()
	f = open(vir_names_path)
	VNames = cPickle.load(f)
	f.close()
	# Flatten the library-grouped CSV into one list of sample ids
	# (each row holds the samples of one library).
	f = open(inputdir+'lib_estimates/samples_grouped_by_lib.csv')
	reader = csv.reader(f)
	Sample_ids = []
	for row in reader:
		Sample_ids += row
	f.close()
	# Open the summary CSV and emit its header row.
	f_main = open(inputdir+'assembly_alignment_summary.csv','w')
	writer_main = csv.writer(f_main)
	writer_main.writerow(['partition','N50','largest contig','total bp','scaffolds','top bacterial alignment','alignment length','top viral alignment','alignment length'])
Example #6
0
             inputdir += '/'
     elif opt in ('-o', '--outputdir'):
         outputdir = arg
         if outputdir[-1] != '/':
             outputdir += '/'
     elif opt in ('-z', '--reversecomp'):
         do_reverse_compliment = False
 FP = glob.glob(os.path.join(inputdir, '*.fastq.*'))
 if len(FP) == 0:
     # single file per-sample
     FP = glob.glob(os.path.join(inputdir, '*.fastq'))
 file_prefix = FP[fr]
 file_split = file_prefix[file_prefix.index('.fastq') + 6:]
 file_prefix = file_prefix[file_prefix.rfind('/') +
                           1:file_prefix.index('.fastq')]
 hashobject = Fastq_Reader(inputdir, outputdir)
 f = open(hashobject.input_path + file_prefix + '.fastq' + file_split, 'r')
 read_type = hashobject.id_type(f)
 g = gzip.open(
     hashobject.output_path + file_prefix + '.hashq' + file_split + '.gz',
     'wb')
 hashobject.hpfx = hashobject.hpfx + str(hashobject.kmer_size) + ','
 A = []
 reads_hashed = 0
 while A != None:
     try:
         A, B = hashobject.generator_to_bins(hashobject.read_generator(
             f, max_reads=25000, verbose_ids=True),
                                             rc=do_reverse_compliment)
         for b in range(len(B)):
             reads_hashed += kmer_bins(B[b], A, hashobject.hpfx, g,
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
		elif opt in ('-z','--reversecomp'):
			do_reverse_compliment = False
	FP = glob.glob(os.path.join(inputdir,'*.fastq.*'))
	if len(FP) == 0:
		# single file per-sample
		FP = glob.glob(os.path.join(inputdir,'*.fastq'))
	file_prefix = FP[fr]
	file_split = file_prefix[file_prefix.index('.fastq')+6:]
	file_prefix = file_prefix[file_prefix.rfind('/')+1:file_prefix.index('.fastq')]
	hashobject = Fastq_Reader(inputdir,outputdir)
	f = open(hashobject.input_path+file_prefix+'.fastq'+file_split,'r')
	read_type = hashobject.id_type(f)
	g = gzip.open(hashobject.output_path+file_prefix+'.hashq'+file_split+'.gz','wb')
	hashobject.hpfx = hashobject.hpfx + str(hashobject.kmer_size)+','
	A = []
	reads_hashed = 0
	while A != None:
		try:
			A,B = hashobject.generator_to_bins(hashobject.read_generator(f,max_reads=25000,verbose_ids=True),rc=do_reverse_compliment)
			for b in range(len(B)):
				reads_hashed += kmer_bins(B[b],A,hashobject.hpfx,g,read_type)
		except Exception,err:
			pass
			#print str(err)
	f.close()
Example #8
0
    args = parser.parse_args()
    return args


# MAIN
if __name__ == "__main__":
    args = interface()

    # Normalize both directories so they end with '/'.
    input_dir = os.path.abspath(args.IN)
    if not input_dir.endswith('/'):
        input_dir += '/'

    output_dir = os.path.abspath(args.OUT)
    if not output_dir.endswith('/'):
        output_dir += '/'

    k_size = args.KMER
    h_size = args.HASH

    # Build a fresh hash function from random k-mers drawn from the reads.
    hashobject = Fastq_Reader(input_dir, output_dir, new_hash=(h_size, k_size))
    total_rand_kmers = k_size * h_size * 2
    print('[CreateHash] Creating {0} random k-mers in total.'.format(
        total_rand_kmers))
    # Change max_reads to variable in future
    hashobject.rand_kmers_for_wheel(total_rand_kmers, max_reads=10**6)
    print('[CreateHash] Setting hash function.')
    hashobject.set_wheels(wheels=1)
    os.remove(input_dir + 'random_kmers.fastq')
    with open(output_dir + 'hashParts.txt', 'w') as f:
        # Number of 10**6-sized partitions of the hash space. Use floor
        # division: under Python 3, '/' yields a float and would write
        # e.g. "1074.741825" instead of an integer partition count
        # (the Python 2 sibling of this script writes '%d' with int division).
        f.write('{0}\n'.format(2**h_size // 10**6 + 1))
Example #9
0
         sys.exit()
     elif opt in ('-r', '--filerank'):
         fr = int(arg) - 1
     elif opt in ('-i', '--inputdir'):
         inputdir = arg
         if inputdir[-1] != '/':
             inputdir += '/'
     elif opt in ('-o', '--outputdir'):
         outputdir = arg
         if outputdir[-1] != '/':
             outputdir += '/'
     elif opt in ('-l', '--libdir'):
         libdir = arg
         if libdir[-1] != '/':
             libdir += '/'
 hashobject = Fastq_Reader(inputdir, outputdir)
 Read_Partitions = glob.glob(os.path.join(hashobject.input_path, '*.fastq'))
 Read_Partitions = [
     fp for fp in Read_Partitions
     if ('.pairs.' not in fp) and ('.singleton.' not in fp)
 ]
 Read_Partitions = list(
     set([
         fp[fp.rfind('/') + 1:fp.index('.cluster') + 8]
         for fp in Read_Partitions
     ]))
 Processed_Partitions = glob.glob(
     os.path.join(hashobject.output_path, '*.cluster_velvet/contigs.fa'))
 Processed_Partitions = [
     fp[len(hashobject.output_path):fp.index('.cluster') + 8]
     for fp in Processed_Partitions
Example #10
0
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-r',"--filerank"):
			fr = int(arg)-1
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
	hashobject = Fastq_Reader(inputdir,outputdir)
	Hashq_Files = glob.glob(os.path.join(hashobject.input_path,'*.hashq.*'))
	hashobject.infile = Hashq_Files[fr]
	hashobject.outfile = hashobject.output_path + 'intermediate_clusters/' + str(fr)
	hashobject.global_weights = np.load(hashobject.output_path + 'global_weights.npy')
	global_weight_sum = hashobject.global_weights.sum(dtype=np.float64)
	Cluster_Files = glob.glob(os.path.join(hashobject.output_path,'*.cluster.npy'))
	Cluster_Files = [(int(cf[cf.rfind('/')+1:cf.index('.')]),cf) for cf in Cluster_Files]
	cluster_sizes = np.load(hashobject.output_path+'kmer_cluster_sizes.npy')
	total_set_size = 0
	cluster_weights = []
	cluster_keys = []
	outpart = 0
	for ci,cf in Cluster_Files:
		# ignore super clusters and super small clusters
		if cluster_sizes[ci] < 0.2*2**hashobject.hash_size:
Example #11
0
    input_dir = os.path.abspath(args.IN)
    if not input_dir.endswith('/'):
        input_dir += '/'

    output_dir = os.path.abspath(args.OUT)
    if not output_dir.endswith('/'):
        output_dir += '/'

    tmp_dir = os.path.abspath(args.TMP)
    if not tmp_dir.endswith('/'):
        tmp_dir += '/'

    task_rank = args.task_rank - 1

    hashobject = Fastq_Reader(input_dir, output_dir)
    cp = np.load(hashobject.output_path + 'cluster_probs.npy')
    cluster_probs = dict(enumerate(cp))
    Hashq_Files = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*'))
    Hashq_Files = [fp for fp in Hashq_Files if '.tmp' not in fp]
    Hashq_Files.sort()

    infile = Hashq_Files[task_rank]
    outpart = infile[-6:-3]
    sample_id = infile[infile.rfind('/') + 1:infile.index('.hashq')]
    tmp_dir += 'tmp{0}/'.format(task_rank)
    os.system('mkdir ' + tmp_dir)

    G = [
        open('{0}{1}.{2}.cols.{3}'.format(tmp_dir, sample_id, outpart, i), 'w')
        for i in range(0, 2**hashobject.hash_size, 2**hashobject.hash_size /
			sys.exit()
		elif opt in ('-r','--filerank'):
			fr = int(arg) - 1
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
		elif opt in ('-l','--libdir'):
			libdir = arg
			if libdir[-1] != '/':
				libdir += '/'
	hashobject = Fastq_Reader(inputdir,outputdir)
	Read_Partitions = glob.glob(os.path.join(hashobject.input_path,'*.fastq'))
	Read_Partitions = [fp for fp in Read_Partitions if ('.pairs.' not in fp) and ('.singleton.' not in fp)]
	Read_Partitions = list(set([fp[fp.rfind('/')+1:fp.index('.cluster')+8] for fp in Read_Partitions]))
	Processed_Partitions = glob.glob(os.path.join(hashobject.output_path,'*.cluster_velvet/contigs.fa'))
	Processed_Partitions = [fp[len(hashobject.output_path):fp.index('.cluster')+8] for fp in Processed_Partitions]
	rp = Read_Partitions[fr]
	if rp not in Processed_Partitions:
		f = open(libdir+'samples_grouped_by_lib.csv')
		reader = csv.reader(f)
		hashobject.sample_library = {}
		i = 0
		for row in reader:
			for sample in row:
				hashobject.sample_library[sample] = i
			i += 1
Example #13
0
        output_dir += '/'

    task_rank = args.task_rank - 1
    do_reverse_compliment = args.rev_comp

    FP = glob.glob(os.path.join(input_dir, '*.fastq.*'))
    if len(FP) == 0:
        # single file per-sample
        FP = glob.glob(os.path.join(input_dir, '*.fastq'))

    file_prefix = FP[task_rank]
    file_split = file_prefix[file_prefix.index('.fastq') + 6:]
    file_prefix = file_prefix[file_prefix.rfind('/') +
                              1:file_prefix.index('.fastq')]

    hashobject = Fastq_Reader(input_dir, output_dir)
    reads_file_name = hashobject.input_path + file_prefix + '.fastq' + file_split

    with Fq.open_gz(reads_file_name) as f:
        hashobject.quality_codes = Fq.set_quality_codes(reads_file_name)
        print(reads_file_name)
        with gzip.open(
                hashobject.output_path + file_prefix + '.hashq' + file_split +
                '.gz', 'wt') as g:
            IDs = []
            reads_hashed = 0
            print("[HashFastqReads] Starting to hash the reads.")
            IDs, bins = hashobject.generator_to_bins(Fq.fastq_generator(f),
                                                     rc=do_reverse_compliment)

            print("[HashFastqReads] All k-mers hashed.")
from fastq_reader import Fastq_Reader

help_message = 'usage example: python check_hash_collisions.py -r 2 -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'
if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hr:i:o:',
                                   ["--filerank=", "inputdir=", "outputdir="])
    except:
        print help_message
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print help_message
            sys.exit()
        elif opt in ('-r', "--filerank"):
            fr = int(arg)
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if inputdir[-1] != '/':
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if outputdir[-1] != '/':
                outputdir += '/'
    hashobject = Fastq_Reader(inputdir, outputdir)
    HashFiles = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*'))
    hashobject.infile = HashFiles[fr]
    t, n, H = hashobject.collision_report()
    print 'total hashed kmers:', t
    print 'total pairwise collisions:', n
    print 'collision histogram:', H
    seen_add = seen.add
    return [x for x in array if not (x in seen or seen_add(x))]


# MAIN
if __name__ == "__main__":
    args = interface()

    # Normalize the two directories so each ends with '/'.
    input_dir = os.path.abspath(args.IN)
    input_dir = input_dir if input_dir.endswith('/') else input_dir + '/'

    output_dir = os.path.abspath(args.OUT)
    output_dir = output_dir if output_dir.endswith('/') else output_dir + '/'

    # 1-based scheduler rank -> 0-based task index.
    task_rank = args.task_rank - 1

    # Distinct sample prefixes: strip the directory part, then keep the
    # basename up to its first '.'.
    hashq_paths = glob.glob(os.path.join(input_dir, '*.hashq.*'))
    basenames = [p[p.rfind('/') + 1:] for p in hashq_paths]
    sample_prefixes = list(unique([b[:b.index('.')] for b in basenames]))
    file_prefix = sample_prefixes[task_rank]

    # Merge this sample's per-fraction counts and persist the nonzero indices.
    hashobject = Fastq_Reader(input_dir, output_dir)
    counts = np.array(hashobject.merge_count_fractions(file_prefix),
                      dtype=np.uint16)
    nz = np.nonzero(counts)[0]
    np.save(hashobject.output_path + file_prefix + '.nonzero.npy', nz)

    print('Sample {0} has {1} nonzero elements and {2} total observed kmers'.
          format(file_prefix, len(nz), counts.sum()))
Example #16
0
 except:
     print help_message
     sys.exit(2)
 for opt, arg in opts:
     if opt in ('-h', '--help'):
         print help_message
         sys.exit()
     elif opt in ('-i', '--inputdir'):
         inputdir = arg
         if inputdir[-1] != '/':
             inputdir += '/'
     elif opt in ('-o', '--outputdir'):
         outputdir = arg
         if outputdir[-1] != '/':
             outputdir += '/'
 hashobject = Fastq_Reader(outputdir, outputdir)
 FP = glob.glob(os.path.join(inputdir, '*.fastq.*'))
 FP = set([fp[fp.rfind('/') + 1:fp.index('.fastq')] for fp in FP])
 LibSizes = []
 for sample in FP:
     os.system('head -n600000 ' + inputdir + sample + '.fastq.aa > ' +
               outputdir + sample + '.fastq')
     sample_reads = hashobject.sort_read_pairs(sample)
     if sample_reads > 0:
         velvetdir = outputdir + sample + '_velvet/'
         os.system('mkdir ' + velvetdir)
         os.system(
             '/import/analysis/comp_bio/metagenomics/src/velvet/velveth ' +
             velvetdir + ' 31 -fastq -short ' + outputdir + sample +
             '.singleton.fastq -shortPaired ' + outputdir + sample +
             '.pairs.fastq')
     opts, args = getopt.getopt(sys.argv[1:], 'hr:i:o:',
                                ["filerank=", "inputdir=", "outputdir="])
 except:
     print help_message
     sys.exit(2)
 for opt, arg in opts:
     if opt in ('-h', '--help'):
         print help_message
         sys.exit()
     elif opt in ('-r', "--filerank"):
         fr = int(arg) - 1
     elif opt in ('-i', '--inputdir'):
         inputdir = arg
     elif opt in ('-o', '--outputdir'):
         outputdir = arg
 hashobject = Fastq_Reader(inputdir, outputdir)
 fr = str(fr) + '/'
 FP = glob.glob(os.path.join(inputdir + fr, '*.fastq'))
 FP = [
     fp for fp in FP if (('.mate1.fastq' not in fp) and (
         '.mate2.fastq' not in fp) and ('.singleton.fastq' not in fp))
 ]
 for file_prefix in FP:
     file_prefix = fr + file_prefix[file_prefix.rfind('/') +
                                    1:file_prefix.index('.fastq')]
     read_count = hashobject.sort_read_pairs(file_prefix)
     if read_count > 0:
         print file_prefix, 'READ COUNT:', str(read_count)
     else:
         print file_prefix, 'NO READS'
 FP = glob.glob(os.path.join(inputdir + fr, '*.mate1.fastq'))
Example #18
0
         sys.exit()
     elif opt in ('-r', "--filerank"):
         fr = int(arg) - 1
     elif opt in ('-i', '--inputdir'):
         inputdir = arg
         if inputdir[-1] != '/':
             inputdir += '/'
     elif opt in ('-o', '--outputdir'):
         outputdir = arg
         if outputdir[-1] != '/':
             outputdir += '/'
     elif opt in ('-t', '--tmpdir'):
         tmpdir = arg
         if tmpdir[-1] != '/':
             tmpdir += '/'
 hashobject = Fastq_Reader(inputdir, outputdir)
 cp = np.load(hashobject.output_path + 'cluster_probs.npy')
 cluster_probs = dict(enumerate(cp))
 Hashq_Files = glob.glob(os.path.join(hashobject.input_path, '*.hashq.*'))
 Hashq_Files = [fp for fp in Hashq_Files if '.tmp' not in fp]
 Hashq_Files.sort()
 infile = Hashq_Files[fr]
 outpart = infile[-6:-3]
 sample_id = infile[infile.rfind('/') + 1:infile.index('.hashq')]
 tmpdir += str(fr) + '/'
 os.system('mkdir ' + tmpdir)
 G = [
     open('%s%s.%s.cols.%d' % (tmpdir, sample_id, outpart, i), 'w')
     for i in range(0, 2**hashobject.hash_size, 2**hashobject.hash_size /
                    50)
 ]
    seen = set()
    seen_add = seen.add
    return [x for x in array if not (x in seen or seen_add(x))]


# MAIN
if __name__ == "__main__":
    args = interface()

    # Normalize both directories so they end with '/'.
    input_dir = os.path.abspath(args.IN)
    if not input_dir.endswith('/'):
        input_dir += '/'

    output_dir = os.path.abspath(args.OUT)
    if not output_dir.endswith('/'):
        output_dir += '/'

    # 1-based scheduler rank -> 0-based task index.
    task_rank = args.task_rank - 1

    # Distinct sample prefixes (basename up to the first '.').
    FP = glob.glob(os.path.join(input_dir, '*.hashq.*'))
    FP = [fp[fp.rfind('/') + 1:] for fp in FP]
    FP = list(unique([fp[:fp.index('.')] for fp in FP]))
    # Ranks wrap around the samples: rank % n picks the sample and
    # rank // n picks which fraction of that sample's counts to build.
    file_prefix = FP[task_rank % len(FP)]

    print('Merging sample ' + file_prefix)

    # SUPER DUMB to hardcode the fraction size
    file_fraction = int(task_rank / len(FP))
    hashobject = Fastq_Reader(input_dir, output_dir)
    H = hashobject.hash_counts_from_hashq(file_prefix,
                                          multi_files_fraction=file_fraction)
from fastq_reader import Fastq_Reader

help_message = 'usage example: python merge_hashq_files.py -r 3 -i /project/home/hashed_reads/ -o /project/home/hashed_reads/'
if __name__ == "__main__":
	try:
		opts, args = getopt.getopt(sys.argv[1:],'hr:i:o:',["filerank=","inputdir=","outputdir="])
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-r','--filerank'):
			fr = int(arg) - 1
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
	FP = glob.glob(os.path.join(inputdir,'*.hashq.*'))
	FP = [fp[fp.rfind('/')+1:] for fp in FP]
	FP = list(set([fp[:fp.index('.')] for fp in FP]))
	file_prefix = FP[fr%len(FP)]
	# SUPER DUMB to hardcode the fraction size
	file_fraction = fr/len(FP)
	hashobject = Fastq_Reader(inputdir,outputdir)
	H = hashobject.hash_counts_from_hashq(file_prefix,multi_files_fraction=file_fraction)
import numpy as np
from fastq_reader import Fastq_Reader

help_message = 'usage example: python merge_hashq_files.py -r 3 -i /project/home/hashed_reads/ -o /project/home/hashed_reads/'
if __name__ == "__main__":
	try:
		opts, args = getopt.getopt(sys.argv[1:],'hr:i:o:',["filerank=","inputdir=","outputdir="])
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-r','--filerank'):
			fr = int(arg) - 1
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
	FP = glob.glob(os.path.join(inputdir,'*.hashq.*'))
	FP = [fp[fp.rfind('/')+1:] for fp in FP]
	FP = list(set([fp[:fp.index('.')] for fp in FP]))
	file_prefix = FP[fr]
	hashobject = Fastq_Reader(inputdir,outputdir)
	H = hashobject.merge_count_fractions(file_prefix)
	H = np.array(H,dtype=np.uint16)
	nz = np.nonzero(H)[0]
	np.save(hashobject.output_path+file_prefix+'.nonzero.npy',nz)
	print 'sample %s has %d nonzero elements and %d total observed kmers' % (file_prefix,len(nz),H.sum())
Example #22
0
            outputdir = arg
            if outputdir[-1] != '/':
                outputdir += '/'
        elif opt in ('-k', '--kmersize'):
            k_size = int(arg)
        elif opt in ('-s', '--hashsize'):
            h_size = int(arg)

    # Detect file format
    fastq = True
    suffix = 'fastq'
    FP = glob.glob(os.path.join(inputdir, '*.' + suffix + '.*'))
    if len(FP) == 0:
        # single file per-sample
        FP = glob.glob(os.path.join(inputdir, '*.' + suffix))
    if len(FP) == 0:
        suffix = 'fa'
        fastq = False
        FP = glob.glob(os.path.join(inputdir, '*.' + suffix))

    hashobject = Fastq_Reader(inputdir,
                              outputdir,
                              new_hash=(h_size, k_size),
                              fastq=fastq)
    total_rand_kmers = k_size * h_size * 2
    hashobject.rand_kmers_for_wheel(total_rand_kmers)
    hashobject.set_wheels(wheels=1)
    os.system('rm %s/random_kmers.fastq' % inputdir)
    f = open(outputdir + 'hashParts.txt', 'w')
    f.write('%d\n' % (2**h_size / 10**6 + 1))
    f.close()
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
	hashobject = Fastq_Reader(outputdir,outputdir)
	FP = glob.glob(os.path.join(inputdir,'*.fastq.*'))
	FP = set([fp[fp.rfind('/')+1:fp.index('.fastq')] for fp in FP])
	LibSizes = []
	for sample in FP:
		os.system('head -n600000 '+inputdir+sample+'.fastq.aa > '+outputdir+sample+'.fastq')
		sample_reads = hashobject.sort_read_pairs(sample)
		if sample_reads > 0:
			velvetdir = outputdir+sample+'_velvet/'
			os.system('mkdir '+velvetdir)
			os.system('/import/analysis/comp_bio/metagenomics/src/velvet/velveth '+velvetdir+' 31 -fastq -short '+outputdir+sample+'.singleton.fastq -shortPaired '+outputdir+sample+'.pairs.fastq')
			os.system('/import/analysis/comp_bio/metagenomics/src/velvet/velvetg '+velvetdir+' -exp_cov auto | grep -ir "Paired-end library 1 has length:" > '+velvetdir+'LibLengths')
			libsize = parse_velvet_log(velvetdir)
			if libsize != None:
				LibSizes.append((libsize,sample))
		os.system('rm -r '+outputdir+sample+'*')
help_message = "usage example: python create_hash.py -i /project/home/original_reads/ -o /project/home/hashed_reads/ -k kmer_size -s hash_size"
if __name__ == "__main__":
	try:
		opts, args = getopt.getopt(sys.argv[1:],'hi:o:k:s:',["inputdir=","outputdir=","kmersize=","hashsize="])
	except:
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
		elif opt in ('-k','--kmersize'):
			k_size = int(arg)
		elif opt in ('-s','--hashsize'):
			h_size = int(arg)
	hashobject = Fastq_Reader(inputdir,outputdir,new_hash=(h_size,k_size))
	total_rand_kmers = k_size*h_size*2
	hashobject.rand_kmers_for_wheel(total_rand_kmers)
	hashobject.set_wheels(wheels=1)
	os.system('rm %s/random_kmers.fastq' % inputdir)
	f = open(outputdir + 'hashParts.txt','w')
	f.write('%d\n' % (2**h_size/10**6 + 1))
	f.close()