コード例 #1
0
import glob, os
import numpy as np
from streaming_eigenhashes import StreamingEigenhashes

help_message = "usage example: python kmer_corpus.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/"
def parse_corpus_args(argv):
    """Parse the command-line options of the k-mer corpus script.

    Args:
        argv: Option strings without the program name (``sys.argv[1:]``).

    Returns:
        Tuple ``(inputdir, outputdir, fr)``. Both directories end with "/";
        ``fr`` is the zero-based file index (the ``-r`` value minus one).

    Raises:
        SystemExit: status 0 after printing the usage text for -h/--help;
            status 2 after printing it when an option is unknown, malformed
            or missing; a message when the file rank is smaller than 1.
    """
    import getopt
    import sys

    try:
        # "help" is registered so that --help reaches the handler below
        # instead of being rejected as an unknown option.
        opts, _ = getopt.getopt(
            argv, "hi:o:r:", ["help", "inputdir=", "outputdir=", "filerank="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    inputdir = outputdir = fr = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-r", "--filerank"):
            try:
                fr = int(arg) - 1
            except ValueError:
                print(help_message)
                sys.exit(2)
        elif opt in ("-i", "--inputdir"):
            inputdir = arg
        elif opt in ("-o", "--outputdir"):
            outputdir = arg

    # All three options are required; an empty directory value counts as
    # missing rather than being turned into "/".
    if not inputdir or not outputdir or fr is None:
        print(help_message)
        sys.exit(2)
    # A rank below 1 would become a negative index and silently select a
    # file counted from the end of the list.
    if fr < 0:
        sys.exit("--filerank is 1-based and must be at least 1")

    if not inputdir.endswith("/"):
        inputdir += "/"
    if not outputdir.endswith("/"):
        outputdir += "/"
    return inputdir, outputdir, fr


if __name__ == "__main__":
    import sys

    inputdir, outputdir, fr = parse_corpus_args(sys.argv[1:])
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, "*.count.hash"))
    if fr >= len(Kmer_Hash_Count_Files):
        sys.exit("file rank %d requested but only %d '*.count.hash' files found in %s"
                 % (fr + 1, len(Kmer_Hash_Count_Files), hashobject.input_path))
    # column_mask.npy is not loaded; an empty list is passed as the mask.
    M = []
    hashobject.kmer_corpus_to_disk(Kmer_Hash_Count_Files[fr], mask=M)
コード例 #2
0
import numpy as np
from streaming_eigenhashes import StreamingEigenhashes

help_message = 'usage example: python kmer_clusters.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'
def parse_dir_options(argv):
    """Parse the -i/-o options of the global-weights script.

    Args:
        argv: Option strings without the program name (``sys.argv[1:]``).

    Returns:
        Tuple ``(inputdir, outputdir)``; both values end with '/'.

    Raises:
        SystemExit: status 0 after printing the usage text for -h/--help;
            status 2 after printing it when an option is unknown or when
            either directory is missing or empty.
    """
    import getopt
    import sys

    try:
        # 'help' is registered so that --help reaches the handler below
        # instead of being rejected as an unknown option.
        opts, _ = getopt.getopt(argv, 'hi:o:',
                                ["help", "inputdir=", "outputdir="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    inputdir = outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
        elif opt in ('-o', '--outputdir'):
            outputdir = arg

    # Both directories are required; failing here gives a usage message
    # instead of a NameError further down.
    if not inputdir or not outputdir:
        print(help_message)
        sys.exit(2)

    if not inputdir.endswith('/'):
        inputdir += '/'
    if not outputdir.endswith('/'):
        outputdir += '/'
    return inputdir, outputdir


if __name__ == "__main__":
    import glob
    import os
    import sys

    inputdir, outputdir = parse_dir_options(sys.argv[1:])
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(
        os.path.join(hashobject.input_path, '*.nonzero.npy'))
    corpus_generator = hashobject.corpus_idf_from_hash_paths(
        Kmer_Hash_Count_Files)
    hashobject.train_tfidf(corpus_generator)
    # train_tfidf is expected to have populated global_weights by now;
    # they are written next to the other outputs.
    np.save(hashobject.output_path + 'global_weights.npy',
            hashobject.global_weights)
コード例 #3
0
 except:
     print help_message
     sys.exit(2)
 for opt, arg in opts:
     if opt in ('-h', '--help'):
         print help_message
         sys.exit()
     elif opt in ('-i', '--inputdir'):
         inputdir = arg
         if inputdir[-1] != '/':
             inputdir += '/'
     elif opt in ('-o', '--outputdir'):
         outputdir = arg
         if outputdir[-1] != '/':
             outputdir += '/'
 hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=-1)
 FP = glob.glob(os.path.join(hashobject.output_path, '*.cluster.npy'))
 FP = [(int(fp[fp.rfind('/') + 1:fp.index('.cluster')]), fp) for fp in FP]
 I = np.load(hashobject.output_path + 'cluster_index.npy')
 I = I.shape[0]
 cluster_sizes = np.zeros(I, dtype=np.uint64)
 GW = np.load(hashobject.output_path + 'global_weights.npy')
 global_weight_sum = GW.sum(dtype=np.float64)
 CP = np.zeros(I)
 X = np.zeros((2**hashobject.hash_size, 5), dtype=np.int16)
 Ix = np.zeros(2**hashobject.hash_size, dtype=np.int8)
 for i, fp in FP:
     c = np.load(fp)
     CP[i] = GW[c].sum(dtype=np.float64) / global_weight_sum
     cluster_sizes[i] = c.shape[0]
     X[c, Ix[c]] = i + 1
コード例 #4
0
    return args


# MAIN
if __name__ == "__main__":
    # Script entry point: load the saved LSI model, set the clustering
    # threshold on the hash object, compute the cluster index and write the
    # index plus the size of its first dimension to the output directory.
    args = interface()

    # Absolute input/output directories, each forced to end with '/'.
    input_dir = os.path.abspath(args.IN)
    input_dir = input_dir if input_dir.endswith('/') else input_dir + '/'
    output_dir = os.path.abspath(args.OUT)
    output_dir = output_dir if output_dir.endswith('/') else output_dir + '/'

    thresh = args.threshold

    hashobject = StreamingEigenhashes(input_dir, output_dir, get_pool=-1)
    conditioned_pattern = os.path.join(hashobject.input_path, '*.count.hash.conditioned')
    Kmer_Hash_Count_Files = glob.glob(conditioned_pattern)

    # Map each position in the glob result to its file path.
    hashobject.path_dict = dict(enumerate(Kmer_Hash_Count_Files))

    lsi = models.LsiModel.load(hashobject.output_path + 'kmer_lsi.gensim')
    hashobject.cluster_thresh = thresh
    Index = hashobject.lsi_cluster_index(lsi)
    np.save(hashobject.output_path + 'cluster_index.npy', Index)
    print('Cluster index has shape: ' + str(Index.shape))
    with open(hashobject.output_path + 'numClusters.txt', 'w') as f:
        f.write('{0}\n'.format(Index.shape[0]))
コード例 #5
0
                        dest='task_rank',
                        type=int,
                        metavar='<task_rank>',
                        help='The rank of the currant task.')

    args = parser.parse_args()
    return args


def pick_file_for_task(paths, index):
    """Return ``paths[index]`` for a zero-based, non-negative index.

    Args:
        paths: Sequence of file paths.
        index: Zero-based position (the 1-based task rank minus one).

    Returns:
        The path stored at ``index``.

    Raises:
        IndexError: if ``index`` is negative or not smaller than
            ``len(paths)``. Negative values are rejected explicitly because
            plain indexing would wrap around and return a file counted from
            the end of the list.
    """
    if not 0 <= index < len(paths):
        raise IndexError(
            'task rank {0} is outside the valid range 1..{1}'.format(
                index + 1, len(paths)))
    return paths[index]


if __name__ == "__main__":
    args = interface()

    # Absolute input/output directories, each forced to end with '/'.
    input_dir = os.path.abspath(args.IN)
    if not input_dir.endswith('/'):
        input_dir += '/'

    output_dir = os.path.abspath(args.OUT)
    if not output_dir.endswith('/'):
        output_dir += '/'

    # Task ranks are 1-based on the command line; convert to a list index.
    task_rank = args.task_rank - 1

    hashobject = StreamingEigenhashes(input_dir, output_dir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(
        os.path.join(hashobject.input_path, '*.count.hash'))
    count_file = pick_file_for_task(Kmer_Hash_Count_Files, task_rank)
    # column_mask.npy is not loaded; an empty list is passed as the mask.
    M = []
    print("[KmerCorpus] Computing kmer corpus.")
    hashobject.kmer_corpus_to_disk(count_file, mask=M)
    print("[KmerCorpus] Done.")
コード例 #6
0
import sys, getopt
import glob, os
from gensim import corpora
import numpy as np
from streaming_eigenhashes import StreamingEigenhashes

help_message = 'usage example: python kmer_clusters.py -i /project/home/hashed_reads/ -o /project/home/cluster_vectors/'
def parse_directory_args(argv):
    """Parse the -i/-o options of the global-weights script.

    Args:
        argv: Option strings without the program name (``sys.argv[1:]``).

    Returns:
        Tuple ``(inputdir, outputdir)``; both values end with '/'.

    Raises:
        SystemExit: status 0 after printing the usage text for -h/--help;
            status 2 after printing it when an option is unknown or when
            either directory is missing or empty.
    """
    try:
        # 'help' is registered so that --help reaches the handler below
        # instead of being rejected as an unknown option.
        opts, _ = getopt.getopt(argv, 'hi:o:', ["help", "inputdir=", "outputdir="])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    inputdir = outputdir = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
        elif opt in ('-o', '--outputdir'):
            outputdir = arg

    # Both directories are required; failing here gives a usage message
    # instead of a NameError further down.
    if not inputdir or not outputdir:
        print(help_message)
        sys.exit(2)

    if not inputdir.endswith('/'):
        inputdir += '/'
    if not outputdir.endswith('/'):
        outputdir += '/'
    return inputdir, outputdir


if __name__ == "__main__":
    inputdir, outputdir = parse_directory_args(sys.argv[1:])
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=False)
    Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, '*.nonzero.npy'))
    corpus_generator = hashobject.corpus_idf_from_hash_paths(Kmer_Hash_Count_Files)
    hashobject.train_tfidf(corpus_generator)
    # train_tfidf is expected to have populated global_weights by now;
    # they are written next to the other outputs.
    np.save(hashobject.output_path + 'global_weights.npy', hashobject.global_weights)
コード例 #7
0
     if opt in ('-h', '--help'):
         print help_message
         sys.exit()
     elif opt in ('-i', '--inputdir'):
         inputdir = arg
         if inputdir[-1] != '/':
             inputdir += '/'
     elif opt in ('-o', '--outputdir'):
         outputdir = arg
         if outputdir[-1] != '/':
             outputdir += '/'
     elif opt in ('-r', '--filerank'):
         fr = int(arg) - 1
     elif opt in ('-t', '--thresh'):
         thresh = float(arg)
 hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=-1)
 Kmer_Hash_Count_Files = glob.glob(
     os.path.join(hashobject.input_path, '*.count.hash.conditioned'))
 hashobject.path_dict = {}
 for i in range(len(Kmer_Hash_Count_Files)):
     hashobject.path_dict[i] = Kmer_Hash_Count_Files[i]
 lsi = models.LsiModel.load(hashobject.output_path + 'kmer_lsi.gensim')
 Index = np.load(hashobject.output_path + 'cluster_index.npy')
 i = fr * 10**6
 o = (i, min(10**6, 2**hashobject.hash_size - i))
 hashobject.cluster_thresh = thresh
 Ci = hashobject.lsi_cluster_part(o, lsi, Index)
 for ci, c in enumerate(Ci):
     try:
         np.save(hashobject.output_path + str(ci) + '/' + str(fr) + '.npy',
                 c)
コード例 #8
0
        if opt in ('-h', '--help'):
            print help_message
            sys.exit()
        elif opt in ('-i', '--inputdir'):
            inputdir = arg
            if inputdir[-1] != '/':
                inputdir += '/'
        elif opt in ('-o', '--outputdir'):
            outputdir = arg
            if outputdir[-1] != '/':
                outputdir += '/'
        elif opt in ('-p', '--numproc'):
            num_proc = int(arg)
        elif opt in ('-s', '--single'):
            singleInstance = True
    ### use -p option for multiprocessing
    num_proc = -1
    ###
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=num_proc)
    Kmer_Hash_Count_Files = glob.glob(
        os.path.join(hashobject.input_path, '*.count.hash.conditioned'))
    hashobject.path_dict = {}
    for i in range(len(Kmer_Hash_Count_Files)):
        hashobject.path_dict[i] = Kmer_Hash_Count_Files[i]
    corpus = hashobject.kmer_corpus_from_disk()
    # This is a hack. Should do a better job choosing num_dims
    lsi = hashobject.train_kmer_lsi(corpus,
                                    num_dims=len(hashobject.path_dict) * 4 / 5,
                                    single=singleInstance)
    lsi.save(hashobject.output_path + 'kmer_lsi.gensim')
コード例 #9
0
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print help_message
            sys.exit()
        elif opt in ("-i", "--inputdir"):
            inputdir = arg
            if inputdir[-1] != "/":
                inputdir += "/"
        elif opt in ("-o", "--outputdir"):
            outputdir = arg
            if outputdir[-1] != "/":
                outputdir += "/"
        elif opt in ("-r", "--filerank"):
            fr = int(arg) - 1
        elif opt in ("-t", "--thresh"):
            thresh = float(arg)
    hashobject = StreamingEigenhashes(inputdir, outputdir, get_pool=-1)
    Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path, "*.count.hash.conditioned"))
    hashobject.path_dict = {}
    for i in range(len(Kmer_Hash_Count_Files)):
        hashobject.path_dict[i] = Kmer_Hash_Count_Files[i]
    lsi = models.LsiModel.load(hashobject.output_path + "kmer_lsi.gensim")
    hashobject.cluster_thresh = thresh
    Index = hashobject.lsi_cluster_index(lsi)
    np.save(hashobject.output_path + "cluster_index.npy", Index)
    print "cluster index has shape:", Index.shape
    f = open(hashobject.output_path + "numClusters.txt", "w")
    f.write("%d\n" % Index.shape[0])
    f.close()
コード例 #10
0
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
		elif opt in ('-r','--filerank'):
			fr = int(arg) - 1
		elif opt in ('-t','--thresh'):
			thresh = float(arg)
	hashobject = StreamingEigenhashes(inputdir,outputdir,get_pool=-1)
	Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path,'*.count.hash.conditioned'))
	hashobject.path_dict = {}
	for i in range(len(Kmer_Hash_Count_Files)):
		hashobject.path_dict[i] = Kmer_Hash_Count_Files[i]
	lsi = models.LsiModel.load(hashobject.output_path+'kmer_lsi.gensim')
	Index = np.load(hashobject.output_path+'cluster_index.npy')
	i = fr*10**6
	o = (i,min(10**6,2**hashobject.hash_size-i))
	hashobject.cluster_thresh = thresh
	Ci = hashobject.lsi_cluster_part(o,lsi,Index)
	for ci,c in enumerate(Ci):
		try:
			np.save(hashobject.output_path+str(ci)+'/'+str(fr)+'.npy',c)
		except IOError:
			os.system('mkdir '+hashobject.output_path+str(ci))
コード例 #11
0
		print help_message
		sys.exit(2)
	for opt, arg in opts:
		if opt in ('-h','--help'):
			print help_message
			sys.exit()
		elif opt in ('-i','--inputdir'):
			inputdir = arg
			if inputdir[-1] != '/':
				inputdir += '/'
		elif opt in ('-o','--outputdir'):
			outputdir = arg
			if outputdir[-1] != '/':
				outputdir += '/'
		elif opt in ('-p','--numproc'):
			num_proc = int(arg)
		elif opt in ('-s','--single'):
			singleInstance = True
	### use -p option for multiprocessing
	num_proc = -1
	###
	hashobject = StreamingEigenhashes(inputdir,outputdir,get_pool=num_proc)
	Kmer_Hash_Count_Files = glob.glob(os.path.join(hashobject.input_path,'*.count.hash.conditioned'))
	hashobject.path_dict = {}
	for i in range(len(Kmer_Hash_Count_Files)):
		hashobject.path_dict[i] = Kmer_Hash_Count_Files[i]
	corpus = hashobject.kmer_corpus_from_disk()
	# This is a hack. Should do a better job choosing num_dims
	lsi = hashobject.train_kmer_lsi(corpus,num_dims=len(hashobject.path_dict)*4/5,single=singleInstance)
	lsi.save(hashobject.output_path+'kmer_lsi.gensim')