def parseOutput(self):
    # Parse either the jackhmmer or BLAST output
    matrixpath = "{}/sparsedata.txt".format(self.args.directory)
    if not self.args.preparsed:
        tabParser, tabHandle = initrun.open_file(self.args.alignfile)
        row, col, data = results_parser.next_line_original_format(
            self.args.value, tabParser, tabHandle, self.points, self.args.search)
        savemat = np.vstack((row, col, data))
        np.savetxt(matrixpath, savemat)
    else:
        savemat = initrun.get_matrix(matrixpath)
        row = savemat[0]
        col = savemat[1]
        data = savemat[2]
    scipymat = sparse.coo_matrix((data, (row, col)),
                                 shape=(len(self.points), len(self.points)))
    return scipymat
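
## A minimal sketch (not part of the original pipeline) of how a caller might turn
## the sparse COO matrix returned by parseOutput() into the dense, symmetric
## dissimilarity matrix that metric MDS expects. The helper name and the
## max-based symmetrization rule are illustrative assumptions, not project API.
def _coo_to_symmetric_dense(scipymat):
    ## Densify the parsed hits; a pairwise search may report only one direction of
    ## each (query, hit) pair, so keep the larger of the (i, j) / (j, i) entries.
    dense = scipymat.toarray()
    return np.maximum(dense, dense.T)
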
def main():
    ## Get the starting time to measure the time of the run
    print "Running script..."
    t0 = time.clock()

    ## Obtain all the info from the arguments passed
    args = readargs.arg_parser()
    print "Parsed arguments"

    """
    If necessary, run BLAST locally (try to run on a faster machine instead)
    #initrun.run_blast_tab(queryname,dbname,outfile,fmt,dbsize,ecutoff)
    """

    ## Define the paths of the BLAST results file and the names (FASTA) file
    resultsfile = args.directory + "/results.out"
    fastafile = args.directory + "/" + args.directory + ".fas"
    matrixpath = args.directory + "/temp/mds.hdf5"
    coordspath = args.directory + "/temp/" + args.directory + "_" + args.type + "_" + "coords.npy"
    jsonpath = args.directory + "/temp/file.json"
    inity = args.directory + "/temp/inity.npy"

    if args.reinitialize:
        try:
            os.remove(inity)
        except OSError:
            pass

    ## Obtain colors and names from the names file
    names, colors, lines, seqs = initrun.read_fasta(fastafile, args.format)
    print "Read FASTA file"

    ## Obtain the handle to the results file
    tabParser, tabHandle = initrun.open_file(resultsfile)
    print "Opened file"

    ## Check if coordinates have already been mapped
    if args.load:
        print "Loading coordinates from path"
        matrix = np.load(args.load)
    elif args.precoordinated:
        print "Loading coordinates from temp"
        matrix = np.load(coordspath)
    else:
        ## Check if the results are preparsed.
        ## If not, create and initialize an HDF5-formatted matrix, then parse the
        ## BLAST results and populate it.
        ## If so, load the populated matrix from file.
        if args.preparsed:
            print "Retrieving matrix"
            hdfmat = initrun.get_matrix(matrixpath)
        else:
            hdfmat = initrun.create_matrix(args.value, names, matrixpath)
            print "Initialized matrix"
            print "Parsing results"
            if args.format == 'mod':
                results_parser.next_line_modified_format(
                    args.value, tabParser, tabHandle, names, hdfmat)
            elif args.format == 'orig':
                results_parser.next_line_original_format(
                    args.value, tabParser, tabHandle, names, hdfmat)

        ## Run the appropriate dimensionality reduction algorithm:
        ## -mdsonly = metric MDS with sklearn's manifold package (see the sketch after main())
        ## -snemds  = preprocess to "points/10" dimensions with MDS, then t-SNE on the reduced matrix
        ## -snepca  = preprocess to "points/10" dimensions with PCA, then t-SNE on the reduced matrix
        ## (still working on -n = Nystrom MDS with pycogent's approximate_mds package)
        if args.type == "mdsonly":
            print "Performing MDS"
            matrix = mds_calc.metric_mds(hdfmat, int(args.dimension))
        elif args.type == "snemds":
            print "Performing t-SNE with MDS preprocessing"
            ## Partially reduce the dimensionality of the HDF5 matrix to 1/10th of
            ## the original size, or to a maximum of 400 dimensions
            tempred = min(int(len(names) / 10), 400)
            print "Preprocessing the data using MDS..."
            print "Reducing to", tempred, "dimensions"
            tempmatrix = mds_calc.metric_mds(hdfmat, tempred)
            matrix = tsne_calc.tsne(inity, False, tempmatrix,
                                    no_dims=int(args.dimension), initial_dims=tempred)
        elif args.type == "snepca":
            print "Performing t-SNE with PCA preprocessing"
            ## Partially reduce the dimensionality of the HDF5 matrix to 1/10th of
            ## the original size, or to a maximum of 400 dimensions
            tempred = min(int(len(names) / 10), 400)
            print "Reducing to", tempred, "dimensions"
            matrix = tsne_calc.tsne(inity, True, hdfmat,
                                    no_dims=int(args.dimension), initial_dims=tempred)
        elif args.type == "sneonly":
            print "CAUTION: Performing t-SNE on full dissimilarity matrix"
            if len(names) > 2000:
                print "Too many proteins to perform t-SNE directly"
                sys.exit(2)
            matrix = tsne_calc.tsne(inity, False, hdfmat[...],
                                    no_dims=int(args.dimension), initial_dims=len(names))
            # model = TSNE(n_components=3,metric="precomputed")
            # matrix = model.fit_transform(hdfmat)
        # else: matrix = mds_calc.nystrom_frontend(len(names), math.sqrt(len(names)), 2,
        #                                          mds_calc.getdist, hdfmat)

        ## Save the coordinates to file
        np.save(coordspath, matrix)

    print "Took", time.clock() - t0, "seconds"

    # with open(jsonpath, 'w') as jsonout:
    #     json.dump(jsonconv.jsonmaker(colors, lines, matrix, args.format), jsonout, indent=2)

    ## Plot the results with matplotlib's PyPlot
    if args.plot:
        print "Plotting", len(names), "points"
        if int(args.dimension) == 2:
            plotter.pyplotter2d(matrix, colors, names, seqs, args.directory)
        elif int(args.dimension) == 3:
            plotter.pyplotter3d(matrix, colors)