コード例 #1
0
ファイル: drcluster.py プロジェクト: kulkarnik/DRCluster
    def __init__(self,args):
        """Store the parsed arguments and prepare the state for a run.

        Derives a base name from the FASTA path, makes sure an output
        directory is set (creating one named after the base name when none
        was given), picks the search executable path and reads the FASTA
        file.

        Args:
            args: Parsed argument object. The attributes read here are
                ``fasta``, ``directory``, ``search``, ``blastpath`` and
                ``hmmerpath``. ``directory`` is overwritten when it is falsy.

        Raises:
            OSError: If the default output directory does not exist and
                cannot be created.
        """
        # Store all arguments
        self.args = args
        # Base name: last "/"-separated component of the path, cut at its first "."
        self.base = self.args.fasta.split("/")[-1].split(".")[0]

        # Default output directory: "<base>/", relative to the current working
        # directory. It holds the parsed matrix, embedded coordinates and other data.
        if not self.args.directory:
            default_dir = "{}/".format(self.base)
            # Create first and tolerate an already-existing directory, rather
            # than checking beforehand: a directory that appears between a
            # check and the creation would otherwise abort the run.
            try:
                os.makedirs(self.base)
            except OSError:
                if not os.path.isdir(default_dir):
                    raise
            self.args.directory = default_dir

        # Assign bin file based on alignment type
        # NOTE(review): any other ``search`` value leaves ``self.exebin`` unset;
        # confirm the argument parser restricts the choices to these two.
        if self.args.search == 'blast':
            self.exebin = self.args.blastpath
        elif self.args.search == 'hmmer':
            self.exebin = self.args.hmmerpath

        # Read the points from the FASTA file (the original comment describes
        # the result as a dictionary; see lib/initrun.py for more details)
        self.points = initrun.read_fasta(args.fasta)
コード例 #2
0
ファイル: drcluster.py プロジェクト: kulkarnik/DRCluster
    def __init__(self, args):
        """Keep the parsed arguments and set up the working state for a run.

        The FASTA path yields a base name; when no output directory was
        supplied, a directory named after that base name is created (if
        needed) and stored back on ``args``. The search executable path is
        selected and the FASTA file is read.

        Args:
            args: Parsed argument object. This method reads ``fasta``,
                ``directory``, ``search``, ``blastpath`` and ``hmmerpath``,
                and overwrites ``directory`` when it is falsy.

        Raises:
            OSError: If the default output directory is missing and cannot
                be created.
        """
        # Store all arguments
        self.args = args
        # Everything after the last "/" and before the first "." of that part
        self.base = self.args.fasta.split("/")[-1].split(".")[0]

        # Without an explicit directory, data (parsed matrix, embedded
        # coordinates, ...) goes to "<base>/" under the current working directory.
        if not self.args.directory:
            fallback_dir = "{}/".format(self.base)
            try:
                # Attempt the creation directly; an exists-check followed by
                # makedirs can fail when the directory is created in between.
                os.makedirs(self.base)
            except OSError:
                # Only a directory that is really there makes the failure harmless.
                if not os.path.isdir(fallback_dir):
                    raise
            self.args.directory = fallback_dir

        # Assign bin file based on alignment type
        # NOTE(review): ``self.exebin`` is not set for any other ``search``
        # value; verify that the argument parser only allows these two.
        if self.args.search == 'blast':
            self.exebin = self.args.blastpath
        elif self.args.search == 'hmmer':
            self.exebin = self.args.hmmerpath

        # Read the points from the FASTA file (described as a dictionary by
        # the original comment; see lib/initrun.py for more details)
        self.points = initrun.read_fasta(args.fasta)
コード例 #3
0
def main():
    """Run the dimensionality-reduction pipeline from the command line.

    Steps, in order: parse the arguments, read the FASTA file and open the
    results file found under ``args.directory``, obtain a coordinate matrix
    (loaded from ``args.load``, loaded from the temp folder, or computed
    with MDS / t-SNE according to ``args.type``), save the coordinates under
    the temp folder and, when ``args.plot`` is set, plot them.

    Exits with status 2 when ``args.type`` is "sneonly" and there are more
    than 2000 sequences, or when ``args.type`` is not one of "mdsonly",
    "snemds", "snepca" or "sneonly" (previously the latter surfaced as a
    NameError when saving).
    """
    ## Get the starting time to measure the time of the run
    print("Running script...")
    # time.clock() was removed in Python 3.8; prefer perf_counter when it
    # exists and fall back to time.clock on interpreters that lack it.
    timer = getattr(time, "perf_counter", None) or time.clock
    t0 = timer()

    ## Obtain all the info from the arguments passed
    args = readargs.arg_parser()
    print("Parsed arguments")
    # If necessary, run BLAST locally (try to run on a faster machine instead)
    # initrun.run_blast_tab(queryname,dbname,outfile,fmt,dbsize,ecutoff)

    ## Define the paths of BLAST results file and names file
    resultsfile = args.directory + "/results.out"
    fastafile = args.directory + "/" + args.directory + ".fas"
    matrixpath = args.directory + "/temp/mds.hdf5"
    coordspath = args.directory + "/temp/" + args.directory + "_" + args.type + "_" + "coords.npy"
    # Only referenced by the disabled JSON export further down
    jsonpath = args.directory + "/temp/file.json"
    inity = args.directory + "/temp/inity.npy"

    if args.reinitialize:
        # Best effort: the file may legitimately not exist yet
        try:
            os.remove(inity)
        except OSError:
            pass

    ## Obtain colors and names from names file
    names, colors, lines, seqs = initrun.read_fasta(fastafile, args.format)
    print("Read FASTA file")
    ## Obtain the handle to the results file
    # NOTE(review): tabHandle is never closed in this function; confirm
    # whether the results parsers take care of that.
    tabParser, tabHandle = initrun.open_file(resultsfile)
    print("Opened file")

    ## Check if coordinates have already been mapped
    if args.load:
        print("Loading coordinates from path")
        matrix = np.load(args.load)
    elif args.precoordinated == True:
        # Explicit comparison kept: only a value equal to True selects this branch
        print("Loading coordinates from temp")
        matrix = np.load(coordspath)

    else:
        ## Check if results are preparsed
        ##
        ## If no, create an HDF5 formatted matrix and initialize
        ## Then, parse BLAST results and populate the HDF5 matrix
        ##
        ## If yes, then obtain populated matrix from file

        if args.preparsed:
            print("Retrieving matrix")
            hdfmat = initrun.get_matrix(matrixpath)
        else:
            hdfmat = initrun.create_matrix(args.value, names, matrixpath)
            print("Initialized matrix")

            print("Parsing results")
            # NOTE(review): any other format value skips parsing silently and
            # leaves the freshly created matrix as initialized.
            if args.format == 'mod':
                results_parser.next_line_modified_format(
                    args.value, tabParser, tabHandle, names, hdfmat)
            elif args.format == 'orig':
                results_parser.next_line_original_format(
                    args.value, tabParser, tabHandle, names, hdfmat)

        ## Run the appropriate dimensionality reduction algorithm
        ## -mdsonly = metric MDS with sklearn's manifold package
        ## -snemds = preprocess to "points/10" dimensions with MDS, then t-SNE for reduced matrix
        ## -snepca = preprocess to "points/10" dimensions with PCA, then t-SNE for reduced matrix

        ## (still working on -n = nystrom MDS with pycogent's approximate_mds package)
        if args.type == "mdsonly":
            print("Performing MDS")
            matrix = mds_calc.metric_mds(hdfmat, int(args.dimension))

        elif args.type == "snemds":
            print("Performing t-SNE with MDS preprocessing")

            ## Partially reduce dimensionality of HDF5 matrix to 1/10th of original size or maximum of 400
            tempred = min(int(len(names) / 10), 400)
            print("Preprocessing the data using MDS...")
            print("Reducing to %d dimensions" % tempred)

            tempmatrix = mds_calc.metric_mds(hdfmat, tempred)
            matrix = tsne_calc.tsne(inity,
                                    False,
                                    tempmatrix,
                                    no_dims=int(args.dimension),
                                    initial_dims=tempred)

        elif args.type == "snepca":
            print("Performing t-SNE with PCA preprocessing")

            ## Partially reduce dimensionality of HDF5 matrix to 1/10th of original size or maximum of 400
            tempred = min(int(len(names) / 10), 400)
            print("Reducing to %d dimensions" % tempred)

            # The second positional argument is True only for this variant
            # (presumably the PCA-preprocessing switch; see tsne_calc)
            matrix = tsne_calc.tsne(inity,
                                    True,
                                    hdfmat,
                                    no_dims=int(args.dimension),
                                    initial_dims=tempred)

        elif args.type == "sneonly":
            print("CAUTION: Performing t-SNE on full dissimilarity matrix")

            if len(names) > 2000:
                print("Too many proteins to perform t-SNE directly")
                sys.exit(2)
            matrix = tsne_calc.tsne(inity,
                                    False,
                                    hdfmat[...],
                                    no_dims=int(args.dimension),
                                    initial_dims=len(names))

        else:
            # Without this branch ``matrix`` stays unbound and the save below
            # fails with a NameError that hides the real cause.
            print("Unknown reduction type: %s" % args.type)
            sys.exit(2)

        # Disabled alternatives kept for reference:
        # model = TSNE(n_components=3,metric="precomputed")
        # matrix = model.fit_transform(hdfmat)
        #else: matrix = mds_calc.nystrom_frontend(len(names),math.sqrt(len(names)),2,mds_calc.getdist,hdfmat)

    # save coordinates to file
    np.save(coordspath, matrix)
    print("Took %s seconds" % (timer() - t0))

    # Disabled JSON export:
    #with open(jsonpath, 'w') as jsonout:
    #    json.dump(jsonconv.jsonmaker(colors,lines,matrix,args.format), jsonout, indent=2)

    ## Plot the results with matplotlib's PyPlot
    if args.plot:
        print("Plotting %d points" % len(names))
        if int(args.dimension) == 2:
            plotter.pyplotter2d(matrix, colors, names, seqs, args.directory)

        elif int(args.dimension) == 3:
            plotter.pyplotter3d(matrix, colors)
コード例 #4
0
def main():
    """Run the dimensionality-reduction pipeline from the command line.

    Parses the arguments, reads the FASTA file and opens the results file
    found under ``args.directory``, then obtains a coordinate matrix: loaded
    from ``args.load``, loaded from the temp folder, or computed from the
    dissimilarity matrix according to ``args.type``. The coordinates are
    saved under the temp folder and plotted when ``args.plot`` is set.

    Exits with status 2 when ``args.type`` is "sneonly" and there are more
    than 2000 sequences, or when ``args.type`` is not a handled reduction
    type (previously the latter surfaced as a NameError when saving).
    """
    ## Get the starting time to measure the time of the run
    print("Running script...")
    # time.clock() was removed in Python 3.8; prefer perf_counter when it
    # exists and fall back to time.clock on interpreters that lack it.
    timer = getattr(time, "perf_counter", None) or time.clock
    t0 = timer()

    ## Obtain all the info from the arguments passed
    args = readargs.arg_parser()
    print("Parsed arguments")

    # If necessary, run BLAST locally (try to run on a faster machine instead)
    # initrun.run_blast_tab(queryname,dbname,outfile,fmt,dbsize,ecutoff)

    ## Define the paths of BLAST results file and names file
    resultsfile = args.directory + "/results.out"
    fastafile = args.directory + "/" + args.directory + ".fas"
    matrixpath = args.directory + "/temp/mds.hdf5"
    coordspath = args.directory + "/temp/" + args.directory + "_" + args.type + "_" + "coords.npy"
    # Only referenced by the disabled JSON export further down
    jsonpath = args.directory + "/temp/file.json"
    inity = args.directory + "/temp/inity.npy"

    if args.reinitialize:
        # Best effort: the file may legitimately not exist yet
        try:
            os.remove(inity)
        except OSError:
            pass

    ## Obtain colors and names from names file
    names, colors, lines, seqs = initrun.read_fasta(fastafile, args.format)
    print("Read FASTA file")
    ## Obtain the handle to the results file
    # NOTE(review): tabHandle is never closed here; confirm whether the
    # results parsers take care of that.
    tabParser, tabHandle = initrun.open_file(resultsfile)
    print("Opened file")

    ## Check if coordinates have already been mapped
    if args.load:
        print("Loading coordinates from path")
        matrix = np.load(args.load)
    elif args.precoordinated == True:
        # Explicit comparison kept: only a value equal to True selects this branch
        print("Loading coordinates from temp")
        matrix = np.load(coordspath)
    else:
        hdfmat = _obtain_dissimilarity_matrix(
            args, names, matrixpath, tabParser, tabHandle)
        matrix = _embed_matrix(args, hdfmat, names, inity)

    # save coordinates to file
    np.save(coordspath, matrix)
    print("Took %s seconds" % (timer() - t0))

    # Disabled JSON export:
    #with open(jsonpath, 'w') as jsonout:
    #    json.dump(jsonconv.jsonmaker(colors,lines,matrix,args.format), jsonout, indent=2)

    ## Plot the results with matplotlib's PyPlot
    if args.plot:
        print("Plotting %d points" % len(names))
        if int(args.dimension) == 2:
            plotter.pyplotter2d(matrix, colors, names, seqs, args.directory)

        elif int(args.dimension) == 3:
            plotter.pyplotter3d(matrix, colors)


def _obtain_dissimilarity_matrix(args, names, matrixpath, tabParser, tabHandle):
    """Return the HDF5 matrix, either retrieved from file or freshly parsed.

    When ``args.preparsed`` is set the matrix is fetched from ``matrixpath``.
    Otherwise a new matrix is created there and filled from the results
    file by the parser matching ``args.format`` ('mod' or 'orig').
    """
    if args.preparsed:
        print("Retrieving matrix")
        return initrun.get_matrix(matrixpath)

    hdfmat = initrun.create_matrix(args.value, names, matrixpath)
    print("Initialized matrix")

    print("Parsing results")
    # NOTE(review): any other format value skips parsing silently and leaves
    # the freshly created matrix as initialized.
    if args.format == 'mod':
        results_parser.next_line_modified_format(
            args.value, tabParser, tabHandle, names, hdfmat)
    elif args.format == 'orig':
        results_parser.next_line_original_format(
            args.value, tabParser, tabHandle, names, hdfmat)
    return hdfmat


def _embed_matrix(args, hdfmat, names, inity):
    """Reduce ``hdfmat`` to ``int(args.dimension)`` dimensions per ``args.type``.

    Handled types:
      mdsonly - metric MDS with sklearn's manifold package
      snemds  - preprocess to "points/10" dimensions with MDS, then t-SNE
      snepca  - preprocess to "points/10" dimensions with PCA, then t-SNE
      sneonly - t-SNE on the full matrix (refused above 2000 sequences)
    (nystrom MDS with pycogent's approximate_mds package is still pending.)

    Exits with status 2 for an unhandled type or an oversized "sneonly" run.
    """
    if args.type == "mdsonly":
        print("Performing MDS")
        return mds_calc.metric_mds(hdfmat, int(args.dimension))

    if args.type == "snemds":
        print("Performing t-SNE with MDS preprocessing")

        ## Partially reduce dimensionality of HDF5 matrix to 1/10th of original size or maximum of 400
        tempred = min(int(len(names) / 10), 400)
        print("Preprocessing the data using MDS...")
        print("Reducing to %d dimensions" % tempred)

        tempmatrix = mds_calc.metric_mds(hdfmat, tempred)
        return tsne_calc.tsne(inity, False, tempmatrix,
                              no_dims=int(args.dimension), initial_dims=tempred)

    if args.type == "snepca":
        print("Performing t-SNE with PCA preprocessing")

        ## Partially reduce dimensionality of HDF5 matrix to 1/10th of original size or maximum of 400
        tempred = min(int(len(names) / 10), 400)
        print("Reducing to %d dimensions" % tempred)

        # The second positional argument is True only for this variant
        # (presumably the PCA-preprocessing switch; see tsne_calc)
        return tsne_calc.tsne(inity, True, hdfmat,
                              no_dims=int(args.dimension), initial_dims=tempred)

    if args.type == "sneonly":
        print("CAUTION: Performing t-SNE on full dissimilarity matrix")

        if len(names) > 2000:
            print("Too many proteins to perform t-SNE directly")
            sys.exit(2)
        return tsne_calc.tsne(inity, False, hdfmat[...],
                              no_dims=int(args.dimension), initial_dims=len(names))

    # Disabled alternatives kept for reference:
    # model = TSNE(n_components=3,metric="precomputed")
    # matrix = model.fit_transform(hdfmat)
    #else: matrix = mds_calc.nystrom_frontend(len(names),math.sqrt(len(names)),2,mds_calc.getdist,hdfmat)

    # No handled type matched: stop with a clear message instead of letting
    # the caller fail later on an unbound coordinate matrix.
    print("Unknown reduction type: %s" % args.type)
    sys.exit(2)