Esempio n. 1
0
    def load_scores(self, gene_family):
        """Populate ``gene_family.scores`` from a Clustal Omega distance matrix.

        The sequences of ``gene_family`` are written to
        ``<seq_file>.dealigned.fa`` and ``clustalo`` is run with
        ``--percent-id --full`` so that it writes a matrix to
        ``<seq_file>.clustal.pctid``.  That matrix is then re-ordered so that
        row/column ``i`` corresponds to ``gene_family.sequences[i]``.

        Families with two sequences or fewer are not sent to clustalo: every
        entry of their (possibly empty) matrix is simply set to 1.

        If the matrix file already exists and ``"use_cache"`` is present in
        ``self.other_args``, clustalo is not re-run.

        Args:
            gene_family: object exposing ``sequences`` (items with a ``name``)
                and ``seq_file`` (path prefix for the generated files).  Its
                ``scores`` attribute is overwritten; nothing is returned.
        """
        # Local import so the block does not depend on the file's import header.
        from shlex import quote

        sequences = gene_family.sequences
        nbseqs = len(sequences)

        #cluster won't work if nbseqs <= 2.  In this case, we'll just throw out some scores
        # (this also covers an empty family, which yields an empty matrix)
        if nbseqs <= 2:
            gene_family.scores = [[1] * nbseqs for _ in range(nbseqs)]
            return

        cldealignedfile = gene_family.seq_file + ".dealigned.fa"
        Sequence.outputSequencesToFasta(sequences, cldealignedfile)

        clfile = gene_family.seq_file + ".clustal.fa"
        pfile = gene_family.seq_file + ".clustal.pctid"

        # Paths are shell-quoted: the command goes through the shell, so a
        # path containing spaces or metacharacters would otherwise break it.
        cmd = ("clustalo -i " + quote(cldealignedfile) + " -o " + quote(clfile) +
               " --distmat-out=" + quote(pfile) + " --percent-id --full")

        print("EXEC " + cmd)
        if not os.path.isfile(pfile) or "use_cache" not in self.other_args:
            os.system(cmd)
        else:
            print("Actually, file exists and we'll use it.")

        # NOTE(review): the meaning of the "count" / " " arguments is defined by
        # Distances.readMatrixFile, which is not visible here.
        d = Distances.readMatrixFile(pfile, "count", " ")
        matrix = d["matrix"]

        # Map each label of the matrix file to its row/column index.
        genes_pos = {gene: pos for pos, gene in enumerate(d["labels"])}

        # Position in the clustalo matrix of each family sequence, in family order.
        order = [genes_pos[seq.name] for seq in sequences]

        gene_family.scores = [[matrix[row][col] for col in order]
                              for row in order]
Esempio n. 2
0
    def load_scores(self, gene_family):
        """Populate ``gene_family.scores`` from an FSA alignment of the family.

        The sequences of ``gene_family`` are written to
        ``<seq_file>.dealigned.fa``, aligned with the external ``fsa`` program
        into ``<seq_file>.fsa.fa``, read back, and handed to
        ``Distances.getPairwisePctID`` (with ``run_nw_algorithm=False``), whose
        result becomes ``gene_family.scores``.

        The existing alignment file is reused only when it exists,
        ``"use_cache"`` is present in ``self.other_args`` and the file is
        larger than 10 bytes; otherwise ``fsa`` is (re-)run.

        Args:
            gene_family: object exposing ``sequences`` and ``seq_file`` (path
                prefix for the generated files).  Its ``scores`` attribute is
                overwritten; nothing is returned.
        """
        # Local import so the block does not depend on the file's import header.
        from shlex import quote

        dealignedfile = gene_family.seq_file + ".dealigned.fa"
        Sequence.outputSequencesToFasta(gene_family.sequences, dealignedfile)

        fsafile = gene_family.seq_file + ".fsa.fa"

        # Paths are shell-quoted: the command (including the redirection) is
        # interpreted by the shell, so unquoted spaces would split the path.
        cmd = "fsa " + quote(dealignedfile) + " > " + quote(fsafile)

        print("EXEC " + cmd)

        # A failed run still leaves a (near-)empty file behind because of the
        # redirection, hence the size check before trusting the cache.
        cache_usable = (os.path.isfile(fsafile)
                        and "use_cache" in self.other_args
                        and os.path.getsize(fsafile) > 10)

        if cache_usable:
            print("Actually, file exists and we'll use it.")
        else:
            os.system(cmd)

        seqs = Sequence.readSequences(fsafile)

        gene_family.scores = Distances.getPairwisePctID(sequences=seqs,
                                                        verbose=False,
                                                        run_nw_algorithm=False)
Esempio n. 3
0
#(this happens with simphy)
#so here we copy the alignment files, but rename the genes with their file index
# Comma-separated list of the renamed copies (stays empty when no renaming is done).
newinfiles = ""
files_list = infiles.split(",")

# Maps every file actually used downstream to the original input file it came from.
newfile_to_old_file = {}

if rename_genes:
    print("Copying alignment files, ensuring gene name uniqueness...")
    renamed_files = []
    for i, f in enumerate(files_list):
        # Split the *basename* once.  Using str.replace(ext, ...) would rewrite
        # every occurrence of the extension text and, for a file without
        # extension (ext == ""), insert the suffix between every character.
        filename, ext = os.path.splitext(os.path.basename(f))
        newfile = join(workdir, filename + "_" + str(i) + ".fa")

        # The file index is passed along so gene names can be made unique per file.
        sequences = Sequence.readSequences(f)
        Sequence.outputSequencesToFasta(sequences, newfile, str(i), True)

        renamed_files.append(newfile)
        newfile_to_old_file[newfile] = f

    newinfiles = ",".join(renamed_files)
    infiles = newinfiles
else:
    # No renaming: every input file maps to itself.
    for f in files_list:
        newfile_to_old_file[f] = f

#############################################################################################
# If a fullmode is set, we just call the fullmode class, which, by definition of full, takes care of every step
Esempio n. 4
0
    def predict_orthologs(self, files, workdir, speciestree_file):
        """Run OMA on the given sequence files and record its translated output.

        Steps:
          1. Write one FASTA file per species into ``<workdir>/DB``.
          2. If a species tree is given, restrict it to the species present
             using the external ``OCR`` tool.  If the restricted tree contains
             a single species, OMA is skipped and every gene becomes its own
             cluster, all gene pairs being reported as paralogs.
          3. Inside ``workdir``, generate OMA's default ``parameters.drw``,
             patch its ``InputDataType`` / ``SpeciesTree`` entries and run OMA
             (unless cached output is used).
          4. Translate OMA's output through ``self.parse_groups`` and
             ``self.parse_relations``.

        On return ``self.clusters_filenames`` and ``self.relations_filenames``
        are one-element lists pointing at ``<workdir>/oma.clusters`` and
        ``<workdir>/oma.relations``.

        Args:
            files: sequence files, forwarded to
                ``Sequence.combineSequenceFilesBySpecies``.
            workdir: directory in which all intermediate and output files live.
            speciestree_file: path of a newick species tree, or ``""`` for none.

        ``self.other_args`` keys read here: ``use_cache``,
        ``species_separator``, ``species_index``, ``species_prefix``,
        ``convertToAA``, ``seqtype``.
        """
        runOMA = True

        if os.path.isfile(join(workdir, "Output/OrthologousGroups.txt")
                          ) and "use_cache" in self.other_args:
            print("Will use cached output files from oma")
            runOMA = False

        #quite a few preprocessing steps are needed for OMA.
        #first, we create  directories and files required.
        dbdir = join(workdir, "DB")
        if not os.path.exists(dbdir):
            os.mkdir(dbdir)

        #OMA requires one file per species.  We split our files accordingly here.
        seqs_by_species = Sequence.combineSequenceFilesBySpecies(
            files, self.other_args["species_separator"],
            int(self.other_args["species_index"]))

        if speciestree_file == "":
            print("OMA is unpredicatable without a species tree.")

        spprefix = self.other_args.get("species_prefix", "") \
            if hasattr(self.other_args, "get") else (
                self.other_args["species_prefix"]
                if "species_prefix" in self.other_args else "")

        # The mere presence of the key enables the conversion.
        doAA = "convertToAA" in self.other_args

        species_names = []
        genes_list = []
        for key in seqs_by_species:
            outfile = join(dbdir, spprefix + key + ".fa")
            Sequence.outputSequencesToFasta(sequences=seqs_by_species[key],
                                            filename=outfile,
                                            name_suffix="",
                                            aligned=False,
                                            convertToAA=doAA,
                                            name_prefix=spprefix)

            species_names.append(key)
            genes_list.extend(seq.name for seq in seqs_by_species[key])

        species_list = ",".join(species_names)

        #if we use a species tree, OMA requires removing species not appearing in the files
        #sgutils can restrict the species tree to a subset of species.
        if speciestree_file != "":
            oma_sptree_file = join(workdir, "oma_species_tree.nw")

            with open(speciestree_file, 'r') as myfile:
                speciestree_newick = myfile.read().replace('\n', '')

            cmd = "OCR -m restrict_species_tree -l \"" + species_list + "\" -s \"" + speciestree_newick + "\" -o \"" + oma_sptree_file + "\""
            print("EXEC " + cmd)
            os.system(cmd)

            with open(oma_sptree_file) as f:
                speciestree_newick_restricted = f.readline().replace("\n", "")

            ################################################################
            #special case here: if only one species present, oma makes an error.
            #In this case, we make 1 gene = 1 cluster
            if "," not in speciestree_newick_restricted:
                print("Only one species found.  Will make all genes paralogs.")
                self.clusters_filenames = [join(workdir, "oma.clusters")]
                self.relations_filenames = [join(workdir, "oma.relations")]

                write_clusters(self.clusters_filenames[0],
                               [[g] for g in genes_list])

                # One "gene1<TAB>gene2<TAB>Paralogs" record per unordered pair,
                # records separated by ";;" (no trailing separator).
                nbgenes = len(genes_list)
                relations = [
                    genes_list[a] + "\t" + genes_list[b] + "\t" + "Paralogs"
                    for a in range(nbgenes)
                    for b in range(a + 1, nbgenes)
                ]
                with open(self.relations_filenames[0], 'w') as f:
                    f.write(";;".join(relations))

                return
            ################################################################

        input_type = "DNA"
        if "seqtype" in self.other_args and self.other_args["seqtype"] == "AA":
            print("Input type set to AA")
            input_type = self.other_args["seqtype"]

        #now, get into oma dir and execute it
        cwd = os.getcwd()

        os.chdir(workdir)
        try:
            os.system("rm parameters.drw")
            os.system("OMA -p")  #this creates a default config file.

            #we must edit the config to put "DNA", and give the species tree restricted to the genes at hand.
            cfg_lines = []
            with open("parameters.drw") as f:
                for line in f:
                    line = line.replace("\n", "")
                    if line.startswith("InputDataType"):
                        line = "InputDataType := '" + input_type + "';"
                    elif line.startswith("SpeciesTree") and speciestree_file != "":
                        line = "SpeciesTree := '" + speciestree_newick_restricted + "';"

                    cfg_lines.append(line + "\n")

            with open("parameters.drw", 'w') as f:
                f.write("".join(cfg_lines))
            print("Config edited")

            cmd = "OMA -n 7"
            print("EXEC " + cmd)

            if runOMA:
                os.system(cmd)
            else:
                print("Not really - using cache instead.")
        finally:
            # Always restore the caller's working directory, even if the
            # config file is missing or unreadable.
            os.chdir(cwd)

        #now that the inference is done, we translate the oma output into our format.
        #the result is a oma.clusters file and a oma.relations file.

        self.parse_groups(genes_list, workdir)
        self.parse_relations(genes_list, workdir)

        self.clusters_filenames = [join(workdir, "oma.clusters")]
        self.relations_filenames = [join(workdir, "oma.relations")]
Esempio n. 5
0
    def predict_orthologs(self, files, workdir, speciestree_file):
        """Cluster genes with the OrthoMCL pipeline and write clusters/relations.

        Unless ``self.cached_clusters`` is already filled, the sequences are
        split into one FASTA file per species under ``<workdir>/in``, the
        external orthomcl-pipeline is run with ``<workdir>/out`` as output
        directory, and ``out/groups/groups.txt`` is parsed into a list of sets
        of gene names.  Genes that appear in no group become singleton
        clusters.  The result is stored in ``self.cached_clusters`` and reused
        on later calls.

        Two files are then written, and their paths stored as one-element
        lists in ``self.clusters_filenames`` / ``self.relations_filenames``:

        * ``<workdir>/orthomcl.clusters`` via ``write_clusters``;
        * ``<workdir>/orthomcl.relations``: records of the form
          ``gene1<TAB>gene2<TAB>relation`` separated by ``;;``.  Two genes of
          the same cluster are ``Orthologs`` when their species differ and
          ``Paralogs`` otherwise; every remaining pair of input sequences is
          reported as ``Paralogs``.

        Args:
            files: sequence files to read.
            workdir: directory in which all intermediate and output files live.
            speciestree_file: unused by this method.

        ``self.other_args`` keys read here: ``species_separator``,
        ``species_index``, ``seqtype``.
        """
        workdir_seqs = join(workdir, "in")
        workdir_out = join(workdir, "out")

        allseqs = []
        for f in files:
            allseqs.extend(Sequence.readSequences(f))

        #one file per species, see OMA comments above
        if len(self.cached_clusters) == 0:
            seqs_by_species = Sequence.combineSequenceFilesBySpecies(
                files, self.other_args["species_separator"],
                int(self.other_args["species_index"]))

            isdna = True
            if "seqtype" in self.other_args and self.other_args[
                    "seqtype"] == "AA":
                print("Input type set to AA")
                isdna = False

            if not os.path.exists(workdir_seqs):
                os.mkdir(workdir_seqs)
            for key in seqs_by_species:
                outfile = join(workdir_seqs, key + ".fasta")
                # DNA input is converted to amino acids; AA input is written as is.
                Sequence.outputSequencesToFasta(sequences=seqs_by_species[key],
                                                filename=outfile,
                                                name_suffix="",
                                                aligned=False,
                                                convertToAA=isdna,
                                                name_prefix="")

            cmd = "/u/lafonman/src/orthomcl-pipeline/bin/orthomcl-pipeline -i " + workdir_seqs + " -o " + workdir_out + " -m /u/lafonman/src/orthomcl-pipeline/orthomcl.conf --nocompliant --yes"
            print("EXEC " + cmd)
            os.system(cmd)

            seen_genes = set()
            clusters = []
            clfile = join(workdir_out, "groups/groups.txt")
            with open(clfile, 'r') as groups_file:
                for line in groups_file:
                    line = line.replace("\n", "")
                    if line != "":
                        # Members follow the first ':' and are whitespace
                        # separated; the gene name is the part after the
                        # first '|' of each member.
                        members = line.split(":")[1].split()
                        cluster = {g.split("|")[1] for g in members}
                        seen_genes.update(cluster)
                        clusters.append(cluster)

            # Genes absent from every group get a cluster of their own.
            for s in allseqs:
                if s.name not in seen_genes:
                    clusters.append({s.name})

            self.cached_clusters = clusters
        else:
            print("USING CACHED CLUSTERS")
            clusters = self.cached_clusters

        self.clusters_filenames = [join(workdir, "orthomcl.clusters")]
        self.relations_filenames = [join(workdir, "orthomcl.relations")]

        write_clusters(self.clusters_filenames[0], clusters)

        def species_of(gene_name):
            # Species is one field of the gene name, as configured in other_args.
            return gene_name.split(self.other_args["species_separator"])[int(
                self.other_args["species_index"])]

        #output relations
        # Records are collected in a list and joined once, so that exactly one
        # ";;" separates consecutive records and none trails the output.
        records = []
        seen_pairs = set()
        for c in clusters:
            members = list(c)
            for pos, c1 in enumerate(members):
                for c2 in members[pos + 1:]:
                    if c1 == c2:
                        continue

                    pair = frozenset((c1, c2))
                    if pair in seen_pairs:
                        continue
                    seen_pairs.add(pair)

                    if species_of(c1) != species_of(c2):
                        relation = "Orthologs"
                    else:
                        relation = "Paralogs"
                    records.append(c1 + "\t" + c2 + "\t" + relation)

        # Every pair of sequences not related through a cluster is a paralog pair.
        for i in range(len(allseqs)):
            n1 = allseqs[i].name
            for j in range(i + 1, len(allseqs)):
                n2 = allseqs[j].name
                if frozenset((n1, n2)) not in seen_pairs:
                    records.append(n1 + "\t" + n2 + "\t" + "Paralogs")

        with open(self.relations_filenames[0], 'w') as relations_file:
            relations_file.write(";;".join(records))