Esempio n. 1
0
def start_app_from_args(args):
    """Fill the module-level databases and tool settings from parsed arguments.

    Reads ``args.tmp``, ``args.clustalo.name``, ``args.databases`` and
    ``args.genomes``.  Every database is stored in a module global so the
    rest of the application can reach it; nothing is returned.
    """
    global homDB, genomDB, xrefDB, opDB, sorfDB, pfamDB, tssDB
    global tmpfolder, clustalobin

    tmpfolder = args.tmp
    clustalobin = args.clustalo.name

    # All data files live below the directory given by ``args.databases``.
    dbRoot = args.databases
    homdbDir = dbRoot + "/homdb/"
    sharmaDir = dbRoot + "/sharma/"

    homDB = HomologyDatabase.loadFromFile(homdbDir + "/hpdb_full_new")
    xrefDB = XRefDatabase(gobo=dbRoot + "/obos/go.obo",
                          fileName=homdbDir + "/hpdb_full_xref")
    opDB = OperonDB.from_cs_operons(sharmaDir + "operons.xlsx")
    tssDB = TSSDB.from_cs_tss(sharmaDir + "tss.xlsx")
    sorfDB = SORFDB.from_cs_sorfs(sharmaDir + "sorfs.xlsx")
    pfamDB = PfamResultDB.from_folder(dbRoot + "/pfam/")

    # Genomes are loaded on demand: only those named in the homology database.
    genomDB = GenomeDB(args.genomes, loadAll=False)
    for organism in homDB.get_all_organisms():
        genomDB.loadGenome(organism)
Esempio n. 2
0
    def __init__(self, basePath, inputFormat="embl", inputExtension='.gb'):
        """Remember the working directory and create the three databases.

        ``basePath`` is stored and passed to ``GenomeDB``; ``inputFormat`` and
        ``inputExtension`` are forwarded to it as ``fileFormat`` and
        ``fileExtension``.  The extension is also kept on the instance.
        """
        self.basePath = basePath
        self.genomeInputExtension = inputExtension

        genomeOptions = {
            'fileFormat': inputFormat,
            'fileExtension': inputExtension,
        }
        self.genomeDB = GenomeDB(self.basePath, **genomeOptions)

        # Both start out empty.
        self.homolDB = HomologyDatabase()
        self.geneDupDB = GeneDuplicationDB()
Esempio n. 3
0
    def load_organism(self, fp, orgGenomeDB=None):
        """Register gene duplications found in a genome's self-alignment file.

        Each line of ``fp`` is parsed with ``DiamondResult.from_line``; a hit
        between two different sequences of the same genome is recorded via
        ``add_gene_duplication`` when its identity is at least 0.95 and the
        two sequence lengths differ by roughly 5% at most.

        :param fp: path of the alignment file; the part of its base name
            before the first ``.`` is used as the genome ID.
        :param orgGenomeDB: object providing ``get_sequence(genome, seqid)``.
            When omitted, a ``GenomeDB`` is created for
            ``<dir of fp>/../genomes/<genomeID>.fa``.
        """
        with open(fp, 'r') as infile:

            genomeID = str(os.path.basename(fp).split(".")[0])

            if orgGenomeDB is None:
                # dirname() returns no trailing separator, so plain string
                # concatenation would glue ".." onto the directory name.
                orgGenomeDB = GenomeDB(
                    os.path.join(os.path.dirname(fp), "..", "genomes",
                                 genomeID + ".fa"))

            for line in infile:
                ret = DiamondResult.from_line(line, genomeID, genomeID)

                # Callers elsewhere in the project guard against from_line
                # returning None; do the same here.
                if ret is None:
                    continue

                if ret.identity < 0.95:
                    continue

                # a sequence trivially matches itself
                if ret.subject.seqid == ret.query.seqid:
                    continue

                subjSeq = orgGenomeDB.get_sequence(genomeID, ret.subject.seqid)
                querySeq = orgGenomeDB.get_sequence(genomeID, ret.query.seqid)

                if subjSeq is None or querySeq is None:
                    print("could not find one of the two sequences", genomeID,
                          ret.subject, ret.query)
                    # Without both sequences the length check below cannot run.
                    continue

                subjLen = len(subjSeq)
                queryLen = len(querySeq)

                # Empty sequences cannot be duplicates and would divide by zero.
                if subjLen == 0 or queryLen == 0:
                    continue

                partialSQ = subjLen / queryLen
                partialQS = queryLen / subjLen

                partialSQok = 0.95 < partialSQ < 1.05
                partialQSok = 0.95 < partialQS < 1.05

                # Reject only if the lengths disagree in both directions.
                if not partialQSok and not partialSQok:
                    continue

                self.add_gene_duplication(genomeID, ret.subject.seqid,
                                          ret.query.seqid)
Esempio n. 4
0
from Bio import SeqIO
import argparse

import sys, os
sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../")

from database.genomedb import GenomeDB

if __name__ == '__main__':
    # Writes BLAST FASTA files for every "*.gb" file in the given directory.

    # glob is needed below but is not among this script's top-level imports;
    # importing it here keeps the script runnable either way.
    import glob

    # NOTE(review): the description text looks copied from another script and
    # add_help=False disables -h/--help; both kept as-is, confirm intent.
    parser = argparse.ArgumentParser(
        description='Calculate kmer histograms and compare for two groups',
        add_help=False)
    parser.add_argument('-l',
                        '--location',
                        type=str,
                        help='input',
                        required=True)
    args = parser.parse_args()

    fileLocation = args.location

    for genbankFile in glob.glob(fileLocation + '/*.gb'):

        print(genbankFile)

        # A fresh database per file, so each export sees only this genome.
        genomeDB = GenomeDB(fileLocation, loadAll=False)
        genomeDB.loadGenome(genbankFile, False)

        genomeDB.writeBLASTfastas(fileLocation)
                #print(alignment[1])

        for x in orgSubMatrix:
            orgSubMatRel[x] = orgSubMatrix[x] / aaLength

        print("W content")
        for org in orgAACounts:
            print(org, "W", orgAACounts[org]['W'])

        return orgSubMatrix, orgSubMatRel, aaLength, orgSubMatrixDir

    # Load two homology databases from file; the second one lives in a
    # "cbdb" directory addressed relative to fileLocation.
    # NOTE(review): "../cbdb/" is appended without a leading separator, so the
    # path only resolves as intended if fileLocation ends with "/" - confirm.
    hpHomolDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp12_hp")
    cbHomolDB = HomologyDatabase.loadFromFile(fileLocation + "../cbdb/" +
                                              "/cbj")

    genomeDB = GenomeDB(fileLocation + "/genomes/")

    # Genomes stored as ".gb" files next to the first homology database.
    genomeDB.loadGenome(fileLocation + "/genomes/CP001217.gb")
    genomeDB.loadGenome(fileLocation + "/genomes/AE000511.gb")

    # The remaining genomes are ".gbff" files, so the database's extension
    # and format attributes are switched before loading them.
    genomeDB.fileExtension = '.gbff'
    genomeDB.fileFormat = 'gb'

    genomeDB.loadGenome(fileLocation + "../cbdb/genomes/NC003912.gbff")
    genomeDB.loadGenome(fileLocation + "../cbdb/genomes/NC002163.gbff")

    # presumably matlist is Biopython's substitution-matrix module; its
    # import is not visible here - verify.
    matrix = matlist.blosum80

    subMatrix = {}

    print("Starting HPP")
Esempio n. 6
0
import editdistance

from database.genomedb import GenomeDB
from database.homologydb import HomologyDatabase

if __name__ == '__main__':
    # Collects, per homology group, the sequences of all member genes.

    genomeLocation = '/home/users/joppich/ownCloud/data/hpyloriDB/genomes/'
    homDBFile = "/home/proj/projekte/dataintegration/hpyloriDB/hpp12.homdb"

    homDB = HomologyDatabase.loadFromFile(homDBFile)
    genDB = GenomeDB(genomeLocation)

    for homGroup in homDB.homologies:

        entries = homDB.homologies[homGroup]
        allSeqs = []

        for seqID in entries:
            # seqID[0] names the genome, seqID[1] the sequence within it
            orgID = seqID[0]

            # genomes are loaded the first time one of their genes shows up
            if orgID not in genDB.genomes:
                genDB.loadGenome(genomeLocation + "/" + orgID + ".gb")

            seq = genDB.get_sequence(orgID, seqID[1])
            allSeqs.append(seq)

        # nothing to do for groups without members
        if not allSeqs:
            continue
Esempio n. 7
0
class HomologyBuilder:
    """Derive homology relations between genomes from pairwise alignments.

    Every ``<basePath>/alignments/<subject>.<query>.aliout`` file is read,
    its hits are turned into a graph (vertices are sequences, edges are
    alignment results) and a fixed sequence of analysis passes is run on
    that graph.  Each pass stores what it finds in ``self.homolDB``.
    """

    # Optional collection of genome IDs.  When set, alignment files whose
    # query or subject genome is not listed are skipped, e.g.
    # ['AE000511', 'CP001217', 'AE001439', 'CP001173'].  None disables it.
    wantedGenomes = None

    def __init__(self, basePath, inputFormat="embl", inputExtension='.gb'):
        """Remember the working directory and create the three databases.

        :param basePath: directory containing ``alignments/`` and
            ``genomes/``; also the target for the result files.
        :param inputFormat: forwarded to ``GenomeDB`` as ``fileFormat``.
        :param inputExtension: genome file extension; forwarded to
            ``GenomeDB`` and used to build genome file names in ``analyse``.
        """
        self.basePath = basePath

        self.genomeInputExtension = inputExtension

        self.genomeDB = GenomeDB(self.basePath,
                                 fileFormat=inputFormat,
                                 fileExtension=inputExtension)
        self.homolDB = HomologyDatabase()
        self.geneDupDB = GeneDuplicationDB()

    def printResult(self, result):
        """Print a result with its identity, score and both sequences."""
        qseq = self.genomeDB.get_sequence(result.query.genome,
                                          result.query.seqid)
        sseq = self.genomeDB.get_sequence(result.subject.genome,
                                          result.subject.seqid)

        print(result.query, result.subject, result.identity,
              self.makeScore(result))
        print(len(qseq), qseq)
        print(len(sseq), sseq)

    def makeScore(self, result):
        """Return the ranking score of an alignment result.

        The score is ``(4 * identity + covQuery + covSubject) / 6`` where the
        two coverage terms are ``len(result)`` divided by the length of the
        query and subject sequence respectively.
        """
        iden = float(result.identity)

        qseq = self.genomeDB.get_sequence(result.query.genome,
                                          result.query.seqid)
        sseq = self.genomeDB.get_sequence(result.subject.genome,
                                          result.subject.seqid)

        length = (len(result) / len(qseq)) + (len(result) / len(sseq))

        return (4 * iden + length) / 6.0

    def getIDObj(self, edge, vertex):
        """Return the side of the edge's result that names ``vertex``.

        Returns the result's query or subject object, whichever has a
        ``(genome, seqid)`` pair equal to ``vertex.name``, or None.
        """
        diamondResult = edge.props['info']

        if vertex.name == (diamondResult.query.genome,
                           diamondResult.query.seqid):
            return diamondResult.query

        if vertex.name == (diamondResult.subject.genome,
                           diamondResult.subject.seqid):
            return diamondResult.subject

        return None

    def getNonIDObj(self, edge, vertex):
        """Return the side of the edge's result opposite to ``vertex``.

        Mirror image of ``getIDObj``: yields the subject when ``vertex`` is
        the query and vice versa, or None when ``vertex`` matches neither.
        """
        diamondResult = edge.props['info']

        if vertex.name == (diamondResult.query.genome,
                           diamondResult.query.seqid):
            return diamondResult.subject

        if vertex.name == (diamondResult.subject.genome,
                           diamondResult.subject.seqid):
            return diamondResult.query

        return None

    def analyse(self):
        """Process all alignment files and return the filled homology DB.

        After the per-file analysis the gene-duplication relations are added,
        the database is written before and after ``finalize()`` and the
        genome sequences are exported as CSV.

        :return: ``self.homolDB``
        """
        for file in glob.glob(self.basePath + "/alignments/*.aliout"):

            # file names have the form "<subjectGenome>.<queryGenome>.aliout"
            afile = os.path.basename(file).split('.')
            subjectGenome = afile[0]
            queryGenome = afile[1]

            wanted = self.wantedGenomes
            if wanted is not None and (queryGenome not in wanted
                                       or subjectGenome not in wanted):
                continue

            # self-alignments are only used for duplicate detection below
            if queryGenome == subjectGenome:
                continue

            self.genomeDB.loadGenome(self.basePath + "/genomes/" +
                                     queryGenome + self.genomeInputExtension)
            self.genomeDB.loadGenome(self.basePath + "/genomes/" +
                                     subjectGenome + self.genomeInputExtension)

            # Both genomes need their self-alignment for duplicate detection.
            dupfiles = [
                self.basePath + "/alignments/" + queryGenome + "." +
                queryGenome + ".aliout", self.basePath + "/alignments/" +
                subjectGenome + "." + subjectGenome + ".aliout"
            ]

            missing = [x for x in dupfiles if not os.path.isfile(x)]
            for x in missing:
                print("Not a file", x)

            if missing:
                continue

            self.geneDupDB.load_organism(dupfiles[0], self.genomeDB)
            self.geneDupDB.load_organism(dupfiles[1], self.genomeDB)

            query2result = self._read_results(file, queryGenome,
                                              subjectGenome)
            graph = self._build_graph(query2result)

            self._run_pipeline(graph)

            # STEP 7: mention leftovers
            self._print_leftover_edges(graph)

        allDupRelations = self.geneDupDB.get_gene_relations()
        allDupRelations.toDataBase(self.homolDB)

        self.homolDB.save_to_file(self.basePath + "/homdb_pre_finalize")
        self.homolDB.finalize()
        self.homolDB.save_to_file(self.basePath + "/homdb_post_finalize")

        self.genomeDB.writeCSV(self.basePath + "/genome_seqs/seqs")

        return self.homolDB

    def _skip_as_duplicate(self, seqObj):
        """Tell whether ``seqObj`` is a non-representative gene duplicate.

        For such a sequence the ID is rewritten to the first ID of its
        duplication group and True is returned, so the caller can drop the
        hit.  Sequences without a duplication entry, or the representative
        itself, yield False.
        """
        if not self.geneDupDB.has_gene_duplication(seqObj.genome,
                                                   seqObj.seqid):
            return False

        commonIDs = self.geneDupDB.get_gene_duplication(
            seqObj.genome, seqObj.seqid)

        if seqObj.seqid == commonIDs[0]:
            return False

        # todo we expect not to lose anything by dropping the hit, but one
        # should check that beforehand ...
        seqObj.seqid = commonIDs[0]
        return True

    def _read_results(self, file, queryGenome, subjectGenome):
        """Parse one alignment file into ``{query seqid: [results]}``.

        Unparsable lines and hits involving a non-representative duplicate
        are dropped.  Each list is sorted by ``makeScore``, best hit first.
        """
        query2result = defaultdict(list)

        with open(file, 'r') as infile:

            for line in infile:

                ret = DiamondResult.from_line(line, queryGenome,
                                              subjectGenome)

                if ret is None:
                    continue

                if self._skip_as_duplicate(ret.query):
                    continue

                if self._skip_as_duplicate(ret.subject):
                    continue

                query2result[ret.query.seqid].append(ret)

        for seqid in query2result:
            query2result[seqid] = sorted(query2result[seqid],
                                         key=self.makeScore,
                                         reverse=True)

        return query2result

    def _make_vertex(self, seqObj):
        """Create a graph vertex carrying the sequence of ``seqObj``."""
        return Vertex(
            seqObj.idtuple(), {
                'sequence':
                self.genomeDB.get_sequence(seqObj.genome, seqObj.seqid)
            })

    def _build_graph(self, query2result):
        """Build the hit graph: one vertex per sequence, one edge per hit.

        Hits with ``len(result) < 20`` do not become edges.  Afterwards each
        vertex's neighbour list is sorted by the seqid of the opposite side.
        """
        graph = Graph()

        for seqid in query2result:
            results = query2result[seqid]

            if len(results) == 0:
                continue

            # Use the vertex the graph hands back, exactly as is done for
            # subject vertices, so edges always attach to the stored vertex.
            queryVert = graph.add_vertex_if_not_exists(
                self._make_vertex(results[0].query))

            for result in results:

                if len(result) < 20:
                    continue

                subjVert = graph.add_vertex_if_not_exists(
                    self._make_vertex(result.subject))

                graph.add_edge(queryVert, subjVert, {'info': result}, True)

        for vertexID in graph.vertices:
            vertex = graph.vertices[vertexID]
            vertex.neighbors = sorted(
                vertex.neighbors,
                key=lambda x: self.getNonIDObj(x, vertex).seqid)

        return graph

    def _store(self, analysis):
        """Run one analysis pass and write its result to ``self.homolDB``."""
        analysis.analyse().toDataBase(self.homolDB)

    @staticmethod
    def _check_edge_partial(config, edge, source, target):
        """Edge filter for the pass that also accepts rather bad identity.

        An edge qualifies when both covered fractions and the identity
        exceed the configured minima and both fractions are above 0.5.
        """
        queryLength = config.get_seq_fraction(edge, source)
        subjectLength = config.get_seq_fraction(edge, target)

        return (queryLength > config.minQueryLength
                and subjectLength > config.minSubjectLength
                and edge.props['info'].identity > config.minIdentity
                and min(queryLength, subjectLength) > 0.5)

    @staticmethod
    def _check_edge_long(config, edge, source, target):
        """Edge filter for extremely long sequences (> 500).

        Requires ``identity * shorter sequence length > 500``, an e-value
        below 1e-90 and both covered fractions above their own configured
        minimum.
        """
        edgeInfo = edge.props['info']

        minSeqLength = min(len(source.props['sequence']),
                           len(target.props['sequence']))

        queryLength = config.get_seq_fraction(edge, source)
        subjectLength = config.get_seq_fraction(edge, target)

        if edgeInfo.identity * minSeqLength <= 500:
            return False

        if not edgeInfo.evalue < math.pow(10, -90):
            return False

        # The subject fraction is checked against minSubjectLength; it used
        # to be compared with minQueryLength by mistake.
        return (queryLength > config.minQueryLength
                and subjectLength > config.minSubjectLength)

    def _run_pipeline(self, graph):
        """Apply all analysis passes to ``graph`` in their fixed order."""

        # STEP 1: remove empty nodes (if they exist)
        graphCleaner(graph, None).analyse()

        # STEP 2: find easy matches
        self._store(oneHitHomologs(graph, self.genomeDB,
                                   OneHitHomologsConfig()))

        # STEP 2.1: multiple hits, but one very high scoring.
        # One of multiple: if an excellent hit is found, only allow that hit.
        one2mulHitsConfig = oneMultipleConfig()
        one2mulHitsConfig.minQueryLength = 0.9
        one2mulHitsConfig.minSubjectLength = 0.9
        one2mulHitsConfig.minIdentity = 0.6
        one2mulHitsConfig.allowMultiple = False
        self._store(oneMultipleHomologs(graph, self.genomeDB,
                                        one2mulHitsConfig))

        # One of multiple: if an excellent hit is found, allow that one gene
        # is homologous to many others.
        one2mulHitsConfig = oneMultipleConfig()
        one2mulHitsConfig.minQueryLength = 0.9
        one2mulHitsConfig.minSubjectLength = 0.9
        one2mulHitsConfig.allowMultiple = True
        self._store(oneMultipleHomologs(graph, self.genomeDB,
                                        one2mulHitsConfig))

        # One of multiple: allow non-excellent hits.
        one2mulHitsConfig = oneMultipleConfig()
        one2mulHitsConfig.allowMultiple = False
        self._store(oneMultipleHomologs(graph, self.genomeDB,
                                        one2mulHitsConfig))

        # STEP 3: one sequence, multiple sequences map
        self._store(ManyToOneCombination(graph, self.genomeDB))

        # STEP 3.1: try to use a subset to get good coverage
        greedyConfig = GreedyCombinationConfig()
        greedyConfig.sortingFunctionAssembly = (
            lambda x: x.props['info'].identity)
        self._store(GreedyCombinationCreator(graph, self.genomeDB,
                                             greedyConfig))

        greedyConfig = GreedyCombinationConfig()
        greedyConfig.minExplainedThreshold = 0.5
        greedyConfig.allowTargetOverlaps = True
        self._store(GreedyCombinationCreator(graph, self.genomeDB,
                                             greedyConfig))

        greedyConfig = GreedyCombinationConfig()
        greedyConfig.minExplainedThreshold = 0.55
        self._store(GreedyCombinationCreator(graph, self.genomeDB,
                                             greedyConfig))

        # STEP 5 (runs before step 4): multiple sequences form a cluster
        mulCombAnalysisConfig = MultiCombinationCreatorConfig()
        self._store(MultiCombinationCreator(graph, self.genomeDB,
                                            mulCombAnalysisConfig))

        # STEP 4: one sequence, one or multiple sequences align; accept
        # also rather bad identity
        omConfig = oneMultipleConfig()
        omConfig.allowPartialLength = True
        omConfig.betterEdgeCheck = True
        omConfig.allowMultiple = False
        omConfig.minIdentity = 0.4
        omConfig.minQueryLength = 0.8
        omConfig.minSubjectLength = 0.8
        omConfig.considerEdgeFunc = HomologyBuilder._check_edge_partial
        self._store(oneMultipleHomologs(graph, self.genomeDB, omConfig))

        # Extremely long sequences: the same config object is reused with
        # relaxed length limits and a different edge filter.
        omConfig.minQueryLength = 0.6
        omConfig.minSubjectLength = 0.6
        omConfig.considerEdgeFunc = HomologyBuilder._check_edge_long
        self._store(oneMultipleHomologs(graph, self.genomeDB, omConfig))

        # STEP 6: remove hits which make no sense
        SpuriousEdgeRemover(graph, self.genomeDB).analyse()

        # Some relations may have been hidden by combinations.
        oneHitConfig = OneHitHomologsConfig(minIDScore=0.8,
                                            minLengthScore=0.7)
        self._store(oneHitHomologs(graph, self.genomeDB, oneHitConfig))

    @staticmethod
    def _print_leftover_edges(mygraph):
        """Print every edge still in the graph once.

        Vertices are visited from longest to shortest sequence; an edge is
        identified by its ``info`` object so it is not printed from both of
        its end points.
        """
        sortedVerts = sorted(
            mygraph.vertices,
            key=lambda x: len(mygraph.get_vertex(x).props['sequence']),
            reverse=True)

        seenDiamondInfos = set()
        for x in sortedVerts:

            vertex = mygraph.get_vertex(x)

            for edge in vertex.neighbors:

                diamondInfo = edge.props.get('info', None)

                if diamondInfo is None:
                    continue

                if diamondInfo not in seenDiamondInfos:
                    print(edge.source.name, edge.target.name,
                          edge.props['info'])
                    seenDiamondInfos.add(diamondInfo)
Esempio n. 8
0
from analysis.homologybuilder import HomologyBuilder
from database.genomedb import GenomeDB

if __name__ == '__main__':
    # Builds the homology database for the genomes below fileLocation.
    fileLocation = '/mnt/c/ownCloud/data/cbdb/'
    genomeFormat = 'gb'
    genomeExtension = '.gbff'

    # Flip to True for a one-off run that only exports the BLAST FASTA
    # files of all genomes and then stops.
    initialise = False
    if initialise:
        genomesDir = fileLocation + "genomes/"

        genomDB = GenomeDB(genomesDir,
                           fileFormat=genomeFormat,
                           fileExtension=genomeExtension,
                           loadAll=True)
        genomDB.writeBLASTfastas(genomesDir)

        exit()

    builder = HomologyBuilder(basePath=fileLocation,
                              inputFormat=genomeFormat,
                              inputExtension=genomeExtension)
    homolDB = builder.analyse()
    homolDB.save_to_file(fileLocation + "/cbj")
Esempio n. 9
0
def distance(r1, r2):
    """Return the gap between two ``(start, end)`` ranges.

    The result is ``start of the later range - end of the earlier range``
    when both ranges are well-formed (``start <= end``) and the earlier one
    ends strictly before the later one starts; otherwise 0.
    """
    # order the ranges so that ``first`` is the one that sorts lower
    first, second = sorted((r1, r2))

    both_well_formed = first[0] <= first[1] and second[0] <= second[1]
    if not both_well_formed:
        return 0

    if first[1] < second[0]:
        return second[0] - first[1]

    # touching or overlapping ranges have no gap
    return 0


if __name__ == '__main__':

    fileLocation = "/mnt/c/dev/data/haas/homdb/"

    homDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp_split")
    genomDB = GenomeDB(fileLocation + "/genomes", loadAll=False)

    # load exactly the genomes the homology database knows about
    allorgs = homDB.get_all_organisms()
    for org in allorgs:
        genomDB.loadGenome(org)

    # Organism groups: "mc" is listed explicitly, "nmc" is everything else
    # that is neither in "extra" nor has an ID starting with "6_".
    extra = ['AE001439', 'CP009259']
    mc = [
        '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1',
        '3_N1-029C1', '11_N4-029C2', '10_N2-085C2', '1_N1-024A1'
    ]

    nmc = []
    for org in allorgs:
        if org in mc or org in extra or org.startswith("6_"):
            continue  # and not org.startswith("15")
        nmc.append(org)
Esempio n. 10
0
import sys
import os
from collections import defaultdict

sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../")

import math

from database.genomedb import GenomeDB
from database.homologydb import HomologyDatabase
from utils.utils import fileLocation

# Script entry point: loads the homology database and the genomes that the
# analysis below works on, both located relative to fileLocation.
if __name__ == '__main__':

    homolDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp12_hp")
    genomeDB = GenomeDB(fileLocation + "/genomes/")

    # presumably the only organisms taken into account further down - verify
    allowedOrgs = ['CP001217', 'AE000511']

    # Pair of amino-acid lists.  Only the LAST assignment takes effect; the
    # first one and the commented-out lines are alternative settings.
    compareAA = (['W'], ['F', 'G', 'A'])
    compareAA = (['W'], ['H', 'F', 'Y', 'P', 'K'])

    #compareAA = (['W', 'M'], ['H', 'F', 'Y', 'P', 'K'])
    #compareAA = (['W', 'M'], ['F', 'G', 'A'])


    def calculateDifferences(orgI, orgJ, allAA):

        allDiffs = list()
        foundGenes = 0
Esempio n. 11
0
import sys
import os

sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../")

from database.genomedb import GenomeDB
from utils.utils import fileLocation

if __name__ == '__main__':

    genomeDB = GenomeDB(fileLocation + "/genomes/")

    diffGenePairs = {(('AE000511', 'HP_0868'), ('CP001217', 'HPP12_0868')),
                     (('AE000511', 'HP_0036'), ('CP001217', 'HPP12_0032')),
                     (('AE000511', 'HP_0963'), ('CP001217', 'HPP12_0958')),
                     (('AE000511', 'HP_1282'), ('CP001217', 'HPP12_1248')),
                     (('AE000511', 'HP_0568'), ('CP001217', 'HPP12_0574')),
                     (('AE000511', 'HP_0286'), ('CP001217', 'HPP12_0285')),
                     (('AE000511', 'HP_0519'), ('CP001217', 'HPP12_0525')),
                     (('AE000511', 'HP_0104'), ('CP001217', 'HPP12_0106')),
                     (('AE000511', 'HP_0091'), ('CP001217', 'HPP12_0094')),
                     (('AE000511', 'HP_0342'), ('CP001217', 'HPP12_0337')),
                     (('AE000511', 'HP_0656'), ('CP001217', 'HPP12_0669')),
                     (('AE000511', 'HP_0043'), ('CP001217', 'HPP12_0038')),
                     (('AE000511', 'HP_0108'), ('CP001217', 'HPP12_0110')),
                     (('AE000511', 'HP_1213'), ('CP001217', 'HPP12_1179')),
                     (('AE000511', 'HP_1105'), ('CP001217', 'HPP12_1070')),
                     (('AE000511', 'HP_0661'), ('CP001217', 'HPP12_0674')),
                     (('AE000511', 'HP_0430'), ('CP001217', 'HPP12_0992')),
                     (('AE000511', 'HP_0048'), ('CP001217', 'HPP12_0042')),
                     (('AE000511', 'HP_0860'), ('CP001217', 'HPP12_0860')),
Esempio n. 12
0
                        required=True)
    # Boolean flag, False unless --redo is given.
    # NOTE(review): the help text 'input' does not describe the flag.
    parser.add_argument('--redo',
                        action='store_true',
                        help='input',
                        default=False)

    # Optional restriction to a fixed set of organisms.  The second
    # assignment overrides the first, so no restriction is active.
    restrictOrgs = ['AE001439', 'AE000511', 'CP001217']
    restrictOrgs = None

    args = parser.parse_args()

    print("Loading Hom DB")
    # args.location is used via its .name attribute, so it is presumably an
    # opened file object (argparse.FileType) - verify against its definition.
    homDB = HomologyDatabase.loadFromFile(args.location.name)

    print("Loading Genomes")
    # Genomes are looked up in a "genomes" directory next to the database file.
    genomDB = GenomeDB(os.path.dirname(args.location.name) + "/genomes",
                       loadAll=False)

    allorgs = homDB.get_all_organisms()

    if restrictOrgs:
        allorgs = restrictOrgs

    # load only the genomes that are actually going to be analysed
    for org in allorgs:
        genomDB.loadGenome(org)

    print("Loading HomDB analyser")
    analyse = HomDBAnalyser(homDB, genomDB, loadAll=False)

    # one entry per organism
    maxNumberEntries = len(allorgs)

    # NOTE(review): judging by the name, an upper bound on the dissimilarity
    # tolerated inside a cluster; its use is not visible here.
    maxAllowedDissimWithinCluster = 0.25
Esempio n. 13
0
from database.genomedb import GenomeDB
from database.homDBAnalyser import HomDBAnalyser
from database.homologydb import HomologyDatabase

if __name__ == '__main__':
    baseDIR = '/mnt/c/dev/data/haas/homdb/'

    genomeDB = GenomeDB(baseDIR + "/genomes", loadAll=False)
    homDB = HomologyDatabase.loadFromFile(baseDIR + "/hpp_comb")

    analyse = HomDBAnalyser(homDB, genomeDB)

    def printHOM(homid):
        """Print the alignment of one homology cluster and summarise it.

        Prints ``homid`` followed by every aligned record (ordered by record
        id).  Returns ``(cluster id, longest ungapped sequence, set of
        (ungapped sequence, record id) pairs)``.
        """
        print(homid)

        clusterID = 'HOMID'+str(homid)
        alignment = analyse.cluster_align(clusterID)

        longest = ""
        seenSeqs = set()

        for record in sorted(alignment._records, key=lambda r: r.id):

            # gap characters only exist in the aligned form
            ungapped = str(record.seq).replace('-', '')
            seenSeqs.add((ungapped, record.id))

            if len(ungapped) > len(longest):
                longest = ungapped

            print(record.seq, record.id)

        return (clusterID, longest, set(seenSeqs))
Esempio n. 14
0
import sys, os
sys.path.insert(
    0,
    str(os.path.dirname(os.path.realpath(__file__))) + "/../../helipyloridb")

from database.genomedb import GenomeDB
from database.homologydb import HomologyDatabase

if __name__ == '__main__':

    fileLocation = "/mnt/c/dev/data/haas/homdb/"

    homDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp_split")
    genomDB = GenomeDB(fileLocation + "/genomes", loadAll=False)

    allorgs = homDB.get_all_organisms()

    # Organism groups: "mc" is listed explicitly, "nmc" is every other known
    # organism that is neither in "extra" nor has an ID starting with "6_".
    extra = ['AE001439', 'CP009259']
    mc = [
        '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1',
        '3_N1-029C1', '11_N4-029C2', '10_N2-085C2', '1_N1-024A1'
    ]
    nmc = [
        x for x in allorgs
        if not (x in mc or x in extra or x.startswith("6_"))
    ]  # and not x.startswith("15")

    for label, group in (("MC", mc), ("NMC", nmc)):
        print(label, len(group), group)

    homlist = []