Exemple #1
0
 def __init__(self,
              genedir,
              seqfiles,
              maxfails,
              maxgaps,
              minoverlap,
              logger,
              wd=os.getcwd()):
     """Read the FASTA files of one gene into this store.

     Each file in ``seqfiles`` is opened from ``genedir`` and parsed with
     ``SeqIO.parse``. Files that yield at least one record are stored
     under ``self[name]`` as ``[seqs, shortest_length]`` where ``name`` is
     the file name without its ``.fasta`` suffix and ``seqs`` is a list of
     ``[record, nfails]`` pairs (``nfails`` starts at 0).

     Args:
         genedir: directory containing the sequence files.
         seqfiles: file names (relative to ``genedir``) to load.
         maxfails: stored as ``self.maxfails``.
         maxgaps: stored as ``self.maxgaps``.
         minoverlap: stored as ``self.minoverlap``.
         logger: stored as ``self.logger``.
         wd: working directory handed to ``getThreads``. NOTE: the default
             is evaluated once, when the function is defined.
     """
     self.wd = wd
     self.logger = logger
     self.threads = getThreads(wd)
     # NOTE(review): original comment called this a "minimum"; the name
     # suggests a maximum number of fails in a row -- confirm with usage
     self.maxfails = maxfails
     self.dspp = []  # species dropped
     self.nseqs = 0  # counter for seqs
     self.blast_prop = 0.5  # the p sequences a sequence must overlap
     self.maxgaps = maxgaps
     self.minoverlap = minoverlap
     # raw string: '\.' in a plain literal is an invalid escape sequence
     fasta_suffix = re.compile(r'\.fasta$')
     for seqfile in seqfiles:
         name = fasta_suffix.sub('', seqfile)
         seqdir = os.path.join(genedir, seqfile)
         seqs = []
         lengths = []
         # plain "r": the "U" flag was removed in Python 3.11 and text mode
         # already applies universal newlines in Python 3
         with open(seqdir, "r") as infile:
             for record in SeqIO.parse(infile, "fasta"):
                 # every record is renamed after the file it came from
                 record.id = name
                 lengths.append(len(record))
                 seqs.append([record, 0])  # seqrecord + nfails
                 self.nseqs += 1
         # files without any record are not added to the store
         if seqs:
             self[name] = [seqs, np.min(lengths)]
 def __init__(self, seqstore, maxgaps, minoverlap, minseedsize,
              maxseedsize, maxtrys, maxseedtrys, gene_type, outgroup,
              logger, wd=os.getcwd()):
     """Set up the alignment state for one sequence store.

     Arguments are copied onto the instance; counters and flags start
     at their initial values.

     NOTE(review): ``maxseedsize`` and ``maxtrys`` are accepted but not
     used here -- ``self.maxtrys`` is fixed at 2 and ``self.seedsize``
     starts at ``len(seqstore)``. Confirm this is intended.
     """
     # environment
     self.wd = wd
     self.logger = logger
     self.threads = getThreads(wd=wd)
     # inputs
     self.seqstore = seqstore
     self.type = gene_type
     self.outgroup = outgroup
     # alignment thresholds
     self.maxgaps = maxgaps
     self.minoverlap = minoverlap
     self.minseedsize = minseedsize
     self.seedsize = len(seqstore)
     # try limits and counters
     self.maxtrys = 2  # alignment attempts allowed
     self.total_trys = 0  # attempts made so far
     self.buffer = maxseedtrys  # attempts allowed per seedsize
     self.buffer_counter = 0  # attempts made at the current seedsize
     # run-time switches
     self.timeout = 99999999
     self.talign = False
     self.tadd = False
     self.silent = False
Exemple #3
0
 def __init__(
     self,
     seqstore,
     maxgaps,
     minoverlap,
     minseedsize,
     maxseedsize,
     maxtrys,
     maxseedtrys,
     gene_type,
     outgroup,
     logger,
     wd=os.getcwd(),
 ):
     """Initialise the aligner's settings, counters and flags.

     Most arguments are stored unchanged as attributes of the same (or a
     closely related) name.

     NOTE(review): ``maxtrys`` and ``maxseedsize`` are not read in this
     constructor; ``self.maxtrys`` is hard-coded to 2. Verify against
     callers before relying on either parameter.
     """
     # where and how to run
     self.wd = wd
     self.logger = logger
     self.threads = getThreads(wd=wd)
     # what to align
     self.seqstore = seqstore
     self.outgroup = outgroup
     self.type = gene_type
     # quality thresholds
     self.minoverlap = minoverlap
     self.maxgaps = maxgaps
     # seed handling: start with all sequences in the store
     self.minseedsize = minseedsize
     self.seedsize = len(seqstore)
     self.buffer = maxseedtrys  # trys allowed for one seedsize
     self.buffer_counter = 0  # trys used for the current seedsize
     # attempt bookkeeping
     self.maxtrys = 2  # trys allowed for alignment attempts
     self.total_trys = 0  # trys used in total
     # flags and timeout
     self.silent = False
     self.talign = False
     self.tadd = False
     self.timeout = 99999999
Exemple #4
0
def getClusters(gene_sequences, minoverlap, logger, wd):
    """Identify clusters in sequences

    Up to five times, one sequence is picked at random and all remaining
    sequences are blasted against it with ``atools.blast``. The hits form
    a cluster if they cover more than half of all species and more than
    five species; accepted clusters are removed from the pool before the
    next attempt. The search stops early once fewer than half of the
    species remain in the pool.

    Args:
        gene_sequences: list of entries where ``e[0]`` is used as the
            species identifier and ``e[1]`` is passed to blast
            (presumably ``(name, sequence)`` pairs -- verify with caller).
        minoverlap: forwarded to ``atools.blast``.
        logger: forwarded to ``atools.blast``.
        wd: working directory, forwarded to ``getThreads`` and blast.

    Returns:
        A list of clusters, each a list of ``gene_sequences`` entries.
        Empty if no cluster was found or ``gene_sequences`` is empty.
    """
    # nothing to cluster: picking a random sequence from an empty list
    # would otherwise fail with an unhelpful ValueError
    if not gene_sequences:
        return []

    def findClusters(gene_sequences):
        # blast all against 1 randomly chosen sequence
        sequences = [e[1] for e in gene_sequences]
        randi = random.randint(0, len(sequences)-1)
        bools, _ = atools.blast(sequences, sequences[randi], minoverlap,
                                logger, wd, threads)
        # how many species had sequences in the cluster?
        cluster_sequences = [gene_sequences[i] for i, e in enumerate(bools)
                             if e]
        nspp = len({e[0] for e in cluster_sequences})
        pspp = float(nspp)/tot_nspp
        # if more than 50% and 5 species ...
        if pspp > 0.5 and nspp > 5:
            # return cluster, remove those sequences from gene_sequences
            remaining = [gene_sequences[i] for i, e in enumerate(bools)
                         if not e]
            return cluster_sequences, remaining
        return None, gene_sequences

    threads = getThreads(wd=wd)
    res = []
    # species count of the full input; the denominator for all proportions
    tot_nspp = len({e[0] for e in gene_sequences})
    # try max 5 times to get a cluster from randomly selecting a seq
    for _ in range(5):
        cluster_sequences, gene_sequences = findClusters(gene_sequences)
        if cluster_sequences:
            res.append(cluster_sequences)
        # if gene sequences has not enough seqs left, break
        pspp = float(len({e[0] for e in gene_sequences}))/tot_nspp
        if pspp < 0.5:
            break
    return res
 def __init__(self,
              alignment_store,
              rttstat,
              outdir,
              maxtrys,
              logger,
              wd=os.getcwd()):
     """Prepare the state needed to build phylogenies from alignments.

     A constraint is flagged (``self.constraint``) when a file named
     ``taxontree.tre`` exists inside ``outdir``.
     """
     self.logger = logger
     self.wd = wd
     # one more than getThreads reports: RAxML runs a small master process
     self.threads = getThreads(wd=wd) + 1
     # inputs
     self.alignment_store = alignment_store
     self.genes = alignment_store.keys()
     self.rttstat = rttstat
     # outputs
     self.outdir = outdir
     self.phylogenies = []
     # attempt bookkeeping
     self.maxtrys = maxtrys
     self.trys = 0
     # optional taxon tree
     self.taxontree = os.path.join(outdir, "taxontree.tre")
     self.constraint = os.path.isfile(self.taxontree)
Exemple #6
0
 def __init__(
     self,
     alignment_store,
     rttstat,
     outdir,
     maxtrys,
     logger,
     wd=os.getcwd(),
 ):
     """Prepare the state needed to build phylogenies from alignments.

     The try counter is restored through ``self._getTrys(outdir)``, and
     a constraint is flagged (``self.constraint``) when the relative path
     ``1_names/taxontree.tre`` points to an existing file.
     """
     self.logger = logger
     self.wd = wd
     self.threads = getThreads(wd=wd)
     # called right after logger/wd/threads are set, as in the original
     self.trys = self._getTrys(outdir)
     # inputs
     self.alignment_store = alignment_store
     self.genes = alignment_store.keys()
     self.rttstat = rttstat
     # outputs and limits
     self.outdir = outdir
     self.phylogenies = []
     self.maxtrys = maxtrys
     # NOTE(review): path is relative and does not use outdir or wd --
     # presumably resolved against the process working directory; confirm
     self.taxontree = os.path.join('1_names', "taxontree.tre")
     self.constraint = os.path.isfile(self.taxontree)
Exemple #7
0
 def __init__(
     self,
     gene_names,
     nseqs,
     thoroughness,
     maxpn,
     votesize,
     maxtrys,
     minoverlap,
     maxlen,
     minlen,
     logger,
     wd=os.getcwd(),
 ):
     """Store the search settings and reset the run-time state.

     ``thoroughness`` is kept as the upper bound
     (``self.max_thoroughness``); the current ``self.thoroughness``
     starts at 1.
     """
     # environment
     self.wd = wd
     self.logger = logger
     self.threads = getThreads(wd=wd)
     # what to look for
     self.gene_names = gene_names
     self.nseqs = nseqs
     # sequence length and overlap limits
     self.minlen = minlen
     self.maxlen = maxlen
     self.minoverlap = minoverlap
     # remaining settings, stored as given
     self.maxpn = maxpn
     self.votesize = votesize
     self.maxtrys = maxtrys
     # thoroughness starts at 1, bounded by the requested maximum
     self.max_thoroughness = thoroughness
     self.thoroughness = 1
     # run-time state
     self.deja_vues = []
     # matches a single unambiguous nucleotide letter, either case
     self.pattern = re.compile("[ACTGactg]")
Exemple #8
0
def calcWorkers(threads,
                nfolders,
                min_threads_per_worker=2,
                max_threads_per_worker=100):
    """Calculate the number of workers for parallel running of folders

    Workers are added first, each with ``min_threads_per_worker`` threads.
    Only when every folder has a worker are the threads per worker raised,
    up to ``max_threads_per_worker``.

    Args:
        threads: threads requested; ``-1`` means all threads reported by
            ``getThreads()`` (when it reports any).
        nfolders: number of folders, i.e. the maximum number of workers.
        min_threads_per_worker: lower bound, capped at ``threads``.
        max_threads_per_worker: upper bound, capped at ``threads``.

    Returns:
        Tuple ``(nworkers, threads_per_worker, spare_threads)``.

    Exits via ``sys.exit`` if more threads are requested than
    ``getThreads()`` reports.
    """
    # get available threads on machine
    available_threads = getThreads()
    if available_threads:
        # make sure threads arg is not greater than those available
        if threads > available_threads:
            sys.exit('More threads specified than avaiable on machine')
        if threads == -1:
            threads = available_threads
    # make sure threads is absolute
    threads = abs(threads)
    # neither bound may exceed the number of threads
    min_threads_per_worker = min(min_threads_per_worker, threads)
    max_threads_per_worker = min(max_threads_per_worker, threads)
    # calc nworkers and threads_per_worker
    # increase workers before threads_per_worker
    threads_per_worker = min_threads_per_worker
    # nfolders itself must be tested (hence + 1): otherwise a full set of
    # workers is accepted even when it needs more threads than available
    for i in range(nfolders + 1):
        if i * threads_per_worker > threads:
            nworkers = i - 1
            break
    else:
        # every folder gets a worker; hand out the remaining threads
        nworkers = nfolders
        # the maximum itself must be tested too (hence + 1), otherwise it
        # is accepted unchecked and spare_threads can become negative
        for i in range(min_threads_per_worker, max_threads_per_worker + 1):
            if nworkers * i > threads:
                threads_per_worker = i - 1
                break
        else:
            threads_per_worker = max_threads_per_worker
    spare_threads = int(threads - nworkers * threads_per_worker)
    return nworkers, threads_per_worker, spare_threads
 def __init__(self, genedir, seqfiles, maxfails, maxgaps, minoverlap, logger, wd=os.getcwd()):
     """Populate the store from the FASTA files of a single gene.

     For every name in ``seqfiles`` the file ``genedir/<name>`` is parsed
     with ``SeqIO.parse``. When it contains records, the entry
     ``self[stem]`` is set to ``[seqs, shortest_length]``: ``stem`` is the
     file name with a trailing ``.fasta`` removed and ``seqs`` holds one
     ``[record, nfails]`` pair per record, with ``nfails`` starting at 0.

     Args:
         genedir: directory containing the sequence files.
         seqfiles: file names (relative to ``genedir``) to load.
         maxfails: stored as ``self.maxfails``.
         maxgaps: stored as ``self.maxgaps``.
         minoverlap: stored as ``self.minoverlap``.
         logger: stored as ``self.logger``.
         wd: working directory handed to ``getThreads``. NOTE: the default
             is evaluated once, when the function is defined.
     """
     self.wd = wd
     self.logger = logger
     self.threads = getThreads(wd)
     # NOTE(review): original comment called this a "minimum"; the name
     # suggests a maximum number of fails in a row -- confirm with usage
     self.maxfails = maxfails
     self.dspp = []  # species dropped
     self.nseqs = 0  # counter for seqs
     self.blast_prop = 0.5  # the p sequences a sequence must overlap
     self.maxgaps = maxgaps
     self.minoverlap = minoverlap
     # raw string: "\." in a plain literal is an invalid escape sequence
     fasta_suffix = re.compile(r"\.fasta$")
     for seqfile in seqfiles:
         name = fasta_suffix.sub("", seqfile)
         seqdir = os.path.join(genedir, seqfile)
         seqs = []
         lengths = []
         # plain "r": the "U" flag was removed in Python 3.11 and text mode
         # already applies universal newlines in Python 3
         with open(seqdir, "r") as infile:
             for record in SeqIO.parse(infile, "fasta"):
                 # every record is renamed after the file it came from
                 record.id = name
                 lengths.append(len(record))
                 seqs.append([record, 0])  # seqrecord + nfails
                 self.nseqs += 1
         # files without any record are not added to the store
         if seqs:
             self[name] = [seqs, np.min(lengths)]
Exemple #10
0
def calcWorkers(threads, nfolders, min_threads_per_worker=2,
                max_threads_per_worker=100):
    """Calculate the number of workers for parallel running of folders

    Workers are added first, each with ``min_threads_per_worker`` threads.
    Only when every folder has a worker are the threads per worker raised,
    up to ``max_threads_per_worker``.

    Args:
        threads: threads requested; ``-1`` means all threads reported by
            ``getThreads()`` (when it reports any).
        nfolders: number of folders, i.e. the maximum number of workers.
        min_threads_per_worker: lower bound, capped at ``threads``.
        max_threads_per_worker: upper bound, capped at ``threads``.

    Returns:
        Tuple ``(nworkers, threads_per_worker, spare_threads)``.

    Exits via ``sys.exit`` if more threads are requested than
    ``getThreads()`` reports.
    """
    # get available threads on machine
    available_threads = getThreads()
    if available_threads:
        # make sure threads arg is not greater than those available
        if threads > available_threads:
            sys.exit('More threads specified than avaiable on machine')
        if threads == -1:
            threads = available_threads
    # make sure threads is absolute
    threads = abs(threads)
    # neither bound may exceed the number of threads
    min_threads_per_worker = min(min_threads_per_worker, threads)
    max_threads_per_worker = min(max_threads_per_worker, threads)
    # calc nworkers and threads_per_worker
    # increase workers before threads_per_worker
    threads_per_worker = min_threads_per_worker
    # include nfolders in the scan (+1): without it, nfolders workers are
    # accepted even if they need more threads than are available
    for i in range(nfolders+1):
        if i*threads_per_worker > threads:
            nworkers = i-1
            break
    else:
        # every folder gets a worker; hand out the remaining threads
        nworkers = nfolders
        # include the maximum in the scan (+1): without it, the maximum is
        # accepted unchecked and spare_threads can become negative
        for i in range(min_threads_per_worker, max_threads_per_worker+1):
            if nworkers*i > threads:
                threads_per_worker = i-1
                break
        else:
            threads_per_worker = max_threads_per_worker
    spare_threads = int(threads - nworkers*threads_per_worker)
    return nworkers, threads_per_worker, spare_threads