def __init__(self, genedir, seqfiles, maxfails, maxgaps, minoverlap, logger, wd=os.getcwd()):
    """Load per-gene FASTA files into the store.

    genedir -- directory containing the FASTA files
    seqfiles -- FASTA filenames (one per gene) inside genedir
    maxfails -- max allowed consecutive fails per sequence
    maxgaps -- max gap proportion (used downstream)
    minoverlap -- min sequence overlap (used downstream)
    logger -- logger instance
    wd -- working directory (default evaluated at import time)
    """
    self.wd = wd
    self.logger = logger
    self.threads = getThreads(wd)
    self.maxfails = maxfails  # minimum number of fails in a row
    self.dspp = []  # species dropped
    self.nseqs = 0  # counter for seqs
    self.blast_prop = 0.5  # the p sequences a sequence must overlap
    self.maxgaps = maxgaps
    self.minoverlap = minoverlap
    for seqfile in seqfiles:  # enumerate index was unused
        name = re.sub(r'\.fasta$', '', seqfile)
        seqdir = os.path.join(genedir, seqfile)
        seqs = []
        lengths = []
        # 'rU' mode was removed in Python 3.11; plain 'r' gives the same
        # universal-newline behaviour on Python 3
        with open(seqdir, 'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                record.id = name
                lengths.append(len(record))
                seqs.append([record, 0])  # seqrecord + nfails
                self.nseqs += 1
        if len(seqs) > 0:
            self[name] = [seqs, np.min(lengths)]
def __init__(
    self,
    seqstore,
    maxgaps,
    minoverlap,
    minseedsize,
    maxseedsize,
    maxtrys,
    maxseedtrys,
    gene_type,
    outgroup,
    logger,
    wd=os.getcwd(),
):
    """Set up alignment state over the sequences in seqstore.

    seqstore -- sequence store to align from
    maxgaps -- max gap proportion tolerated
    minoverlap -- min sequence overlap
    minseedsize -- smallest allowed seed alignment size
    maxseedsize -- largest allowed seed alignment size
    maxtrys -- max alignment attempts
    maxseedtrys -- max trys per seedsize
    gene_type -- gene type label
    outgroup -- outgroup identifier
    logger -- logger instance
    wd -- working directory (default evaluated at import time)
    """
    self.wd = wd
    self.logger = logger
    self.threads = getThreads(wd=wd)
    self.seqstore = seqstore
    self.maxgaps = maxgaps
    self.minoverlap = minoverlap
    self.minseedsize = minseedsize
    # was hard-coded to 2, silently discarding the caller's argument
    # (debug leftover); honour the parameter
    self.maxtrys = maxtrys  # trys for alignment attempts
    self.buffer = maxseedtrys  # trys for a seedsize
    self.buffer_counter = 0  # seedsize buffer counter
    self.seedsize = len(seqstore)
    # NOTE(review): maxseedsize is accepted but never stored — presumably
    # seedsize should be capped by it; confirm against callers
    self.timeout = 99999999
    self.talign = False
    self.tadd = False
    self.silent = False
    self.total_trys = 0  # counter for total number of trys
    self.type = gene_type
    self.outgroup = outgroup
def __init__(self, seqstore, maxgaps, minoverlap, minseedsize, maxseedsize,
             maxtrys, maxseedtrys, gene_type, outgroup, logger,
             wd=os.getcwd()):
    """Set up alignment state over the sequences in seqstore.

    seqstore -- sequence store to align from
    maxgaps -- max gap proportion tolerated
    minoverlap -- min sequence overlap
    minseedsize -- smallest allowed seed alignment size
    maxseedsize -- largest allowed seed alignment size
    maxtrys -- max alignment attempts
    maxseedtrys -- max trys per seedsize
    gene_type -- gene type label
    outgroup -- outgroup identifier
    logger -- logger instance
    wd -- working directory (default evaluated at import time)
    """
    self.wd = wd
    self.logger = logger
    self.threads = getThreads(wd=wd)
    self.seqstore = seqstore
    self.maxgaps = maxgaps
    self.minoverlap = minoverlap
    self.minseedsize = minseedsize
    # was hard-coded to 2, silently discarding the caller's argument
    # (debug leftover); honour the parameter
    self.maxtrys = maxtrys  # trys for alignment attempts
    self.buffer = maxseedtrys  # trys for a seedsize
    self.buffer_counter = 0  # seedsize buffer counter
    self.seedsize = len(seqstore)
    # NOTE(review): maxseedsize is accepted but never stored — presumably
    # seedsize should be capped by it; confirm against callers
    self.timeout = 99999999
    self.talign = False
    self.tadd = False
    self.silent = False
    self.total_trys = 0  # counter for total number of trys
    self.type = gene_type
    self.outgroup = outgroup
def getClusters(gene_sequences, minoverlap, logger, wd):
    """Identify clusters in sequences"""
    def _extract_cluster(remaining):
        # blast every sequence against one picked at random
        seqs = [entry[1] for entry in remaining]
        pick = random.randint(0, len(seqs) - 1)
        bools, _ = atools.blast(seqs, seqs[pick], minoverlap, logger, wd,
                                threads)
        hits = [remaining[j] for j, flag in enumerate(bools) if flag]
        # how many species had sequences in the cluster?
        nspp = len({entry[0] for entry in hits})
        # accept only if the cluster spans >50% of species and >5 species
        if float(nspp) / tot_nspp > 0.5 and nspp > 5:
            # return cluster and drop its sequences from the remainder
            leftover = [remaining[j] for j, flag in enumerate(bools)
                        if not flag]
            return hits, leftover
        return None, remaining

    threads = getThreads(wd=wd)
    res = []
    tot_nspp = len({entry[0] for entry in gene_sequences})
    # try at most 5 randomly-seeded extractions
    for _ in range(5):
        cluster, gene_sequences = _extract_cluster(gene_sequences)
        if cluster:
            res.append(cluster)
        # stop once too few species' sequences are left
        if float(len({entry[0] for entry in gene_sequences})) / tot_nspp < 0.5:
            break
    return res
def __init__(self, alignment_store, rttstat, outdir, maxtrys, logger,
             wd=os.getcwd()):
    """Hold state for building phylogenies from stored alignments.

    alignment_store -- mapping of gene name to alignments
    rttstat -- root-to-tip statistic threshold
    outdir -- output directory
    maxtrys -- max phylogeny-building attempts
    logger -- logger instance
    wd -- working directory (default evaluated at import time)
    """
    self.logger = logger
    self.wd = wd
    # one extra thread: RAxML runs a small master process
    self.threads = getThreads(wd=wd) + 1
    self.trys = 0
    self.phylogenies = []
    self.maxtrys = maxtrys
    self.alignment_store = alignment_store
    self.genes = alignment_store.keys()
    self.rttstat = rttstat
    self.outdir = outdir
    self.taxontree = os.path.join(outdir, "taxontree.tre")
    # use the taxonomic tree as a constraint if it already exists
    self.constraint = os.path.isfile(self.taxontree)
def __init__(self, alignment_store, rttstat, outdir, maxtrys, logger,
             wd=os.getcwd()):
    """Hold state for building phylogenies from stored alignments.

    alignment_store -- mapping of gene name to alignments
    rttstat -- root-to-tip statistic threshold
    outdir -- output directory
    maxtrys -- max phylogeny-building attempts
    logger -- logger instance
    wd -- working directory (default evaluated at import time)
    """
    self.logger = logger
    self.wd = wd
    # removed dead no-op 'self.threads = self.threads'
    self.threads = getThreads(wd=wd)
    self.trys = self._getTrys(outdir)  # resume try count from a previous run
    self.phylogenies = []
    self.maxtrys = maxtrys
    self.alignment_store = alignment_store
    self.genes = alignment_store.keys()
    self.rttstat = rttstat
    self.outdir = outdir
    # NOTE(review): taxontree is read from the fixed '1_names' folder rather
    # than outdir (unlike the sibling class) — presumably a pipeline stage
    # directory; confirm this is intended
    self.taxontree = os.path.join('1_names', "taxontree.tre")
    self.constraint = os.path.isfile(self.taxontree)
def __init__(self, gene_names, nseqs, thoroughness, maxpn, votesize,
             maxtrys, minoverlap, maxlen, minlen, logger, wd=os.getcwd()):
    """Hold configuration for sequence downloading/filtering.

    gene_names -- names of target genes
    nseqs -- number of sequences wanted
    thoroughness -- maximum search thoroughness level
    maxpn -- max proportion threshold
    votesize -- number of votes used in filtering
    maxtrys -- max download attempts
    minoverlap -- min sequence overlap
    maxlen -- max sequence length
    minlen -- min sequence length
    logger -- logger instance
    wd -- working directory (default evaluated at import time)
    """
    self.wd = wd
    self.logger = logger
    self.threads = getThreads(wd=wd)
    # search targets and limits
    self.gene_names = gene_names
    self.nseqs = nseqs
    self.max_thoroughness = thoroughness
    self.maxtrys = maxtrys
    # filtering parameters
    self.maxpn = maxpn
    self.votesize = votesize
    self.minoverlap = minoverlap
    self.maxlen = maxlen
    self.minlen = minlen
    # mutable search state
    self.thoroughness = 1  # start at the lowest thoroughness level
    self.deja_vues = []  # records already seen
    self.pattern = re.compile("[ACTGactg]")  # unambiguous DNA bases
def calcWorkers(threads, nfolders, min_threads_per_worker=2,
                max_threads_per_worker=100):
    """Calculate the number of workers for parallel running of folders.

    threads -- requested thread budget (-1 means use all available)
    nfolders -- number of folders to run
    min_threads_per_worker -- lower bound on threads per worker
    max_threads_per_worker -- upper bound on threads per worker
    Returns (nworkers, threads_per_worker, spare_threads).
    """
    # get available threads on machine
    available_threads = getThreads()
    if available_threads:
        # make sure threads arg is not greater than those available
        if threads > available_threads:
            sys.exit('More threads specified than avaiable on machine')
        if threads == -1:
            threads = available_threads
    # make sure threads is absolute
    threads = abs(threads)
    # clamp per-worker bounds to the total budget
    if min_threads_per_worker > threads:
        min_threads_per_worker = threads
    if max_threads_per_worker > threads:
        max_threads_per_worker = threads
    # calc nworkers and threads_per_worker;
    # increase workers before threads_per_worker
    threads_per_worker = min_threads_per_worker
    # largest worker count (capped at nfolders) whose combined threads fit
    # the budget; the original loop never tested nfolders workers, so the
    # budget could be exceeded (off-by-one)
    for i in range(1, nfolders + 1):
        if float(i) * threads_per_worker > threads:
            nworkers = i - 1
            break
    else:
        nworkers = nfolders
    # grow threads_per_worker up to max_threads_per_worker while staying in
    # budget; the original range() excluded the max yet the else-branch
    # assigned it, which could overshoot and yield negative spare_threads
    for i in range(min_threads_per_worker, max_threads_per_worker + 1):
        if float(nworkers) * i > threads:
            threads_per_worker = i - 1
            break
    else:
        threads_per_worker = max_threads_per_worker
    spare_threads = int(threads - float(nworkers) * threads_per_worker)
    return nworkers, threads_per_worker, spare_threads
def __init__(self, genedir, seqfiles, maxfails, maxgaps, minoverlap, logger, wd=os.getcwd()):
    """Load per-gene FASTA files into the store.

    genedir -- directory containing the FASTA files
    seqfiles -- FASTA filenames (one per gene) inside genedir
    maxfails -- max allowed consecutive fails per sequence
    maxgaps -- max gap proportion (used downstream)
    minoverlap -- min sequence overlap (used downstream)
    logger -- logger instance
    wd -- working directory (default evaluated at import time)
    """
    self.wd = wd
    self.logger = logger
    self.threads = getThreads(wd)
    self.maxfails = maxfails  # minimum number of fails in a row
    self.dspp = []  # species dropped
    self.nseqs = 0  # counter for seqs
    self.blast_prop = 0.5  # the p sequences a sequence must overlap
    self.maxgaps = maxgaps
    self.minoverlap = minoverlap
    for seqfile in seqfiles:  # enumerate index was unused
        name = re.sub(r"\.fasta$", "", seqfile)
        seqdir = os.path.join(genedir, seqfile)
        seqs = []
        lengths = []
        # 'rU' mode was removed in Python 3.11; plain 'r' gives the same
        # universal-newline behaviour on Python 3
        with open(seqdir, "r") as infile:
            for record in SeqIO.parse(infile, "fasta"):
                record.id = name
                lengths.append(len(record))
                seqs.append([record, 0])  # seqrecord + nfails
                self.nseqs += 1
        if len(seqs) > 0:
            self[name] = [seqs, np.min(lengths)]
def calcWorkers(threads, nfolders, min_threads_per_worker=2,
                max_threads_per_worker=100):
    """Calculate the number of workers for parallel running of folders.

    threads -- requested thread budget (-1 means use all available)
    nfolders -- number of folders to run
    min_threads_per_worker -- lower bound on threads per worker
    max_threads_per_worker -- upper bound on threads per worker
    Returns (nworkers, threads_per_worker, spare_threads).
    """
    # get available threads on machine
    available_threads = getThreads()
    if available_threads:
        # make sure threads arg is not greater than those available
        if threads > available_threads:
            sys.exit('More threads specified than avaiable on machine')
        if threads == -1:
            threads = available_threads
    # make sure threads is absolute
    threads = abs(threads)
    # clamp per-worker bounds to the total budget
    if min_threads_per_worker > threads:
        min_threads_per_worker = threads
    if max_threads_per_worker > threads:
        max_threads_per_worker = threads
    # calc nworkers and threads_per_worker;
    # increase workers before threads_per_worker
    threads_per_worker = min_threads_per_worker
    # largest worker count (capped at nfolders) whose combined threads fit
    # the budget; the original loop never tested nfolders workers, so the
    # budget could be exceeded (off-by-one)
    for i in range(1, nfolders+1):
        if float(i)*threads_per_worker > threads:
            nworkers = i-1
            break
    else:
        nworkers = nfolders
    # grow threads_per_worker up to max_threads_per_worker while staying in
    # budget; the original range() excluded the max yet the else-branch
    # assigned it, which could overshoot and yield negative spare_threads
    for i in range(min_threads_per_worker, max_threads_per_worker+1):
        if float(nworkers)*i > threads:
            threads_per_worker = i-1
            break
    else:
        threads_per_worker = max_threads_per_worker
    spare_threads = int(threads - (float(nworkers)*threads_per_worker))
    return nworkers, threads_per_worker, spare_threads