def crossover(altGen, anz):
    # Create anz offspring; each child is built from a randomly drawn pair of
    # parents out of the old generation altGen.
    neuGen = []
    for i in range(anz):
        paar = rsample(range(len(altGen)), 2)
        kind = erstelleKind(altGen[paar[0]], altGen[paar[1]])
        neuGen.append(kind)
    return neuGen
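# Hedged usage sketch for crossover() above. erstelleKind is not defined in
# this file, so a minimal single-point recombination stand-in is assumed here;
# the real helper may differ.
from random import randint, sample as rsample

def erstelleKind(elternA, elternB):
    # Assumed stand-in: splice the two parent sequences at a random cut point.
    schnitt = randint(1, len(elternA) - 1)
    return elternA[:schnitt] + elternB[schnitt:]

altGen = [[0, 1, 1, 0], [1, 1, 0, 0], [0, 0, 1, 1]]
neuGen = crossover(altGen, anz=5)  # five offspring from random parent pairs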
def create_batch(self):
    if len(self.experience_buffer) >= self.hyperparameters["EXP_BUFFER_SIZE"]:
        experiences = rsample(self.experience_buffer,
                              self.hyperparameters["BATCH_SIZE"])
        return experiences
    return None
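# Hedged sketch of how create_batch() above might be wired into an agent
# class; the Agent class and the hyperparameter values are assumptions made
# for illustration only.
from random import sample as rsample

class Agent:
    def __init__(self):
        self.hyperparameters = {"EXP_BUFFER_SIZE": 4, "BATCH_SIZE": 2}
        self.experience_buffer = [(s, s + 1) for s in range(5)]

Agent.create_batch = create_batch  # attach the function above as a method

print(Agent().create_batch())  # a list of BATCH_SIZE random experiences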
def experience_replay(batch_size):
    memory = []
    while True:
        experience = (
            yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        )
        memory.append(experience)
def generate_ratings(num_types, num_users, ratings_per_user=20, num_items=100,
                     alpha=None, noise=-1, plsi=False):
    p = Poisson(ratings_per_user)
    ratings = [[rint(1, 5) for i in range(num_items)] for i in range(num_types)]
    if alpha is None:
        alpha = [1] * num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
        else:
            type_dist = dirichlet(alpha)
        type_dists.append(type_dist)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                rating.append(rint(1, 5))
            else:
                type = sample(type_dist)
                rating.append(ratings[type][j])
            indices.append(j)
        user_ratings.append(rating)
        user_indices.append(indices)
    user_ratings = user_indices, user_ratings
    return user_ratings, ratings, type_dists
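# Hedged usage sketch for generate_ratings() above; it depends on helpers
# (Poisson, normalize, dirichlet, sample, rand, rint) defined elsewhere in
# this codebase, so only the call and the nested return shape are shown here.
(user_indices, user_ratings), ratings, type_dists = generate_ratings(
    num_types=3, num_users=10, ratings_per_user=15, num_items=50)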
def main():
    parser = argparse.ArgumentParser(
        description="kmeans algorithm (default values are in parentheses)")
    parser.add_argument('f', metavar="filename", action="store",
                        help="filename of datapoints to be clustered")
    parser.add_argument('-k', action="store", metavar='num clusters', type=int,
                        default=2,
                        help="number of clusters to split data into (2)")
    parser.add_argument('-m', action="store", metavar='metric',
                        default='euclidean',
                        help='distance metric used for clustering (euclidean)')
    parser.add_argument('-w', action="store", metavar='write directory',
                        help='filename for directory to write cluster labels '
                             'and centers (False)')
    parser.add_argument('-i', action="store", metavar='iterations', default=1,
                        type=int, help='number of times to run clustering (1)')
    parser.add_argument('-e', action="store", metavar='empty action',
                        default="singleton",
                        help="action to perform when empty cluster arises "
                             "(singleton--OTHER OPTIONS CURRENTLY UNSUPPORTED)")
    parser.add_argument('-t', action="store_true", default=False,
                        help='flag to use fuzzy clustering (Off)')
    parser.add_argument('-q', action="store_true", default=False,
                        help='flag to plot output (Off)')
    args = parser.parse_args()
    if args.f == None:
        print "no points supplied: using default random points"
        points = np.array(zip([rsample([1, 10], 1)[0] for i in range(15)],
                              [randint(10, 20) for i in range(15)]))
    else:
        print "loading points from", args.f
        points = get_points(args.f)
        print "points successfully loaded [ shape:", np.shape(points), "]"
    if args.t:
        type = "fuzzy"
    else:
        type = "normal"
    cluster = repeated_kmeans(args.k, points, init="points", metric=args.m,
                              type=type, iterations=args.i)
    if args.q:
        print "suppressing plot of output"
    else:
        cluster.plot()
    if args.w != None:
        labels_filename = args.w + '/labels'
        print "writing cluster labels to file:", labels_filename
        cluster.write_labels(labels_filename)
        centers_filename = args.w + '/centers'
        print "writing centers to file: ", centers_filename
        cluster.write_centers(centers_filename)
    return points, cluster
def read(self, fsnum: int = 0, fsoffset: int = 0,
         random_addition: bool = True, random_read: bool = True):
    """
    Load files from the given directory 'self.datadir'.

    - If 'fsnum' is 0, all files will be loaded.
    - If 'fsoffset' is 0, all files will be loaded.

    Parameters
    ----------
    fsnum : int, optional
        Number of files to load. The default is 0.
    fsoffset : int, optional
        Offset from the first file at which to start counting files.
        The default is 0.
    random_addition : bool, optional
        Read additional random files from self.datadir if 'fsnum' exceeds
        the number of files remaining after the offset. The default is True.
    random_read : bool, optional
        Read random files instead of reading from the ordered list of files.
        The default is True.

    Returns
    -------
    None.
    """
    if fsnum != 0:
        self.fsnum = fsnum
    if fsoffset != 0:
        self.fsoffset = fsoffset
    datapaths = list_files(
        self.datadir, self.vendor, self.fsoffset, self.fsnum)
    if random_read or (len(datapaths) < self.fsnum and random_addition):
        _datapaths = list_files(self.datadir, self.vendor, 0, 0)
        indxs = np.arange(len(_datapaths))
        rnd_indxs = rsample(
            indxs[indxs != self.fsoffset].tolist(), self.fsnum - 1)
        datapaths = [_datapaths[i] for i in rnd_indxs + [self.fsoffset]]
    # Check if file was processed
    files_processed = [k for k, v in self.rawdata.items() if v is not None]
    files_to_remove = [k for k in self.rawdata if k not in datapaths]
    for path in files_to_remove:
        del self.rawdata[path]
    for path in datapaths:
        if path not in files_processed:
            self.rawdata[path] = None
    if self.memo_file:
        self.rawdata.update(load(self.memo_file, compression='lzma',
                                 set_default_extension=False))
    self.filesnum = len(self.rawdata) if not fsnum else fsnum
def experience_replay(batch_size):
    """
    Coroutine of experience replay.

    Send in an experience and yield a random batch of experiences.
    """
    mem = []
    while True:
        exp = yield rsample(mem, batch_size) if batch_size <= len(mem) else None
        mem.append(exp)
def mutation(Gen, abc, anteil=0.3):
    """
    Mutates the individuals of the generation I:Gen using the alphabet
    I:abc, each with a probability of I:anteil.
    """
    auswahl = rsample(range(len(Gen)), int(len(Gen) * anteil))
    for el in auswahl:
        Gen[el] = mutWort(Gen[el], abc)
    return Gen
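# Hedged usage sketch for mutation() above. mutWort is not defined in this
# file, so a stand-in that rewrites one random position with a random symbol
# from the alphabet is assumed here; the real helper may differ.
from random import choice, randrange, sample as rsample

def mutWort(wort, abc):
    # Assumed stand-in: replace one random position with a random symbol.
    wort = list(wort)
    wort[randrange(len(wort))] = choice(abc)
    return wort

Gen = [[0, 1, 0], [1, 1, 1], [0, 0, 1], [1, 0, 0]]
Gen = mutation(Gen, abc=[0, 1], anteil=0.5)  # mutate roughly half the individuals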
def experience_replay(batch_size):
    """
    Coroutine of experience replay.

    Provide a new experience by calling send, which in turn yields a
    random batch of previous replay experiences.
    """
    memory = []
    while True:
        experience = yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        memory.append(experience)
def experience_replay(batch_size):
    """
    Coroutine function for implementing experience replay.

    Provides a new experience by calling "send", which in turn yields a
    random batch of previous replay experiences.
    """
    memory = []
    while True:
        # experience is a tuple containing (S, action, reward, S_prime)
        experience = yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        memory.append(experience)
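# Hedged usage sketch for the experience_replay coroutine above: the generator
# has to be primed with next() before .send() can be used. The experience
# tuples below are made-up placeholders.
from random import sample as rsample

replay = experience_replay(batch_size=2)
next(replay)  # prime the coroutine; the first yield produces None
for step in range(4):
    batch = replay.send((step, 0, 1.0, step + 1))  # (S, action, reward, S_prime)
    # batch stays None until the memory holds at least batch_size experiences
    print(batch)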
def homogenize(keymap, tolerance=1):
    '''
    Given a map keymap from ranks to sets of keys assigned to that rank,
    continually take a subset of the largest rank assignment and assign the
    subset to the smallest assignment until the difference in numbers of keys
    between the largest and smallest assignment is no larger than tolerance.

    If the same value is associated with more than one key, the value will be
    randomly assigned to one of the associated keys before homogenization is
    performed.
    '''
    if tolerance < 1:
        raise ValueError('Nonuniformity tolerance must be at least 1')

    # Copy the keymap as is
    keymap = {k: set(v) for k, v in keymap.items()}

    # Invert the key map in random order to scramble lists of duplicates
    invkmap = defaultdict(list)
    for k in rsample(list(keymap.keys()), len(keymap)):
        for vv in keymap[k]:
            invkmap[vv].append(k)

    # Remove duplicates
    for v, k in invkmap.items():
        # List of keys is already randomly ordered
        for key in k[1:]:
            try:
                keymap[key].remove(v)
            except KeyError:
                pass

    # Count the assignment for key k
    count = lambda k: len(keymap[k])

    while True:
        # Find the largest and smallest assignments
        largest = max(keymap, key=count, default=None)
        smallest = min(keymap, key=count, default=None)
        if largest is None or smallest is None:
            break

        lgct = count(largest)
        smct = count(smallest)

        # Terminate if the nonuniformity is close enough
        if lgct - smct <= tolerance:
            break

        # Reassign half the difference to equalize
        while count(largest) > count(smallest) + tolerance:
            keymap[smallest].add(keymap[largest].pop())

    return keymap
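# Hedged usage sketch for homogenize() above; the rank-to-keys assignment is a
# made-up example. The function needs defaultdict and rsample in scope.
from collections import defaultdict
from random import sample as rsample

keymap = {0: {'a', 'b', 'c', 'd', 'e'}, 1: {'f'}, 2: {'c', 'g'}}
balanced = homogenize(keymap, tolerance=1)
# The duplicate key 'c' is kept on only one rank, and the per-rank set sizes
# now differ by at most the tolerance.
print({rank: sorted(keys) for rank, keys in balanced.items()})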
def pairwise_distance_nuc_ambig(basepairs, random=None):
    """Calculates pairwise distances between two sequences
       strict = only use ATGC if True,
       choose a random heterozygous base if False.
    """
    total = 0
    diff = 0
    if random == 'random2':
        valid_characters = MLIB.validchars['dna+ambig2']
    elif random == 'random3':
        valid_characters = MLIB.validchars['dna+ambig3']
    else:
        return pairwise_distance_strict(basepairs)
    for pairbases, paircount in basepairs.items():
        if any(x not in valid_characters for x in pairbases):
            continue
        base0 = (pairbases[0] if pairbases[0] in 'ATGC'
                 else rsample(MLIB.splitbases[pairbases[0]], 1)[0])
        base1 = (pairbases[1] if pairbases[1] in 'ATGC'
                 else rsample(MLIB.splitbases[pairbases[1]], 1)[0])
        diff += int(base0 != base1) * paircount
        total += paircount
    return diff, total
def erstelleSeq(Aeins, Anull):
    """
    Creates a sequence of length I:Aeins made of ones and zeros, which are
    distributed randomly over the sequence. The number of zeros is a random
    number between 1 and I:Anull.
    """
    if Aeins < Anull:
        print "!!!FALSCHE ANORDNUNG DER WORTE!!!"  # "WRONG ARRANGEMENT OF THE WORDS"
    indexe = rsample(range(Aeins), rrandint(1, Anull))
    seq = []
    for i in range(Aeins):
        seq.append(1)
    for el in indexe:
        seq[el] = 0
    return seq
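# Hedged usage sketch for erstelleSeq() above; rrandint is assumed to be
# random.randint imported under another name, matching how rsample is used.
from random import randint as rrandint, sample as rsample

seq = erstelleSeq(Aeins=10, Anull=3)
# A list of ten entries, mostly ones, with between one and three zeros
# scattered at random positions.
print(seq)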
def cluster(self, points, max_iter=np.inf, verbose=False):
    """
    n: number of points
    m: dimension of each point
    """
    self.points = points
    self.n, self.m = np.shape(self.points)
    iterations = 0
    if self.init == "points":
        self.centers = rsample(points, self.k)
        self.labels = self.assign()
    elif self.init == "random":
        if self.type == "normal":
            self.labels = np.array([np.random.randint(0, self.k)
                                    for point in self.points])
        elif self.type == "fuzzy":
            labels = []
            for point in self.points:
                cluster = np.zeros(self.k)
                cluster[np.random.randint(0, self.k)] += 1
                labels.append(np.array(cluster))
            self.labels = np.array(labels)
        self.centers = self.update()
    while iterations <= max_iter:
        if iterations % 10 == 0:
            if verbose:
                print iterations  #, self.centers
        new_labels = self.assign()
        if all(new_labels == self.labels) or \
                (np.all(abs(new_labels - self.labels) < self.TOLERANCE)):
            if verbose:
                print "converged in", iterations, "iterations"
            break
        else:
            iterations += 1
            self.labels = new_labels
            self.centers = self.update()
    self.total_distance = self.get_total_distance()
def generate_ratings(num_types, num_users, ratings_per_user=20, num_items=100,
                     alpha=None, noise=-1, plsi=False):
    p = Poisson(ratings_per_user)
    ratings = [[rint(1, 5) for i in range(num_items)] for i in range(num_types)]
    if alpha is None:
        alpha = [1] * num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
        else:
            type_dist = dirichlet(alpha)
        type_dists.append(type_dist)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                rating.append(rint(1, 5))
            else:
                type = sample(type_dist)
                rating.append(ratings[type][j])
            indices.append(j)
        user_ratings.append(rating)
        user_indices.append(indices)
    user_ratings = user_indices, user_ratings
    return user_ratings, ratings, type_dists
def sample(self, n):
    return rsample(self.cards, k=n)
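# Hedged sketch of a minimal container exposing the sample() method above; the
# Deck class and the card labels are assumptions made for illustration only.
from random import sample as rsample

class Deck:
    def __init__(self, cards):
        self.cards = list(cards)

    def sample(self, n):
        return rsample(self.cards, k=n)

hand = Deck(['AS', 'KD', '7C', '2H', '9S']).sample(n=2)  # two distinct cards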
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=0.001, beta=0.01, noise=-1, plsi=False):
    """Generates documents according to plsi or lda

    Args:
        num_topics: the number of underlying latent topics
        num_docs: the number of documents to generate
        words_per_doc: parameter to a Poisson distribution; determines the
            average number of words in a document
        vocab_size: the number of words in the vocabulary

        DIRICHLET PARAMETERS
        ---------------------
        Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)
        ---------------------
        alpha: parameter to dirichlet distribution for topics
        beta: parameter to dirichlet distribution for words

        noise: given as a probability; each word will be replaced with a
            random word with noise probability
        plsi: flag to determine which distribution to draw from, a random
            distribution or a sample from a dirichlet distribution

    Returns:
        docs: the list of documents, each a list of words (represented by
            their indices in range(vocab_size))
        word_dist: the distribution over words for each topic; each row is
            the distribution for a different topic
        topics_dist: the distribution over topics for each document; each
            row is the distribution for a different document
    """
    p = Poisson(words_per_doc)
    alpha = [alpha] * num_topics
    # beta parameterizes each topic's distribution over words, so it has one
    # entry per vocabulary word
    beta = [beta] * vocab_size
    if plsi:
        word_dist = [normalize([rand() for w in range(vocab_size)])
                     for t in range(num_topics)]
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))
    topic_cdfs = []
    docs = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        words_per_doc = p.sample()
        doc = []
        if plsi:
            topic_dist = normalize([rand() for t in range(num_topics)])
        else:
            topic_dist = dirichlet(alpha)
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        for word in range(words_per_doc):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1))
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
        docs.append(doc)
        doc_index += 1
    return docs, word_dist, topic_dists
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=None, beta=None, noise=-1, plsi=False, ctm=False,
                  pareto=False):
    """Generates documents according to plsi, ctm, or lda

    Args:
        num_topics: the number of underlying latent topics
        num_docs: the number of documents to generate
        words_per_doc: parameter to a Poisson distribution; determines the
            average number of words in a document
        vocab_size: the number of words in the vocabulary

        DISTRIBUTION PARAMETERS
        ---------------------
        depending on which model, alpha and beta are parameters to
        different distributions

        LDA: Assumes symmetric dirichlet distributions (ie all elements in
        the parameter vector have the same value)
            alpha: parameter to dirichlet distribution for topics
            beta: parameter to dirichlet distribution for words

        PLSI:
            alpha: parameter to poisson distribution to determine the number
                of topics per document (each topic will have uniform
                probability; all other topics will have probability 0)
            beta: as alpha, but the poisson distribution instead controls the
                number of words per topic (each word will have uniform
                probability; all other words will have probability 0)
        ---------------------

        noise: given as a probability; each word will be replaced with a
            random word with noise probability
        plsi: flag to draw distributions according to plsi (ie random
            distributions)
        ctm: flag to draw distributions according to ctm (ie a multivariate
            gaussian distribution)
        pareto: flag to make the dirichlet distribution pareto (ie for the
            dirichlet parameter, set each alpha_i = alpha / i)

    Returns:
        docs: the list of documents, each a list of words (represented by
            their indices in range(vocab_size))
        topics: a list of documents, each a list of topics (represented by
            their indices in range(num_topics))
        word_dist: the distribution over words for each topic; each row is
            the distribution for a different topic
        topics_dist: the distribution over topics for each document; each
            row is the distribution for a different document
    """
    #@TODO: integrate ctm parameters (ie mu and sigma) into alpha and beta
    mu = np.zeros(num_topics)
    sigma = np.ones((num_topics, num_topics))
    if plsi and ctm:
        print "plsi and ctm flags cannot both be active (returning None)"
        return None
    if not plsi and not ctm:
        if pareto:
            alpha = [alpha / i for i in range(1, num_topics + 1)]
            beta = [np.sqrt(beta / i) for i in range(1, vocab_size + 1)]
            #beta = [beta / i for i in range(1, vocab_size + 1)]
        else:
            alpha = [alpha] * num_topics
            beta = [beta] * vocab_size
    if plsi or ctm:
        sig_words = [rsample(range(vocab_size), util.poisson(beta, vocab_size))
                     for t in range(num_topics)]
        word_dist = [np.zeros(vocab_size) for t in range(num_topics)]
        for i in range(num_topics):
            word_dist[i][sig_words[i]] = 1.0 / len(sig_words[i])
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))
    topic_cdfs = []
    docs = []
    topics = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        if plsi:
            sig_topics = rsample(range(num_topics),
                                 util.poisson(alpha, num_topics))
            topic_dist = np.zeros(num_topics)
            topic_dist[sig_topics] = 1.0 / len(sig_topics)
        elif ctm:
            eta = N(mu, sigma)
            topic_dist = np.exp(eta) / np.sum(np.exp(eta))
        else:
            topic_dist = dirichlet(alpha)
        num_words = util.poisson(words_per_doc)
        doc = []
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        doc_topics = []
        for word in range(num_words):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1))
                doc_topics.append(-1)
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
                doc_topics.append(topic)
        docs.append(doc)
        topics.append(doc_topics)
        doc_index += 1
    return docs, topics, word_dist, topic_dists
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=0.001, beta=0.01, noise=-1, plsi=False):
    """Generates documents according to plsi or lda

    Args:
        num_topics: the number of underlying latent topics
        num_docs: the number of documents to generate
        words_per_doc: parameter to a Poisson distribution; determines the
            average number of words in a document
        vocab_size: the number of words in the vocabulary

        DIRICHLET PARAMETERS
        ---------------------
        Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)
        ---------------------
        alpha: parameter to dirichlet distribution for topics
        beta: parameter to dirichlet distribution for words

        noise: given as a probability; each word will be replaced with a
            random word with noise probability
        plsi: flag to determine which distribution to draw from, a random
            distribution or a sample from a dirichlet distribution

    Returns:
        docs: the list of documents, each a list of words (represented by
            their indices in range(vocab_size))
        word_dist: the distribution over words for each topic; each row is
            the distribution for a different topic
        topics_dist: the distribution over topics for each document; each
            row is the distribution for a different document
    """
    p = Poisson(words_per_doc)
    alpha = [alpha] * num_topics
    # beta parameterizes each topic's distribution over words, so it has one
    # entry per vocabulary word
    beta = [beta] * vocab_size
    if plsi:
        word_dist = [normalize([rand() for w in range(vocab_size)])
                     for t in range(num_topics)]
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))
    topic_cdfs = []
    docs = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        words_per_doc = p.sample()
        doc = []
        if plsi:
            topic_dist = normalize([rand() for t in range(num_topics)])
        else:
            topic_dist = dirichlet(alpha)
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        for word in range(words_per_doc):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1))
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
        docs.append(doc)
        doc_index += 1
    return docs, word_dist, topic_dists
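# Hedged usage sketch for generate_docs() above; it depends on helpers
# (Poisson, normalize, dirichlet, get_cdf, sample, rand) defined elsewhere in
# this codebase, so only the call and return shapes are illustrated here.
docs, word_dist, topic_dists = generate_docs(num_topics=3, num_docs=10,
                                             words_per_doc=20, vocab_size=50,
                                             plsi=True, noise=0.05)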