Example #1
def crossover(altGen, anz):
    neuGen = []
    for i in range(anz):
        paar = rsample(range(len(altGen)), 2)
        kind = erstelleKind(altGen[paar[0]], altGen[paar[1]])
        neuGen.append(kind)
    return neuGen
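The crossover above draws two distinct parent indices per child with rsample; presumably rsample is the standard-library random.sample imported under an alias, and erstelleKind is a project-specific recombination helper. A minimal self-contained sketch of the same pairing pattern, with a hypothetical stand-in child function:

from random import sample as rsample

def make_child(parent_a, parent_b):
    # Stand-in for erstelleKind: first half of one parent, second half of the other.
    cut = len(parent_a) // 2
    return parent_a[:cut] + parent_b[cut:]

def crossover_sketch(old_gen, count):
    new_gen = []
    for _ in range(count):
        # sample() guarantees two *distinct* parents per pairing
        i, j = rsample(range(len(old_gen)), 2)
        new_gen.append(make_child(old_gen[i], old_gen[j]))
    return new_gen

print(crossover_sketch([[0, 0, 0, 0], [1, 1, 1, 1], [1, 0, 1, 0]], 5))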
Example #2
 def create_batch(self):
     if len(self.experience_buffer) >= self.hyperparameters["EXP_BUFFER_SIZE"]:
         experiences = rsample(self.experience_buffer,
                               self.hyperparameters["BATCH_SIZE"])
         return experiences
     return None
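create_batch only samples once the buffer has reached EXP_BUFFER_SIZE, so training never starts on an underfilled buffer. A rough self-contained equivalent, assuming rsample is random.sample (the hyperparameter names and sizes below are made up for illustration):

from random import sample as rsample

EXP_BUFFER_SIZE = 8   # illustrative values only
BATCH_SIZE = 4

def create_batch(experience_buffer):
    # Return a random minibatch once the buffer is full, otherwise None.
    if len(experience_buffer) >= EXP_BUFFER_SIZE:
        return rsample(experience_buffer, BATCH_SIZE)
    return None

buffer = [("state", i) for i in range(10)]
print(create_batch(buffer))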
Example #3
def experience_replay(batch_size):
    memory = []
    while True:
        experience = (
            yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        )
        memory.append(experience)
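Because this is a generator-based coroutine, it has to be primed with next() before experiences can be sent in; each send() stores the experience and yields either None (while the memory is still smaller than batch_size) or a random batch. A short usage sketch, assuming rsample is random.sample:

from random import sample as rsample

def experience_replay(batch_size):
    # Same coroutine as above, repeated here so the sketch runs on its own.
    memory = []
    while True:
        experience = yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        memory.append(experience)

replay = experience_replay(batch_size=3)
next(replay)                        # prime the coroutine up to its first yield
for step in range(5):
    batch = replay.send(("state", step, "reward"))
    print(batch)                    # None for the first two sends, then batches of 3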
Example #4
def generate_ratings(num_types, num_users, ratings_per_user=20, num_items=100,
                     alpha=None, noise=-1, plsi=False):
    p = Poisson(ratings_per_user)
    ratings = [[rint(1,5) for i in range(num_items)] for i in range(num_types)]
    if alpha == None:
        alpha = [1]*num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
        else:
            type_dist = dirichlet(alpha)
        type_dists.append(type_dist)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                rating.append(rint(1,5))
            else:
                type = sample(type_dist)
                rating.append(ratings[type][j])
            indices.append(j)
        user_ratings.append(rating)
        user_indices.append(indices)
    user_ratings = user_indices, user_ratings
    
    return user_ratings, ratings, type_dists
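Note that ratings_per_user is redrawn from a Poisson distribution for every user, and random.sample raises ValueError when the requested sample size exceeds the population, so a draw larger than num_items would fail here. A hedged sketch of just the index-sampling step with that guard; Poisson, rint, rand and the other helpers above are project-specific, and numpy's Poisson generator is used as a stand-in:

from random import sample as rsample
import numpy as np

num_items = 100
rng = np.random.default_rng(0)

ratings_per_user = int(rng.poisson(20))            # stand-in for p.sample()
count = min(ratings_per_user, num_items)           # guard against oversized draws
item_indices = rsample(range(num_items), count)    # distinct items for this user
print(sorted(item_indices)[:10])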
Example #5
def main():
    parser = argparse.ArgumentParser(description="kmeans algorithm \
                                     (default values are in parentheses)")
    parser.add_argument('f', metavar="filename", action="store",
                        help="filename of datapoints to be clustered")
    parser.add_argument('-k', action="store", metavar='num clusters', 
                        type=int, default=2, 
                        help="number of clusters to split data into (2)")
    parser.add_argument('-m', action="store", metavar='metric', 
                        default='euclidean', 
                        help='distance metric used for clustering (euclidean)')
    parser.add_argument('-w', action="store", metavar='write directory',
                        help='filename for directory to write cluster labels \
                        and centers (False)')
    parser.add_argument('-i', action="store", metavar='iterations',
                        default=1, type=int,
                        help='number of times to run clustering (1)')
    parser.add_argument('-e', action="store", metavar='empty action',
                        default="singleton",
                        help="action to perform when empty cluster arises \
                        (singleton--OTHER OPTIONS CURRENTLY UNSUPPORTED)")
    parser.add_argument('-t', action="store_true", default=False, 
                        help='flag to use fuzzy clustering (Off)')
    parser.add_argument('-q', action="store_true", default=False,
                        help='flag to plot output (Off)')
    
    args = parser.parse_args()
    
    if args.f == None:
        print "no points supplied: using default random points"
        points = np.array(zip([rsample([1,10], 1)[0] for i in range(15)], 
                              [randint(10, 20) for i in range(15)]))
    else:
        print "loading points from", args.f
        points = get_points(args.f)
        print "points succesfully loaded [ shape:", np.shape(points), "]"
    if args.t:
        type = "fuzzy"
    else:
        type = "normal"
    
    cluster = repeated_kmeans(args.k, points, init="points", metric=args.m, 
                              type=type, iterations=args.i)
    
    if args.q:
        print "suppressing plot of output"
    else:
        cluster.plot()
    
    if args.w != None:
        labels_filename = args.w + '/labels'
        print "writing cluster labels to file:", labels_filename
        cluster.write_labels(labels_filename)
        centers_filename = args.w + '/centers'
        print "writing centers to file: ", centers_filename
        cluster.write_centers(centers_filename)
    
    return points, cluster
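The default-points branch uses rsample([1, 10], 1)[0], i.e. it picks either 1 or 10 for each x-coordinate (random.choice would express a single draw more directly). The original is Python 2 code; under Python 3 the zip call would also need to be materialised before handing it to np.array. A small sketch of that branch, assuming rsample is random.sample:

from random import sample as rsample, randint
import numpy as np

# 15 points whose x is either 1 or 10 and whose y is uniform in [10, 20]
points = np.array(list(zip([rsample([1, 10], 1)[0] for _ in range(15)],
                           [randint(10, 20) for _ in range(15)])))
print(points.shape)   # (15, 2)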
Example #6
    def read(self, fsnum: int = 0, fsoffset: int = 0,
             random_addition: bool = True, random_read: bool = True):
        """
        Load files from the given directory 'self.datadir'.

        - If 'fsnum' is 0, all files will be loaded.
        - If 'fsoffset' is 0, loading starts from the first file.

        Parameters
        ----------
        fsnum : int, optional
            Number of files to load. The default is 0.
        fsoffset : int, optional
            Offset from the first file to start counting of files.
            The default is 0.
        random_addition : bool, optional
            Read random files from self.datadir if 'fsnum' exceeds the number
            of files remaining after the offset. The default is True.
        random_read : bool, optional
            Read random files, not from the ordered list of files.
            The default is True.

        Returns
        -------
        None.

        """
        if fsnum != 0:
            self.fsnum = fsnum
        if fsoffset != 0:
            self.fsoffset = fsoffset

        datapaths = list_files(
            self.datadir, self.vendor, self.fsoffset, self.fsnum)

        if random_read or (len(datapaths) < self.fsnum and random_addition):
            _datapaths = list_files(self.datadir, self.vendor, 0, 0)
            indxs = np.arange(len(_datapaths))
            rnd_indxs = rsample(
                indxs[indxs != self.fsoffset].tolist(), self.fsnum - 1)
            datapaths = [_datapaths[i] for i in rnd_indxs + [self.fsoffset]]

        # Check if file was processed
        files_processed = [k for k, v in self.rawdata.items() if v is not None]
        files_to_remove = [k for k in self.rawdata if k not in datapaths]

        for path in files_to_remove:
            del self.rawdata[path]

        for path in datapaths:
            if path not in files_processed:
                self.rawdata[path] = None

        if self.memo_file:
            self.rawdata.update(load(self.memo_file, compression='lzma',
                                     set_default_extension=False))

        self.filesnum = len(self.rawdata) if not fsnum else fsnum
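The random branch keeps the file at fsoffset and fills the remaining fsnum - 1 slots with indices sampled from everything else, so the offset file is always part of the selection. A self-contained sketch of just that index logic, assuming rsample is random.sample; the file names are faked, and list_files plus the class attributes are project-specific:

from random import sample as rsample
import numpy as np

datapaths = ["file_%02d.bin" % i for i in range(10)]   # stand-in for list_files(...)
fsoffset, fsnum = 3, 4

indxs = np.arange(len(datapaths))
rnd_indxs = rsample(indxs[indxs != fsoffset].tolist(), fsnum - 1)
chosen = [datapaths[i] for i in rnd_indxs + [fsoffset]]
print(chosen)   # fsnum paths, always including datapaths[fsoffset]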
Example #7
def experience_replay(batch_size):
    """
    Coroutine of experience replay
    Send in an experience and yield random batch experiences
    """
    mem = []
    while True:
        exp = yield rsample(mem, batch_size) if batch_size <= len(mem) else None
        mem.append(exp)
Example #8
def mutation(Gen,abc,anteil=0.3):
    """
    Fuehrt eine Mutation der Individuen der Generation I:Gen aus dem Alphabet
    I:abc mit einer Wahrscheinlichkeit von I:anteil durch
    """
    auswahl=rsample(range(len(Gen)),int(len(Gen)*anteil))
    for el in auswahl:
        Gen[el]=mutWort(Gen[el],abc)
    return Gen
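mutation draws int(len(Gen) * anteil) distinct positions, so roughly a fixed fraction of individuals is mutated rather than each individual mutating independently with probability anteil. A minimal sketch, assuming rsample is random.sample and with a hypothetical stand-in for mutWort that flips one random symbol:

from random import sample as rsample, randrange, choice

def mut_word(word, alphabet):
    # Stand-in for mutWort: replace one random position with a random symbol.
    word = list(word)
    word[randrange(len(word))] = choice(alphabet)
    return word

def mutation_sketch(gen, alphabet, fraction=0.3):
    for idx in rsample(range(len(gen)), int(len(gen) * fraction)):
        gen[idx] = mut_word(gen[idx], alphabet)
    return gen

population = [[0, 1, 0, 1] for _ in range(10)]
print(mutation_sketch(population, [0, 1]))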
Example #9
def experience_replay(batch_size):
    """
    Coroutine of experience replay
    Send in an experience and yield random batch experiences
    """
    mem = []
    while True:
        exp = yield rsample(mem, batch_size) if batch_size <= len(mem) else None
        mem.append(exp)
Example #10
def experience_replay(batch_size):
    """
    Coroutine of experience replay.
    
    Provide a new experience by calling send, which in turn yields 
    a random batch of previous replay experiences.
    """
    memory = []
    while True:
        experience = yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        memory.append(experience)
Example #11
def experience_replay(batch_size):
    """
    Coroutine function for implementing experience replay.    
        Provides a new experience by calling "send", which in turn yields 
        a random batch of previous replay experiences.
    """
    memory = []
    while True:
        # experience is a tuple containing (S, action, reward, S_prime)
        experience = yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        memory.append(experience)
Example #12
def experience_replay(batch_size):
    """
    Coroutine function for implementing experience replay.    
        Provides a new experience by calling "send", which in turn yields 
        a random batch of previous replay experiences.
    """
    memory = []
    while True:
        # experience is a tuple containing (S, action, reward, S_prime)
        experience = yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        memory.append(experience)
Example #13
def experience_replay(batch_size):
    """
    Coroutine of experience replay.
    
    Provide a new experience by calling send, which in turn yields 
    a random batch of previous replay experiences.
    """
    memory = []
    while True:
        experience = yield rsample(memory, batch_size) if batch_size <= len(memory) else None
        memory.append(experience)
Example #14
def homogenize(keymap, tolerance=1):
    '''
    Given a map keymap from ranks to sets of keys assigned to that rank,
    continually take a subset of the largest rank assignment and assign the
    subset to the smallest assignment until the difference in numbers of
    keys between the largest and smallest assignment is no larger than
    tolerance.

    If the same value is associated with more than one key, the value will
    be randomly assigned to one of the associated keys before
    homogenization is performed.
    '''
    if tolerance < 1:
        raise ValueError('Nonuniformity tolerance must be at least 1')

    # Copy the keymap as is
    keymap = {k: set(v) for k, v in keymap.items()}

    # Invert the key map in random order to scramble lists of duplicates
    invkmap = defaultdict(list)
    for k in rsample(list(keymap.keys()), len(keymap)):
        for vv in keymap[k]:
            invkmap[vv].append(k)

    # Remove duplicates
    for v, k in invkmap.items():
        # List of keys is already randomly ordered
        for key in k[1:]:
            try:
                keymap[key].remove(v)
            except KeyError:
                pass

    # Count the assignment for key k
    count = lambda k: len(keymap[k])
    while True:
        # Find the largest and smallest assignments
        largest = max(keymap, key=count, default=None)
        smallest = min(keymap, key=count, default=None)

        if largest is None or smallest is None: break

        lgct = count(largest)
        smct = count(smallest)

        # Terminate if the nonuniformity is close enough
        if lgct - smct <= tolerance: break

        # Reassign half the difference to equalize
        while count(largest) > count(smallest) + tolerance:
            keymap[smallest].add(keymap[largest].pop())

    return keymap
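Here rsample(list(keymap.keys()), len(keymap)) is just a way to iterate over the keys in a random order (random.shuffle on a copied list would do the same), which decides at random which key keeps a duplicated value. A small sketch of that duplicate-removal idea on its own, assuming rsample is random.sample:

from collections import defaultdict
from random import sample as rsample

keymap = {0: {"a", "b"}, 1: {"b", "c"}, 2: {"c", "d"}}
keymap = {k: set(v) for k, v in keymap.items()}

# Invert the map in a random key order so duplicates land on a random owner.
invkmap = defaultdict(list)
for k in rsample(list(keymap.keys()), len(keymap)):
    for v in keymap[k]:
        invkmap[v].append(k)

# Keep each value only under the first (randomly chosen) key.
for v, keys in invkmap.items():
    for key in keys[1:]:
        keymap[key].discard(v)

print(keymap)   # every value now appears under exactly one key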
Example #15
def pairwise_distance_nuc_ambig(basepairs, random=None):
    """Calculates pairwise distances between two sequences
        strict = only use ATGC if True,
        choose random heterozygous base if False.
    """
    total = 0
    diff = 0
    if random == 'random2':
        valid_characters = MLIB.validchars['dna+ambig2']
    elif random == 'random3':
        valid_characters = MLIB.validchars['dna+ambig3']
    else:
        return pairwise_distance_strict(basepairs)
    for pairbases, paircount in basepairs.items():
        if any(x not in valid_characters for x in pairbases):
            continue
        base0 = (pairbases[0] if pairbases[0] in 'ATGC' else rsample(
            MLIB.splitbases[pairbases[0]], 1)[0])
        base1 = (pairbases[1] if pairbases[1] in 'ATGC' else rsample(
            MLIB.splitbases[pairbases[1]], 1)[0])
        diff += int(base0 != base1) * paircount
        total += paircount
    return diff, total
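For an ambiguous base, rsample(..., 1)[0] picks one concrete nucleotide from its set of possibilities; random.choice would express a single draw more directly. A sketch of that resolution step, assuming rsample is random.sample, with a tiny hand-rolled table standing in for MLIB.splitbases:

from random import sample as rsample

# Minimal stand-in for MLIB.splitbases: IUPAC code -> possible nucleotides
splitbases = {"R": "AG", "Y": "CT", "W": "AT", "S": "CG"}

def resolve(base):
    # Keep unambiguous bases, randomly resolve ambiguity codes.
    return base if base in "ATGC" else rsample(splitbases[base], 1)[0]

print(resolve("A"), resolve("R"), resolve("Y"))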
Example #16
def erstelleSeq(Aeins, Anull):
    """
    Erstellt eine Sequenz der Laenge I:Aeins aus Einsen und Nullen,
    welche zufaellig auf die Sequenz verteilt werden. Dabei ist die
    Anzahl der Nullen eine zufaellige Zahl zwischen 1 und I:Anull
    """
    if Aeins < Anull:
        print "!!!FALSCHE ANORDNUNG DER WORTE!!!"
    indexe = rsample(range(Aeins), rrandint(1, Anull))

    seq = []
    for i in range(Aeins):
        seq.append(1)

    for el in indexe:
        seq[el] = 0

    return seq
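In other words: a sequence of Aeins ones in which between 1 and Anull randomly chosen positions are set to zero, with random.sample guaranteeing the zero positions are distinct. A compact equivalent, assuming rsample and rrandint are random.sample and random.randint:

from random import sample as rsample, randint as rrandint

def make_seq(n_ones, max_zeros):
    seq = [1] * n_ones
    for idx in rsample(range(n_ones), rrandint(1, max_zeros)):
        seq[idx] = 0
    return seq

print(make_seq(10, 3))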
Example #17
 def cluster(self, points, max_iter=np.inf, verbose=False):
     """
     n: number of points
     m: dimension of each point
     """
     self.points = points
     self.n, self.m = np.shape(self.points)
     iterations = 0
     
     if self.init == "points":
         self.centers = rsample(points, self.k)
         self.labels = self.assign()
     elif self.init == "random":
         if self.type == "normal":
             self.labels = np.array([np.random.randint(0, self.k)
                                     for point in self.points])
         elif self.type == "fuzzy":
             labels = []
             for point in self.points:
                 cluster = np.zeros(self.k)
                 cluster[np.random.randint(0, self.k)] += 1
                 labels.append(np.array(cluster))
             self.labels = np.array(labels)
     self.centers = self.update()
     while iterations <= max_iter:
         if iterations % 10 == 0:
             if verbose:
                 print iterations#, self.centers
         new_labels = self.assign()
         if all(new_labels == self.labels) or (np.all(abs(new_labels - 
                                                          self.labels) 
                                               < self.TOLERANCE)):
             if verbose:
                 print "converged in", iterations, "iterations"
             break
         else:
             iterations += 1
             self.labels = new_labels
             self.centers = self.update()
     self.total_distance = self.get_total_distance()
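With init="points" the initial centers are drawn directly with rsample(points, self.k). If points is a NumPy array, recent Python versions may reject this because random.sample now insists on a Sequence, so sampling row indices is the safer pattern; a hedged sketch, assuming rsample is random.sample:

from random import sample as rsample
import numpy as np

points = np.random.rand(20, 2)
k = 3

# Sample indices (always valid), then look the rows up explicitly.
center_idx = rsample(range(len(points)), k)
centers = points[center_idx]
print(centers.shape)   # (3, 2)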
Example #18
def generate_ratings(num_types,
                     num_users,
                     ratings_per_user=20,
                     num_items=100,
                     alpha=None,
                     noise=-1,
                     plsi=False):
    p = Poisson(ratings_per_user)
    ratings = [[rint(1, 5) for i in range(num_items)]
               for i in range(num_types)]
    if alpha == None:
        alpha = [1] * num_types
    user_ratings = []
    user_indices = []
    type_dists = []
    for i in range(num_users):
        ratings_per_user = p.sample()
        if plsi:
            type_dist = normalize([rand() for t in range(num_types)])
        else:
            type_dist = dirichlet(alpha)
        type_dists.append(type_dist)
        rating = []
        indices = []
        for j in rsample(range(num_items), ratings_per_user):
            if rand() < noise:
                rating.append(rint(1, 5))
            else:
                type = sample(type_dist)
                rating.append(ratings[type][j])
            indices.append(j)
        user_ratings.append(rating)
        user_indices.append(indices)
    user_ratings = user_indices, user_ratings

    return user_ratings, ratings, type_dists
Example #19
 def sample(self, n):
     return rsample(self.cards, k=n)
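A quick usage sketch of the same idea, with a plain list of cards standing in for self.cards and rsample assumed to be random.sample; since sampling is without replacement, a dealt hand never repeats a card:

from random import sample as rsample

class Deck:
    def __init__(self):
        ranks = "23456789TJQKA"
        suits = "CDHS"
        self.cards = [r + s for r in ranks for s in suits]

    def sample(self, n):
        return rsample(self.cards, k=n)

print(Deck().sample(5))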
Example #20
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=0.001, beta=0.01, noise=-1, plsi=False):
    """Generates documents according to plsi or lda
    
    Args:
        num_topics: 
            the number of underlying latent topics
        num_docs: 
            the number of documents to generate
        words_per_doc: 
            parameter to a Poisson distribution;
            determines the average number of words in a document
        vocab_size: 
            the number of words in the vocabulary
        DIRICHLET PARAMETERS
        ---------------------
        Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)
        ---------------------
        alpha: 
            parameter to dirichlet distribution for topics
        beta: 
            parameter to dirichlet distribution for words
        noise: 
            given as a probability; each word will be replaced with a random
            word with noise probability
        plsi:
            flag to determine which distribution to draw from,
            a random distribution or a sample from a dirichlet distribution
            
    Returns:
        docs:
            the list of documents, each a list of words (represented by their
            indices in range(vocab_size))
        word_dist:
            the distribution over words for each topic; 
            each row is the distribution for a different topic 
        topics_dist:
            the distribution over topics for each document;
            each row is the distribution for a different document
    """
    p = Poisson(words_per_doc)
    
    alpha = [alpha] * num_topics
    beta = [beta] * num_topics

    if plsi:
        word_dist = [normalize([rand() for w in range(vocab_size)])
                     for t in range(num_topics)]
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))
    
    topic_cdfs = []
    docs = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        words_per_doc = p.sample()
        doc = []
        if plsi:
            topic_dist = normalize([rand() for t in range(num_topics)])
        else:
            topic_dist = dirichlet(alpha)
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        for word in range(words_per_doc):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1)[0])  # append the index, not the one-element list
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
        docs.append(doc)
        doc_index += 1
    return docs, word_dist, topic_dists
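get_cdf and sample are project helpers: the first presumably turns a probability vector into a cumulative distribution, the second draws an index from it (e.g. via bisect), while the noise branch falls back to a uniformly random word. A self-contained sketch of that word-drawing loop under those assumptions, with rsample taken to be random.sample:

import bisect
from random import random as rand, sample as rsample

def get_cdf(dist):
    # Cumulative distribution of a probability vector.
    cdf, total = [], 0.0
    for p in dist:
        total += p
        cdf.append(total)
    return cdf

def sample(cdf):
    # Inverse-CDF sampling: first bucket whose cumulative mass exceeds a uniform draw.
    return bisect.bisect_left(cdf, rand())

vocab_size, noise = 6, 0.1
word_dist = [0.5, 0.2, 0.1, 0.1, 0.05, 0.05]
word_cdf = get_cdf(word_dist)

doc = []
for _ in range(10):
    if rand() < noise:
        doc.append(rsample(range(vocab_size), 1)[0])   # random noise word
    else:
        doc.append(sample(word_cdf))
print(doc)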
Example #21
def generate_docs(num_topics, num_docs, words_per_doc=50, vocab_size=30,
                  alpha=None, beta=None, noise=-1, plsi=False, ctm=False, 
                  pareto=False):
    """Generates documents according to plsi, ctm, or lda
    
    Args:
        num_topics: 
            the number of underlying latent topics
        num_docs: 
            the number of documents to generate
        words_per_doc: 
            parameter to a Poisson distribution;
            determines the average words in a document
        vocab_size: 
            the number of words in the vocabulary
        DISTRIBUTION PARAMETERS
        ---------------------
        depending on which model, alpha and beta are parameters to different
        distributions
        
        LDA: Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)
            alpha: 
                parameter to dirichlet distribution for topics
            beta: 
                parameter to dirichlet distribution for words
            
        PLSI:
            alpha:
                parameter to poisson distribution to determine the number of
                topics per document (each topic will have uniform
                probability; all other topics will have probability 0)
            beta:
                as alpha, but poisson distribution instead controls the number
                of words per topic (each word will have uniform
                probability; all other words will have probability 0)
        ---------------------
        noise: 
            given as a probability; each word will be replaced with a random
            word with noise probability
        plsi:
            flag to draw distributions according to plsi (ie random 
            distributions)
        ctm:
            flag to draw distributions according to ctm (ie a multivariate
            gaussian distribution) 
        pareto:
            flag to make dirichlet distribution pareto (ie for the dirichlet
            parameter, set each alpha_i = alpha / alpha_i)
            
    Returns:
        docs:
            the list of documents, each a list of words (represented by their
            indices in range(vocab_size))
        topics:
            a list of documents, each a list of topics (represented by their
            indices in range(num_topics))
        word_dist:
            the distribution over words for each topic; 
            each row is the distribution for a different topic 
        topics_dist:
            the distribution over topics for each document;
            each row is the distribution for a different document
    """
    #@TODO: integrate ctm parameters (ie mu and sigma) into alpha and beta
    mu = np.zeros(num_topics)
    sigma = np.ones((num_topics, num_topics))
    
    if plsi and ctm:
        print "plsi and ctm flags cannot both be active (returning None)"
        return None
    
    if not plsi and not ctm:
        if pareto:
            alpha = [alpha / i for i in range(1, num_topics + 1)]
            beta = [np.sqrt(beta / i) for i in range(1, vocab_size + 1)]
            #beta = [beta / i for i in range(1, vocab_size + 1)]
        else:
            alpha = [alpha] * num_topics
            beta = [beta] * vocab_size

    if plsi or ctm:
        sig_words = [rsample(range(vocab_size), util.poisson(beta, vocab_size))\
                     for t in range(num_topics)]
        word_dist = [np.zeros(vocab_size) for t in range(num_topics)]
        for i in range(num_topics):
            word_dist[i][sig_words[i]] = 1.0 / len(sig_words[i])
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))
    
    topic_cdfs = []
    docs = []
    topics = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        if plsi:
            sig_topics = rsample(range(num_topics), 
                                 util.poisson(alpha, num_topics))
            topic_dist = np.zeros(num_topics)
            topic_dist[sig_topics] = 1.0 / len(sig_topics)
        elif ctm:
            eta = N(mu, sigma)
            topic_dist = np.exp(eta) / np.sum(np.exp(eta))
        else:
            topic_dist = dirichlet(alpha)
        num_words = util.poisson(words_per_doc)
        doc = []
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        doc_topics = []
        for word in range(num_words):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1)[0])  # append the index, not the one-element list
                doc_topics.append(-1)
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
                doc_topics.append(topic)
        docs.append(doc)
        topics.append(doc_topics)
        doc_index += 1
    return docs, topics, word_dist, topic_dists
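In the ctm branch the topic distribution is a logistic-normal draw: N(mu, sigma) samples a Gaussian vector eta and the softmax np.exp(eta) / np.sum(np.exp(eta)) turns it into probabilities. A small numpy-only sketch of that step; N is assumed to be a multivariate normal sampler, and an identity covariance is used here instead of the all-ones sigma above, which would not be a proper covariance matrix:

import numpy as np

num_topics = 5
mu = np.zeros(num_topics)
sigma = np.eye(num_topics)          # identity covariance for the sketch

eta = np.random.multivariate_normal(mu, sigma)   # stand-in for N(mu, sigma)
topic_dist = np.exp(eta) / np.sum(np.exp(eta))   # softmax -> valid distribution

print(topic_dist, topic_dist.sum())              # sums to 1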
Example #22
def generate_docs(num_topics,
                  num_docs,
                  words_per_doc=50,
                  vocab_size=30,
                  alpha=0.001,
                  beta=0.01,
                  noise=-1,
                  plsi=False):
    """Generates documents according to plsi or lda
    
    Args:
        num_topics: 
            the number of underlying latent topics
        num_docs: 
            the number of documents to generate
        words_per_doc: 
            parameter to a Poisson distribution;
            determines the average number of words in a document
        vocab_size: 
            the number of words in the vocabulary
        DIRICHLET PARAMETERS
        ---------------------
        Assumes symmetric dirichlet distributions (ie all elements in the
        parameter vector have the same value)
        ---------------------
        alpha: 
            parameter to dirichlet distribution for topics
        beta: 
            parameter to dirichlet distribution for words
        noise: 
            given as a probability; each word will be replaced with a random
            word with noise probability
        plsi:
            flag to determine which distribution to draw from,
            a random distribution or a sample from a dirichlet distribution
            
    Returns:
        docs:
            the list of documents, each a list of words (represented by their
            indices in range(vocab_size))
        word_dist:
            the distribution over words for each topic; 
            each row is the distribution for a different topic 
        topics_dist:
            the distribution over topics for each document;
            each row is the distribution for a different document
    """
    p = Poisson(words_per_doc)

    alpha = [alpha] * num_topics
    beta = [beta] * num_topics

    if plsi:
        word_dist = [
            normalize([rand() for w in range(vocab_size)])
            for t in range(num_topics)
        ]
    else:
        word_dist = [dirichlet(beta) for i in range(num_topics)]
    word_cdfs = []
    for topic in word_dist:
        word_cdfs.append(get_cdf(topic))

    topic_cdfs = []
    docs = []
    topic_dists = []
    doc_index = 0
    for i in range(num_docs):
        if doc_index % 100 == 0:
            print "reached document", doc_index
        words_per_doc = p.sample()
        doc = []
        if plsi:
            topic_dist = normalize([rand() for t in range(num_topics)])
        else:
            topic_dist = dirichlet(alpha)
        topic_dists.append(topic_dist)
        topic_cdf = get_cdf(topic_dist)
        topic_cdfs.append(topic_cdf)
        for word in range(words_per_doc):
            if rand() < noise:
                doc.append(rsample(range(vocab_size), 1)[0])  # append the index, not the one-element list
            else:
                topic = sample(topic_cdf)
                doc.append(sample(word_cdfs[topic]))
        docs.append(doc)
        doc_index += 1
    return docs, word_dist, topic_dists