def estimate(predictions_rgb, predictions_pcloud, weights=(0.6, 0.4)):
    """Pick the most frequent object id per matcher and for their weighted mix.

    Each prediction list is turned into a table of occurrence counts. The
    two tables are scaled by their weights and added together (an id that
    is missing from one table counts as 0 there). The id with the largest
    value is then read from the combined table and from each single table.

    Args:
        predictions_rgb: list of predictions of the RGB matcher
        predictions_pcloud: list of predictions of the point cloud matcher
        weights: tuple ``(w_rgb, w_pcloud)`` with the relative weights of
            the matchers. Default=(0.6, 0.4)

    Return:
        tuple with (id_of_weighted_sum, id_rgb, id_pcloud)

    Example:
        >>> estimate([1, 1, 1, 2, 2], [1, 0, 2, 2, 1])
        (1, 1, 1)
        >>> estimate([1, 1, 1, 2, 2], [1, 0, 2, 2, 2])
        (2, 1, 2)
        >>> estimate([1, 1, 1, 2, 2], [1, 0, 0, 0, 2])
        (1, 1, 0)
        >>> estimate([1, 1, 1, 2, 2], [2, 2, 2, 2, 2], weights=(1,0))
        (1, 1, 2)
    """
    def top_id(counts):
        # Index label of the single largest entry of the series.
        return counts.nlargest(1).index[0]

    rgb_weight, pcloud_weight = weights
    rgb_counts = pd.Series(frequencies(predictions_rgb))
    pcloud_counts = pd.Series(frequencies(predictions_pcloud))
    combined = (rgb_weight * rgb_counts).add(
        pcloud_weight * pcloud_counts, fill_value=0)
    return top_id(combined), top_id(rgb_counts), top_id(pcloud_counts)
def test_min_max():
    """Check that adaptive scaling respects ``minimum=1`` and ``maximum=2``.

    The test walks through three phases and checks the worker count and
    the ``adapt.log`` actions after each one: idle (one worker, one "up"),
    under load (two workers, two "up"), and after the work is released
    (back to one worker, one "down").

    NOTE(review): written as a generator coroutine; presumably driven by a
    test decorator that is outside this view.
    """
    loop = IOLoop.current()
    cluster = yield LocalCluster(
        0,
        scheduler_port=0,
        silence_logs=False,
        processes=False,
        dashboard_address=None,
        loop=loop,
        asynchronous=True,
    )
    yield cluster._start()
    try:
        adapt = Adaptive(
            cluster.scheduler,
            cluster,
            minimum=1,
            maximum=2,
            interval="20 ms",
            wait_count=10,
        )
        c = yield Client(cluster, asynchronous=True, loop=loop)

        # Phase 1: wait (at most ~1s) for the first worker to appear.
        start = time()
        while not cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 1

        # Give the adaptive loop time to act again; it must stay at one worker.
        yield gen.sleep(0.2)
        assert len(cluster.scheduler.workers) == 1
        assert frequencies(pluck(1, adapt.log)) == {"up": 1}

        # Phase 2: submit 100 slow tasks and wait for a second worker.
        futures = c.map(slowinc, range(100), delay=0.1)

        start = time()
        while len(cluster.scheduler.workers) < 2:
            yield gen.sleep(0.01)
            assert time() < start + 1

        # The count must not go past the maximum even while work remains.
        assert len(cluster.scheduler.workers) == 2
        yield gen.sleep(0.5)
        assert len(cluster.scheduler.workers) == 2
        assert len(cluster.workers) == 2
        assert frequencies(pluck(1, adapt.log)) == {"up": 2}

        # Phase 3: drop the futures and wait (at most ~2s) to shrink to one.
        del futures

        start = time()
        while len(cluster.scheduler.workers) != 1:
            yield gen.sleep(0.01)
            assert time() < start + 2
        assert frequencies(pluck(1, adapt.log)) == {"up": 2, "down": 1}
    finally:
        yield c.close()
        yield cluster.close()
def _make_formula_dict(self):
    """Count how many times each entry of ``self.atoms`` occurs.

    Returns a dictionary whose keys are the element symbols found in the
    atom list and whose values are the number of atoms of that element.
    """
    element_counts = frequencies(self.atoms)
    return element_counts
def merge(*exprs, **kwargs):
    """Combine positional and keyword expressions into one ``Merge``.

    A single positional argument is returned unchanged and a single keyword
    argument is returned relabelled with its keyword. Otherwise positional
    expressions are passed through ``_wrap`` under the name ``'_<position>'``
    and keyword expressions are wrapped, labelled with their keyword and
    ordered by key before the ``Merge`` is built.

    Raises:
        TypeError: if every expression has ``ndim`` 0.
        ValueError: if the merged result has repeated field names.
    """
    if len(exprs) + len(kwargs) == 1:
        # A lone argument needs no Merge node at all.
        if exprs:
            # Only a positional argument: hand it back untouched.
            return exprs[0]
        if kwargs:
            # Only a keyword argument: label it with its keyword.
            [(name, value)] = kwargs.items()
            return value.label(name)

    positional = (_wrap(expr, '_%s' % n) for n, expr in enumerate(exprs))
    keyword = (
        label(_wrap(value, name), name)
        for name, value in sorted(kwargs.items(), key=first)
    )
    exprs = tuple(concatv(positional, keyword))

    if all(ndim(expr) == 0 for expr in exprs):
        raise TypeError('cannot merge all scalar expressions')

    result = Merge(exprs, varargsexpr(exprs), maxshape(map(shape, exprs)))

    if isdistinct(result.fields):
        return result
    # Name every field that occurs more than once in the error message.
    repeated = ', '.join(
        field
        for field, count in frequencies(result.fields).items()
        if count > 1
    )
    raise ValueError("Repeated columns found: " + repeated)
def test_TaskStreamPlugin(c, s, *workers):
    """Exercise ``TaskStreamPlugin.buffer`` and ``rectangles`` on a small graph.

    Ten ``div`` tasks plus one ``sum`` over nine of them are submitted; the
    plugin buffer is expected to hold 11 entries afterwards.

    NOTE(review): written as a generator coroutine; presumably driven by a
    cluster test decorator that is outside this view.
    """
    es = TaskStreamPlugin(s)
    assert not es.buffer

    # The first call divides by range(10)[0] == 0; it is left out of the sum.
    futures = c.map(div, [1] * 10, range(10))
    total = c.submit(sum, futures[1:])
    yield wait(total)

    assert len(es.buffer) == 11

    # Rebinds the ``workers`` parameter; rectangles() is expected to fill it.
    workers = dict()

    rects = es.rectangles(0, 10, workers)
    assert workers
    assert all(n == 'div' for n in rects['name'])
    assert all(d > 0 for d in rects['duration'])
    counts = frequencies(rects['color'])
    # Exactly one rectangle is black and the other nine share one colour.
    assert counts['black'] == 1
    assert set(counts.values()) == {9, 1}
    assert len(set(rects['y'])) == 3

    # A narrower window (2, 5) must yield three entries in every column.
    rects = es.rectangles(2, 5, workers)
    assert all(len(L) == 3 for L in rects.values())

    # A start boundary between the first two starts must drop the earliest.
    starts = sorted(rects['start'])
    rects = es.rectangles(2, 5, workers=workers, start_boundary=(starts[0] + starts[1]) / 2000)
    assert set(rects['start']).issubset(set(starts[1:]))
def test_avoid_churn():
    """ We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work

    Ten short tasks are submitted with a pause after each one; the
    adaptive log must contain a single "up" action and nothing else.

    NOTE(review): written as a generator coroutine; presumably driven by a
    test decorator that is outside this view.
    """
    cluster = yield LocalCluster(
        0,
        asynchronous=True,
        processes=False,
        scheduler_port=0,
        silence_logs=False,
        dashboard_address=None,
    )
    client = yield Client(cluster, asynchronous=True)
    try:
        adapt = Adaptive(cluster.scheduler, cluster, interval="20 ms", wait_count=5)

        # Alternate 40 ms of work with 40 ms of idle time, ten times over.
        for i in range(10):
            yield client.submit(slowinc, i, delay=0.040)
            yield gen.sleep(0.040)

        # One scale-up only: the idle gaps must not have triggered a "down".
        assert frequencies(pluck(1, adapt.log)) == {"up": 1}
    finally:
        yield client.close()
        yield cluster.close()
def test_TaskStreamPlugin(c, s, *workers):
    """Exercise ``TaskStreamPlugin.buffer`` and ``rectangles`` on a small graph.

    Ten ``div`` tasks plus one ``sum`` over nine of them are submitted; the
    plugin buffer is expected to hold 11 entries afterwards.

    NOTE(review): written as a generator coroutine; presumably driven by a
    cluster test decorator that is outside this view.
    """
    es = TaskStreamPlugin(s)
    assert not es.buffer

    # The first call divides by range(10)[0] == 0; it is left out of the sum.
    futures = c.map(div, [1] * 10, range(10))
    total = c.submit(sum, futures[1:])
    yield _wait(total)

    assert len(es.buffer) == 11

    # Rebinds the ``workers`` parameter to a fresh dict passed to rectangles().
    workers = dict()

    rects = es.rectangles(0, 10, workers)
    assert all(n == 'div' for n in rects['name'])
    assert all(d > 0 for d in rects['duration'])
    counts = frequencies(rects['color'])
    # Exactly one rectangle is black and the other nine share one colour.
    assert counts['black'] == 1
    assert set(counts.values()) == {9, 1}
    assert len(set(rects['y'])) == 3

    # A narrower window (2, 5) must yield three entries in every column.
    rects = es.rectangles(2, 5, workers)
    assert all(len(L) == 3 for L in rects.values())

    # A start boundary between the first two starts must drop the earliest.
    starts = sorted(rects['start'])
    rects = es.rectangles(2, 5, workers=workers, start_boundary=(starts[0] + starts[1]) / 2000)
    assert set(rects['start']).issubset(set(starts[1:]))
def filter_columns(self, data, headers):
    """Choose which columns of a row-oriented table to keep.

    When ``self.select`` holds at least one pattern, a column is kept if
    its header matches any of the patterns (case-insensitive search) and
    the drop criteria are not consulted. Otherwise a column is dropped
    when every one of its values is contained in ``self.drop``.

    Args:
        data: iterable of rows; it is transposed with ``zip(*data)``.
        headers: column names, paired positionally with the columns.

    Returns:
        ``(rows_out, headers_out)``: the surviving columns transposed back
        into a list of row tuples, and their headers in the original order.
    """
    drop = set(self.drop)
    select_patterns = [
        re.compile(pattern, re.I) for pattern in self.select
    ]

    headers_out = []
    columns_out = []
    for header, column in zip(headers, zip(*data)):
        if select_patterns:
            # any() keeps a column once even if several patterns match it;
            # appending per matching pattern would duplicate the column.
            keep = any(
                pattern.search(header) for pattern in select_patterns
            )
        else:
            # Only the distinct values matter here, not how often they occur.
            keep = not set(column).issubset(drop)
        if keep:
            headers_out.append(header)
            columns_out.append(column)

    rows_out = list(zip(*columns_out))
    return rows_out, headers_out
def run(filename, vocab='default', vocab_ans='default'):
    """Load a question/answer file and encode it for training or testing.

    Args:
        filename: path handed to ``readFile``.
        vocab: question vocabulary to use; the keyword ``'default'`` builds
            one from the questions in this file. For test data the
            training vocabulary is passed in instead.
        vocab_ans: same as ``vocab`` but for the answers.

    Returns:
        list ``[images_frame, padded_questions, encoded_answers,
        vocabulary_que, vocabulary_ans]``.
    """
    def word_counts(column):
        # Join every entry with a space and count the space-separated tokens.
        return frequencies(' '.join(column).split(' '))

    # Read the data and split the image references off the questions.
    questions, answers = readFile(filename)
    question_texts, image_refs = seperateImages(questions)

    # One single-column frame per field, then glue them side by side.
    df1 = pd.DataFrame(question_texts, columns=['question'])
    df2 = pd.DataFrame(answers, columns=['answer'])
    df3 = pd.DataFrame(image_refs, columns=['images'])
    data = pd.concat([df1, df2, df3], axis=1)

    # Left over from interactive inspection; the result is discarded.
    data.head()

    freqs_que = word_counts(data['question'])
    freqs_ans = word_counts(data['answer'])

    # Build a vocabulary only when the caller did not supply one.
    vocabulary_que = createVocabulary(freqs_que) if vocab == 'default' else vocab
    vocabulary_ans = (
        createVocabularyAnswers(freqs_ans) if vocab_ans == 'default' else vocab_ans
    )

    # Questions become integer vectors padded to a uniform length;
    # answers become one-hot vectors.
    encoded_questions = encodeQuestions(data['question'], vocabulary_que)
    encoded_answers = encodeAnswers(data['answer'], vocabulary_ans)
    padded_questions = sequencePad(encoded_questions)

    return [
        df3, padded_questions, encoded_answers, vocabulary_que, vocabulary_ans
    ]
def _check_dsk(dsk):
    """Assert that a ``HighLevelGraph`` has well-formed, non-overlapping layers.

    Anything that is not a ``HighLevelGraph`` is accepted without checks.
    For a ``HighLevelGraph`` every layer name must be a tuple or a string,
    and no key may appear in more than one of ``dsk.dicts``.
    """
    if isinstance(dsk, HighLevelGraph):
        assert all(isinstance(name, (tuple, str)) for name in dsk.layers)
        key_counts = frequencies(concat(dsk.dicts.values()))
        # Keys seen a number of times other than once indicate overlap.
        overlapping = {
            key: count for key, count in key_counts.items() if count != 1
        }
        assert not overlapping, overlapping
def test_min_max():
    """Check that adaptive scaling respects ``minimum=1`` and ``maximum=2``.

    The test walks through three phases and checks the worker count and
    the ``adapt.log`` actions after each one: idle (one worker, one 'up'),
    under load (two workers, two 'up'), and after the work is released
    (back to one worker, one 'down').

    NOTE(review): written as a generator coroutine; presumably driven by a
    test decorator that is outside this view.
    """
    loop = IOLoop.current()
    cluster = yield LocalCluster(0, scheduler_port=0, silence_logs=False,
                                 processes=False, diagnostics_port=None,
                                 loop=loop, asynchronous=True)
    yield cluster._start()
    try:
        adapt = Adaptive(cluster.scheduler, cluster, minimum=1, maximum=2,
                         interval='20 ms', wait_count=10)
        c = yield Client(cluster, asynchronous=True, loop=loop)

        # Phase 1: wait (at most ~1s) for the first worker to appear.
        start = time()
        while not cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 1

        # Give the adaptive loop time to act again; it must stay at one worker.
        yield gen.sleep(0.2)
        assert len(cluster.scheduler.workers) == 1
        assert frequencies(pluck(1, adapt.log)) == {'up': 1}

        # Phase 2: submit 100 slow tasks and wait for a second worker.
        futures = c.map(slowinc, range(100), delay=0.1)

        start = time()
        while len(cluster.scheduler.workers) < 2:
            yield gen.sleep(0.01)
            assert time() < start + 1

        # The count must not go past the maximum even while work remains.
        assert len(cluster.scheduler.workers) == 2
        yield gen.sleep(0.5)
        assert len(cluster.scheduler.workers) == 2
        assert len(cluster.workers) == 2
        assert frequencies(pluck(1, adapt.log)) == {'up': 2}

        # Phase 3: drop the futures and wait (at most ~2s) to shrink to one.
        del futures

        start = time()
        while len(cluster.scheduler.workers) != 1:
            yield gen.sleep(0.01)
            assert time() < start + 2
        assert frequencies(pluck(1, adapt.log)) == {'up': 2, 'down': 1}
    finally:
        yield c.close()
        yield cluster.close()
def classify(filenames):
    """Fit one 4-component GMM per feature file and tabulate its predictions.

    Every file is loaded as comma-separated numbers (first row skipped), a
    ``GMM`` is fitted to each, and all rows are stacked into one array.
    Each model then scores the stacked rows; a row counts as predicted when
    its score exceeds ``log(0.5)`` and as expected when it equals some row
    of that model's own file.

    Returns:
        ``(frequencies(expected), frequencies(zip(expected, pred)))`` for
        the values left in ``expected`` and ``pred`` when the loop ends.

    NOTE(review): the original indentation is lost. As laid out here only
    the LAST model's results are returned; confirm whether ``return`` was
    meant to sit inside the loop.
    """
    langs = []
    for filename in filenames:
        with open(filename, 'rb') as f:
            langs.append(np.loadtxt(f, delimiter=',', skiprows=1))
    gmms = []
    for l in langs:
        g = GMM(n_components = 4, covariance_type='full')
        g.fit(l)
        gmms.append(g)
    all_data = np.row_stack(langs)
    for i in xrange(len(gmms)):
        g = gmms[i]
        l = langs[i]
        # True where the model's score for the row is above log(0.5).
        pred = [x > math.log(0.5) for x in g.score(all_data)]
        # True where the row is identical to some row of this file's data.
        expected = [np.any(np.equal(l,x).all(1)) for x in all_data]
    # ``t`` is presumably the toolz module alias; verify against the imports.
    return t.frequencies(expected), t.frequencies(zip(expected,pred))
def test_adapt_quickly():
    """Adaptive should scale up fast for parallel work, not for sequential work.

    A single task produces one log entry. Mapping 1000 slow tasks must then
    add exactly one more "up" entry whose ``n`` is above 2 and at most
    ``adapt.maximum``, and the cluster must reach the maximum. After the
    futures are dropped the cluster shrinks back to one worker, and a chain
    of 100 dependent tasks must not grow it again.

    NOTE(review): written as a generator coroutine; presumably driven by a
    test decorator that is outside this view.
    """
    cluster = yield LocalCluster(
        0,
        asynchronous=True,
        processes=False,
        scheduler_port=0,
        silence_logs=False,
        dashboard_address=None,
    )
    client = yield Client(cluster, asynchronous=True)
    adapt = Adaptive(cluster.scheduler, cluster, interval=20, wait_count=5, maximum=10)
    try:
        future = client.submit(slowinc, 1, delay=0.100)
        yield wait(future)
        assert len(adapt.log) == 1

        # Scale up when there is plenty of available work
        futures = client.map(slowinc, range(1000), delay=0.100)
        # Poll until the log holds something other than the single "up".
        while frequencies(pluck(1, adapt.log)) == {"up": 1}:
            yield gen.sleep(0.01)
        assert len(adapt.log) == 2
        assert "up" in adapt.log[-1]
        # The dict element of the newest log entry carries the requested size.
        d = [x for x in adapt.log[-1] if isinstance(x, dict)][0]
        assert 2 < d["n"] <= adapt.maximum

        while len(cluster.scheduler.workers) < adapt.maximum:
            yield gen.sleep(0.01)

        del futures

        while len(cluster.scheduler.workers) > 1:
            yield gen.sleep(0.01)

        # Don't scale up for large sequential computations
        x = yield client.scatter(1)
        for i in range(100):
            # Each task depends on the previous one, forming a chain.
            x = client.submit(slowinc, x)

        yield gen.sleep(0.1)
        assert len(cluster.scheduler.workers) == 1
    finally:
        yield client.close()
        yield cluster.close()
def find_corners(self) -> List[int]:
    """Return the ids of the four tiles that share exactly two edges.

    Every edge is normalised to the smaller of itself and its reverse so
    that flipped edges compare equal. Edges owned by exactly two tiles are
    treated as shared, and a tile with exactly two shared edges is a corner.

    Raises:
        ValueError: if the number of such tiles is not 4.
    """
    per_tile_edges = [
        {min(edge, edge[::-1]): tile.id for edge in tile.edges}
        for tile in self.tiles
    ]
    # Group the owning tile ids under each normalised edge.
    owners_by_edge = merge_with(lambda ids: ids, *per_tile_edges)
    shared = (
        owners for owners in owners_by_edge.values() if len(owners) == 2
    )
    # For every tile id, count how many shared edges it takes part in.
    shared_edge_counts = frequencies(concat(shared))
    corners = [
        tile_id
        for tile_id, n_shared in shared_edge_counts.items()
        if n_shared == 2
    ]
    if len(corners) != 4:
        raise ValueError("Wrong number of corners!")
    return corners
def merge(*tables):
    """Build a ``Merge`` of ``tables`` over their common sub expression.

    Raises:
        ValueError: if ``common_subexpression`` yields a falsy result, or
            if the merged result has repeated column names.
    """
    # Get common sub expression
    child = common_subexpression(*tables)
    if not child:
        raise ValueError("No common sub expression found for input tables")

    result = Merge(child, tables)
    if isdistinct(result.columns):
        return result

    # Name every column that occurs more than once in the error message.
    repeated = ', '.join(
        column
        for column, count in frequencies(result.columns).items()
        if count > 1
    )
    raise ValueError("Repeated columns found: " + repeated)
def merge(*exprs):
    """Build a ``Merge`` of ``exprs`` over their common sub expression.

    Raises:
        ValueError: if ``common_subexpression`` fails for the inputs, or if
            the merged result has repeated field names.
    """
    # Get common sub expression
    try:
        child = common_subexpression(*exprs)
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not turned into a ValueError.
        raise ValueError("No common sub expression found for input expressions")

    result = Merge(child, exprs)
    if not isdistinct(result.fields):
        # Name every field that occurs more than once in the error message.
        raise ValueError("Repeated columns found: " + ', '.join(
            k for k, v in frequencies(result.fields).items() if v > 1))
    return result
def build(self, section, texts, **kwargs):
    """Build dictionary, corpus, TF-IDF, LSI and similarity index for a section.

    Args:
        section: key into ``self.sections`` selecting the section to fill.
        texts: iterable of token lists, one per document.
        **kwargs: ``frequency`` (default 0): when positive, tokens whose
            total count over all texts is not greater than this value are
            removed first. ``num_topics`` (default 250): passed to the LSI
            model.

    The results are stored on the section object as ``dictionary``,
    ``corpus``, ``tfidf``, ``lsi`` and ``index``.
    """
    sec = self.sections[section]
    min_count = kwargs.pop('frequency', 0)
    num_topics = kwargs.pop('num_topics', 250)

    if min_count > 0:
        # Count tokens over the whole collection, then prune the rare ones.
        token_counts = frequencies(
            [token for doc in texts for token in doc])
        texts = [
            [token for token in doc if token_counts[token] > min_count]
            for doc in texts
        ]

    sec.dictionary = corpora.Dictionary(texts)
    sec.corpus = list(map(sec.dictionary.doc2bow, texts))
    sec.tfidf = models.TfidfModel(sec.corpus)
    weighted_corpus = sec.tfidf[sec.corpus]
    sec.lsi = models.LsiModel(
        weighted_corpus, id2word=sec.dictionary, num_topics=num_topics)
    sec.index = similarities.MatrixSimilarity(sec.lsi[sec.corpus])
def __init__(self, vqa, vqaRes, n=2):
    """Initialise the base evaluator and tabulate answer frequencies.

    After ``VQAEval.__init__`` has run, the ground-truth entry of every
    question id in ``self.params['question_id']`` is read from
    ``self.vqa.qa`` and the count of each answer string over all of those
    entries is stored in ``self.answer2freq``.
    """
    VQAEval.__init__(self, vqa, vqaRes, n)
    print("Initialize class normalized evaluation...")

    # Ground truth for the current answers (train, val, etc.), keyed by
    # question id.
    gts = {}
    for ques_id in list(self.params['question_id']):
        gts[ques_id] = self.vqa.qa[ques_id]

    # Consider frequencies for all answers of all questions.
    all_answers = []
    for ques_id in gts:
        for entry in gts[ques_id]['answers']:
            all_answers.append(entry['answer'])
    self.answer2freq = frequencies(all_answers)

    print("Class normalized evaluation initialized!")
def merge(*exprs, **kwargs):
    """Build a ``Merge`` of the given expressions over their common sub expression.

    Keyword arguments are labelled with their keyword and appended to the
    positional expressions before merging.

    Raises:
        ValueError: if ``common_subexpression`` fails for the inputs, or if
            the merged result has repeated field names.
    """
    # Get common sub expression
    exprs = exprs + tuple(label(v, k) for k, v in kwargs.items())
    try:
        child = common_subexpression(*exprs)
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not turned into a ValueError.
        raise ValueError(
            "No common sub expression found for input expressions")

    result = Merge(child, exprs)
    if not isdistinct(result.fields):
        # Name every field that occurs more than once in the error message.
        raise ValueError("Repeated columns found: " + ', '.join(
            k for k, v in frequencies(result.fields).items() if v > 1))
    return result
def test_worker_breaks_and_returns(c, s, a):
    """Closing a worker's batched stream must not lose finished results.

    A chain of 11 dependent ``slowinc`` tasks is computed, the comm of the
    worker's batched stream is closed, and waiting on the final future
    again must return in under a second. The scheduler must then report
    one task in memory and ten released.

    NOTE(review): written as a generator coroutine; presumably driven by a
    cluster test decorator that is outside this view.
    """
    future = c.submit(slowinc, 1, delay=0.1)
    for i in range(10):
        # Each task depends on the previous one, forming a chain.
        future = c.submit(slowinc, future, delay=0.1)

    yield _wait(future)

    # Break the connection, then give the system a moment to react.
    a.batched_stream.comm.close()

    yield gen.sleep(0.1)
    start = time()
    yield _wait(future)
    end = time()

    # The result must still be available without recomputing the chain.
    assert end - start < 1

    assert frequencies(s.task_state.values()) == {'memory': 1, 'released': 10}
def test_worker_breaks_and_returns(c, s, a):
    """Closing a worker's batched stream must not lose finished results.

    A chain of 11 dependent ``slowinc`` tasks is computed, the comm of the
    worker's batched stream is closed, and waiting on the final future
    again must return in under a second. The scheduler must then report
    one task in memory and ten released.

    NOTE(review): written as a generator coroutine; presumably driven by a
    cluster test decorator that is outside this view.
    """
    future = c.submit(slowinc, 1, delay=0.1)
    for i in range(10):
        # Each task depends on the previous one, forming a chain.
        future = c.submit(slowinc, future, delay=0.1)

    yield wait(future)

    # Break the connection, then give the system a moment to react.
    a.batched_stream.comm.close()

    yield gen.sleep(0.1)
    start = time()
    yield wait(future, timeout=10)
    end = time()

    # The result must still be available without recomputing the chain.
    assert end - start < 1

    states = frequencies(ts.state for ts in s.tasks.values())
    assert states == {'memory': 1, 'released': 10}
def merge(*exprs, **kwargs):
    """Build a ``Merge`` of the given expressions over their common sub expression.

    A single positional argument is returned unchanged and a single keyword
    argument is returned relabelled with its keyword. Otherwise keyword
    arguments are labelled, ordered by key and appended to the positional
    expressions before merging.

    Raises:
        ValueError: if the merged result has repeated field names.
    """
    if len(exprs) + len(kwargs) == 1:
        # A lone argument needs no Merge node at all.
        if exprs:
            return exprs[0]
        if kwargs:
            [(name, value)] = kwargs.items()
            return value.label(name)

    labelled = tuple(
        label(value, name)
        for name, value in sorted(kwargs.items(), key=first)
    )
    exprs = exprs + labelled

    # Get common sub expression
    child = common_subexpression(*exprs)
    result = Merge(child, exprs)

    if isdistinct(result.fields):
        return result
    # Name every field that occurs more than once in the error message.
    repeated = ', '.join(
        field
        for field, count in frequencies(result.fields).items()
        if count > 1
    )
    raise ValueError("Repeated columns found: " + repeated)
def test_worker_breaks_and_returns(c, s, a):
    """Closing a worker's batched stream must not lose finished results.

    A chain of 11 dependent ``slowinc`` tasks is computed, the comm of the
    worker's batched stream is closed, and waiting on the final future
    again must return in under a second. The scheduler must then report
    one task in memory and ten released.

    NOTE(review): written as a generator coroutine; presumably driven by a
    cluster test decorator that is outside this view.
    """
    future = c.submit(slowinc, 1, delay=0.1)
    for i in range(10):
        # Each task depends on the previous one, forming a chain.
        future = c.submit(slowinc, future, delay=0.1)

    yield wait(future)

    # Break the connection (awaited here), then give the system a moment.
    yield a.batched_stream.comm.close()

    yield gen.sleep(0.1)
    start = time()
    yield wait(future, timeout=10)
    end = time()

    # The result must still be available without recomputing the chain.
    assert end - start < 1

    states = frequencies(ts.state for ts in s.tasks.values())
    assert states == {"memory": 1, "released": 10}
def test_adapt_quickly():
    """Adaptive should scale up fast for parallel work, not for sequential work.

    A single task produces one log entry. Mapping 1000 slow tasks must then
    add exactly one more 'up' entry whose ``n`` is above 2 and at most
    ``adapt.maximum``, and the cluster must reach the maximum. After the
    futures are dropped the cluster shrinks back to one worker, and a chain
    of 100 dependent tasks must not grow it again.

    NOTE(review): written as a generator coroutine; presumably driven by a
    test decorator that is outside this view.
    """
    cluster = yield LocalCluster(0, asynchronous=True, processes=False,
                                 scheduler_port=0, silence_logs=False,
                                 diagnostics_port=None)
    client = yield Client(cluster, asynchronous=True)
    adapt = Adaptive(cluster.scheduler, cluster, interval=20, wait_count=5,
                     maximum=10)
    try:
        future = client.submit(slowinc, 1, delay=0.100)
        yield wait(future)
        assert len(adapt.log) == 1

        # Scale up when there is plenty of available work
        futures = client.map(slowinc, range(1000), delay=0.100)
        # Poll until the log holds something other than the single 'up'.
        while frequencies(pluck(1, adapt.log)) == {'up': 1}:
            yield gen.sleep(0.01)
        assert len(adapt.log) == 2
        assert 'up' in adapt.log[-1]
        # The dict element of the newest log entry carries the requested size.
        d = [x for x in adapt.log[-1] if isinstance(x, dict)][0]
        assert 2 < d['n'] <= adapt.maximum

        while len(cluster.scheduler.workers) < adapt.maximum:
            yield gen.sleep(0.01)

        del futures

        while len(cluster.scheduler.workers) > 1:
            yield gen.sleep(0.01)

        # Don't scale up for large sequential computations
        x = yield client.scatter(1)
        for i in range(100):
            # Each task depends on the previous one, forming a chain.
            x = client.submit(slowinc, x)

        yield gen.sleep(0.1)
        assert len(cluster.scheduler.workers) == 1
    finally:
        yield client.close()
        yield cluster.close()
def decide_worker(dependencies, stacks, who_has, restrictions, key):
    """ Pick the worker that should run the task ``key``

    Candidates are the workers that hold any dependency of the task, or
    every worker in ``stacks`` if no dependency is held anywhere. Among
    the candidates the one with the shortest stack wins.

    >>> dependencies = {'c': {'b'}, 'b': {'a'}}
    >>> stacks = {('alice', 8000): ['z'], ('bob', 8000): []}
    >>> who_has = {'a': {('alice', 8000)}}
    >>> restrictions = {}

    We choose the worker that has the data on which 'b' depends (alice has 'a')

    >>> decide_worker(dependencies, stacks, who_has, restrictions, 'b')
    ('alice', 8000)

    If both Alice and Bob have dependencies then we choose the less-busy worker

    >>> who_has = {'a': {('alice', 8000), ('bob', 8000)}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, 'b')
    ('bob', 8000)

    Optionally provide restrictions of where jobs are allowed to occur

    >>> restrictions = {'b': {'alice', 'charile'}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, 'b')
    ('alice', 8000)
    """
    holders = (w for dep in dependencies[key] for w in who_has[dep])
    # Fall back to all known workers when nobody holds a dependency.
    candidates = frequencies(holders) or stacks

    if key in restrictions:
        allowed = restrictions[key]
        # Restrictions are matched against the first element of the worker id.
        candidates = {w for w in candidates if w[0] in allowed}  # TODO: nonlinear
        if not candidates:
            candidates = {w for w in stacks if w[0] in allowed}
        if not candidates:
            raise ValueError("Task has no valid workers", key, allowed)

    if not candidates:
        raise ValueError("No workers found")

    def stack_length(w):
        return len(stacks[w])

    return min(candidates, key=stack_length)
def merge(*exprs, **kwargs):
    """Build a ``Merge`` of the given expressions over their common sub expression.

    A single positional argument is returned unchanged and a single keyword
    argument is returned relabelled with its keyword. Otherwise keyword
    arguments are labelled, ordered by key and appended to the positional
    expressions before merging.

    Raises:
        ValueError: if the merged result has repeated field names.
    """
    if len(exprs) + len(kwargs) == 1:
        # A lone argument needs no Merge node at all.
        if exprs:
            return exprs[0]
        if kwargs:
            [(name, value)] = kwargs.items()
            return value.label(name)

    labelled = tuple(
        label(value, name)
        for name, value in sorted(kwargs.items(), key=first)
    )
    exprs = exprs + labelled

    # Get common sub expression
    child = common_subexpression(*exprs)
    result = Merge(child, exprs)

    if isdistinct(result.fields):
        return result
    # Name every field that occurs more than once in the error message.
    repeated = ', '.join(
        field
        for field, count in frequencies(result.fields).items()
        if count > 1
    )
    raise ValueError("Repeated columns found: " + repeated)
def merge(*exprs, **kwargs):
    """Build a ``Merge`` of the given expressions over their common sub expression.

    A single positional argument is returned unchanged and a single keyword
    argument is returned relabelled with its keyword. Otherwise keyword
    arguments are labelled and appended to the positional expressions
    before merging.

    Raises:
        ValueError: if ``common_subexpression`` fails for the inputs, or if
            the merged result has repeated field names.
    """
    if len(exprs) + len(kwargs) == 1:
        # A lone argument needs no Merge node at all.
        if exprs:
            return exprs[0]
        if kwargs:
            [(k, v)] = kwargs.items()
            return v.label(k)

    # Get common sub expression
    exprs = exprs + tuple(label(v, k) for k, v in kwargs.items())
    try:
        child = common_subexpression(*exprs)
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not turned into a ValueError.
        raise ValueError("No common sub expression found for input expressions")

    result = Merge(child, exprs)
    if not isdistinct(result.fields):
        # Name every field that occurs more than once in the error message.
        raise ValueError("Repeated columns found: " + ', '.join(
            k for k, v in frequencies(result.fields).items() if v > 1))
    return result
def test_avoid_churn():
    """ We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work

    Ten short tasks are submitted with a pause after each one; the
    adaptive log must contain a single 'up' action and nothing else.

    NOTE(review): written as a generator coroutine; presumably driven by a
    test decorator that is outside this view.
    """
    cluster = yield LocalCluster(0, asynchronous=True, processes=False,
                                 scheduler_port=0, silence_logs=False,
                                 diagnostics_port=None)
    client = yield Client(cluster, asynchronous=True)
    try:
        adapt = Adaptive(cluster.scheduler, cluster, interval='20 ms', wait_count=5)

        # Alternate 40 ms of work with 40 ms of idle time, ten times over.
        for i in range(10):
            yield client.submit(slowinc, i, delay=0.040)
            yield gen.sleep(0.040)

        # One scale-up only: the idle gaps must not have triggered a 'down'.
        assert frequencies(pluck(1, adapt.log)) == {'up': 1}
    finally:
        yield client.close()
        yield cluster.close()
def merge(*exprs, **kwargs):
    """Combine positional and keyword expressions into one ``Merge``.

    A single positional argument is returned unchanged and a single keyword
    argument is returned relabelled with its keyword. Otherwise positional
    expressions are passed through ``_wrap`` under the name ``'_<position>'``
    and keyword expressions are wrapped, labelled with their keyword and
    ordered by key before the ``Merge`` is built.

    Raises:
        TypeError: if every expression has ``ndim`` 0.
        ValueError: if the merged result has repeated field names.
    """
    if len(exprs) + len(kwargs) == 1:
        # A lone argument needs no Merge node at all.
        if exprs:
            # Only a positional argument: hand it back untouched.
            return exprs[0]
        if kwargs:
            # Only a keyword argument: label it with its keyword.
            [(name, value)] = kwargs.items()
            return value.label(name)

    positional = (_wrap(expr, '_%s' % n) for n, expr in enumerate(exprs))
    keyword = (
        label(_wrap(value, name), name)
        for name, value in sorted(kwargs.items(), key=first)
    )
    exprs = tuple(concatv(positional, keyword))

    if all(ndim(expr) == 0 for expr in exprs):
        raise TypeError('cannot merge all scalar expressions')

    result = Merge(exprs, varargsexpr(exprs), maxshape(map(shape, exprs)))

    if isdistinct(result.fields):
        return result
    # Name every field that occurs more than once in the error message.
    repeated = ', '.join(
        field
        for field, count in frequencies(result.fields).items()
        if count > 1
    )
    raise ValueError("Repeated columns found: " + repeated)
#!/usr/bin/env python
"""Print how often each "service" occurs in the I3Live message files.

Every file in ``../resources`` whose name starts with ``I3Live`` is parsed
as JSON and the ``"service"`` value of each message is counted.
"""
import os
import toolz
import json


def messages_from_file(fname):
    """Return the parsed JSON content of ``fname``."""
    # open() replaces the Python-2-only file() builtin.
    with open(fname) as ff:
        return json.load(ff)


def gen():
    """Yield the "service" value of every message in the I3Live files."""
    for fname in os.listdir("../resources"):
        if not fname.startswith("I3Live"):
            continue
        for m in messages_from_file(os.path.join("../resources", fname)):
            yield m["service"]


# Single-argument print(...) produces the same output on Python 2 and 3.
print(toolz.frequencies(gen()))
print("OK")
import sys
import itertools

import toolz
from gensim.models import word2vec


def _has_two_distinct_items(s):
    """Tell whether the sentence contains at least two different items."""
    return toolz.count(toolz.unique(s)) >= 2


def _item_pairs(s):
    """Return all 2-combinations of the sorted distinct items of a sentence."""
    return itertools.combinations(sorted(toolz.unique(s)), 2)


# The data file (one sentence per line) is given as the first argument.
data_file = sys.argv[1]

sentences = list(
    filter(_has_two_distinct_items, word2vec.LineSentence(data_file)))

# Count in how many sentences each pair of items occurs together.
cmb = toolz.frequencies(toolz.mapcat(_item_pairs, sentences))

# Report the pairs from most to least frequent.
by_freq_desc = sorted(cmb.items(), key=lambda x: -x[1])
for (k1, k2), v in by_freq_desc:
    print(f"item1 = {k1}, item2 = {k2}, freq = {v}")
"whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves" ] d = tz.pipe( D, c.map(lambda x: x.strip()), c.map(lambda x: x.lower()), c.map(lambda x: x.translate(str.maketrans('', '', string.punctuation))), c.map(lambda x: re.sub('[0-9]+', '', x)), c.map(lambda x: x.split()), c.map(lambda x: [word for word in x if word not in stops]), list) d_sub = d[:500] tf = {id: tz.frequencies(doc) for id, doc in enumerate(d_sub)} df = pd.DataFrame(tf).fillna(0) words = df.index ds = df.values.T ds = ds.astype(int) def DataTrans(x): """Turn the data into the desired structure""" N_d = np.sum(x) V = len(x) row = 0
"""
Example Toolz

From `toolz` pypi page
https://pypi.python.org/pypi/toolz
"""
# %%
from toolz import compose, frequencies, partial
from toolz.curried import map


def stem(word):
    """ Reduce a word to its primitive form.

    The word is lower-cased, then trailing punctuation and leading quote
    characters are stripped.
    """
    lowered = word.lower()
    without_trailing = lowered.rstrip(",.!:;'-\"")
    return without_trailing.lstrip("'\"")


# Split on whitespace, stem every word, then count the stems.
wordcount = compose(frequencies, map(stem), str.split)

sentence = "This cat jumped over this other cat!"

wordcount(sentence)
# {'this': 2, 'cat': 2, 'jumped': 1, 'over': 1, 'other': 1}

# %%
# Show each stage applied directly to the whole sentence.
print("sentance: {}".format(sentence))
print("split: {}".format(sentence.split()))
print("stem: {}".format(stem(sentence)))
# Applied to a plain string, frequencies counts single characters.
print("frequencies: {}".format(frequencies(sentence)))
def analyze_text(texts, cleaner):
    """Return ``word_ratio`` of the desired-word counts over all texts.

    Each text is cleaned with ``cleaner.clean_text``, the cleaned results
    are chained into one stream, entries rejected by ``word_is_desired``
    are dropped, and the remaining entries are counted.
    """
    cleaned = map(cleaner.clean_text, texts)
    words = itertools.chain(*cleaned)
    desired = filter(word_is_desired, words)
    return word_ratio(toolz.frequencies(desired))
def _judgeMultiSubmit(judgeScores):
    """Return the usernames of judges that appear on more than one score."""
    # Test this
    judges = [score.judge for score in judgeScores]
    submissions = toolz.frequencies(judges)
    return [
        judge.username
        for judge, count in submissions.items()
        if count > 1
    ]
def count_ingredient_occurances(foods: List["FoodEntry"],
                                ingredients: List[str]) -> int:
    """Count how often the given ingredients occur across all foods.

    Args:
        foods: entries whose ``ingredients`` attribute is an iterable of
            ingredient names.
        ingredients: the ingredient names to count; duplicates in this list
            do not change the result.

    Returns:
        Total number of occurrences, over every food's ingredient list, of
        names that are contained in ``ingredients``.
    """
    # A set gives O(1) membership tests instead of scanning the list for
    # every ingredient of every food.
    wanted = set(ingredients)
    return sum(
        1
        for food in foods
        for ingredient in food.ingredients
        if ingredient in wanted
    )
def test_frequencies():
    """Run ``frequencies`` once over ``big_data``; the result is discarded.

    NOTE(review): there are no assertions, so this presumably serves as a
    timing benchmark rather than a correctness test.
    """
    frequencies(big_data)
def test_frequencies_small():
    """Run ``frequencies`` 1000 times over ``small_data``; results are discarded.

    NOTE(review): there are no assertions, so this presumably serves as a
    timing benchmark for many calls on small input.
    """
    for i in range(1000):
        frequencies(small_data)
# Train an LDA model on the sentences in ``data_file`` and write its topics
# to the first sheet of a workbook.
# NOTE(review): ``data_file``, ``topic_num`` and ``alpha`` are defined
# outside this view.
sentences = list(word2vec.LineSentence(data_file))
dic = Dictionary(sentences)
# Sentences with fewer than two tokens are left out of the corpus.
corpus = [dic.doc2bow(s) for s in sentences if len(s) >= 2]
lda = LdaModel(corpus = corpus, id2word = dic, num_topics = topic_num, alpha = alpha, random_state = 1)
# Topic assignment of every document in the corpus.
doc_topics = [lda[c] for c in corpus]
# Average number of topics reported per document.
avg_doc_topics = mean([len(t) for t in doc_topics])
print(f"topics num of doc = {avg_doc_topics}")
# Count, per first element of each assignment (presumably the topic id),
# how many documents it was reported for.
topic_freq = frequencies(concat([[x[0] for x in t] for t in doc_topics]))
wb = Workbook()
sh1 = wb.active
sh1.title = 'topics'
# Header row, then one row per (topic, term) pair.
sh1.append(['topic', 'freq', 'item', 'prob'])
for i in range(topic_num):
    for t in lda.get_topic_terms(i):
        # t[0] is looked up in the dictionary; t[1] is written as 'prob'.
        item = dic[t[0]]
        # Raises KeyError if topic ``i`` was never reported for a document.
        sh1.append([i, topic_freq[i], item, t[1]])
# Enable the auto filter over all four columns down to the last row.
sh1.auto_filter.ref = f"A1:D{sh1.max_row}"
def Estimate_Frequency(Set):
    """Build a word-to-index vocabulary from a collection of strings.

    Args:
        Set: iterable of strings; they are joined with single spaces and
            split on single spaces to obtain the words.

    Returns:
        ``word2index_x``, the mapping from words to indices produced by
        Kraino's ``build_vocabulary``.
    """
    from toolz import frequencies
    wordcount_x = frequencies(' '.join(Set).split(' '))

    # Kraino is a framework that helps in fast prototyping Visual Turing
    # Test models. build_vocabulary takes word counts and returns
    # word2index (words -> indices) and index2word (indices -> words).
    from kraino.utils.input_output_space import build_vocabulary
    # NOTE(review): truncate_to_most_frequent=0 presumably disables
    # truncation - confirm against Kraino.
    word2index_x, _index2word_x = build_vocabulary(
        this_wordcount=wordcount_x,
        truncate_to_most_frequent=0)
    return word2index_x
def stem(word):
    """ Reduce a word to its primitive form.

    The word is lower-cased, then trailing punctuation and leading quote
    characters are stripped.
    """
    lowered = word.lower()
    without_trailing = lowered.rstrip(",.!:;'-\"")
    return without_trailing.lstrip("'\"")


# Split on whitespace, stem every word, then count the stems.
wordcount = compose(frequencies, map(stem), str.split)

sentence = "This cat jumped over this other cat!"

wordcount(sentence)
# {'this': 2, 'cat': 2, 'jumped': 1, 'over': 1, 'other': 1}

# %%
# Show each stage applied directly to the whole sentence.
print("sentance: {}".format(sentence))
print("split: {}".format(sentence.split()))
print("stem: {}".format(stem(sentence)))
# Applied to a plain string, frequencies counts single characters.
print("frequencies: {}".format(frequencies(sentence)))

#%%
#
# Curry.
#
from toolz.functoolz import curry


@curry
def add(x, y, echo=False):
    """Return ``x + y``; with ``echo`` set, print both operands first."""
    if echo:
        for line in (f"x = {x}", f"y = {y}"):
            print(line)
    return x + y
# Load the textual training split of the 'daquar-triples' provider and build
# the question vocabulary from it.
# NOTE(review): ``data_provider`` is defined outside this view.
dp = data_provider.select['daquar-triples']
train_text_representation = dp['text'](train_or_test='train')

# Number of examples shown by the (disabled) debug prints below.
n_elements = 10
#print('== Questions:')
#print_list(train_text_representation['x'][:n_elements])
#print('== Answers:')
#print_list(train_text_representation['y'][:n_elements])
#print('== Image Names:')
#print_list(train_text_representation['img_name'][:n_elements])

from toolz import frequencies
train_raw_x = train_text_representation['x']
# we start from building the frequencies table
# (all questions joined with single spaces, then split on single spaces)
wordcount_x = frequencies(' '.join(train_raw_x).split(' '))
# print the most and least frequent words
n_show = 5
#print(sorted(wordcount_x.items(), key=lambda x: x[1], reverse=True)[:n_show])
#print(sorted(wordcount_x.items(), key=lambda x: x[1])[:n_show])

# Kraino is a framework that helps in fast prototyping Visual Turing Test models
from kraino.utils.input_output_space import build_vocabulary

# This function takes wordcounts and returns word2index - mapping from words into indices,
# and index2word - mapping from indices to words.
# NOTE(review): truncate_to_most_frequent=0 presumably disables truncation -
# confirm against Kraino.
word2index_x, index2word_x = build_vocabulary(
    this_wordcount=wordcount_x,
    truncate_to_most_frequent=0)
#print (word2index_x)
# Build multi-label targets for the stacked feature rows, train a
# one-vs-rest SGD classifier and print per-class results (Python 2 code).
# NOTE(review): ``langs``, ``all_data`` and ``labels`` are defined outside
# this view.
for i in xrange(len(langs)):
    l = langs[i]
    # Label a row with the i-th command-line argument when the row is
    # identical to some row of the i-th dataset, otherwise with ''.
    expected = [sys.argv[1+i] if np.any(np.equal(l,x).all(1)) else '' for x in all_data]
    labels.append(expected)

# Transpose to one label list per row and drop the empty placeholders.
labels = [[l for l in list(label) if len(l) > 0] for label in zip(*labels)]
mlb = MultiLabelBinarizer()
indicators = mlb.fit_transform(labels)

train, test, y_train, y_test = train_test_split(all_data, indicators)

# clf = OneVsRestClassifier(GMM(n_components = 4, covariance_type='full'))
# clf = OneVsRestClassifier(svm.SVC())
clf = OneVsRestClassifier(SGDClassifier())
clf.fit(train, y_train)

train_pred = clf.predict(train)
test_pred = clf.predict(test)

# One report per indicator column, for the train and the test split: the
# counts of the true values, the counts of (true, predicted) pairs, and the
# classification report.
for i in xrange(train_pred.shape[1]):
    print mlb.classes_[i]
    print "Train data"
    print t.frequencies(y_train[:,i]), t.frequencies(zip(y_train[:,i],train_pred[:,i]))
    print metrics.classification_report(y_train[:,i], train_pred[:,i])
    print "Test data"
    print t.frequencies(y_test[:,i]), t.frequencies(zip(y_test[:,i],test_pred[:,i]))
    print metrics.classification_report(y_test[:,i], test_pred[:,i])
    print ""

#test_set = ["features/de-test-10000.features", "features/fr-test-10000.features"]
def analyze_poems(poems, cleaner):
    """Return ``word_ratio`` of the desired-word counts over all poems.

    Each poem is cleaned with ``cleaner.clean_poem``, the cleaned results
    are chained into one stream, entries rejected by ``word_is_desired``
    are dropped, and the remaining entries are counted.
    """
    cleaned = map(cleaner.clean_poem, poems)
    words = itertools.chain(*cleaned)
    desired = filter(word_is_desired, words)
    return word_ratio(toolz.frequencies(desired))
def decide_worker(dependencies, stacks, who_has, restrictions, nbytes, key):
    """ Pick the worker that should run the task ``key``

    Candidates are the workers that hold any dependency of the task, or
    every worker in ``stacks`` if no dependency is held anywhere. Among
    the candidates, those needing the fewest bytes transferred are kept
    and the one with the shortest stack wins.

    >>> dependencies = {'c': {'b'}, 'b': {'a'}}
    >>> stacks = {('alice', 8000): ['z'], ('bob', 8000): []}
    >>> who_has = {'a': {('alice', 8000)}}
    >>> nbytes = {'a': 100}
    >>> restrictions = {}

    We choose the worker that has the data on which 'b' depends (alice has 'a')

    >>> decide_worker(dependencies, stacks, who_has, restrictions, nbytes, 'b')
    ('alice', 8000)

    If both Alice and Bob have dependencies then we choose the less-busy worker

    >>> who_has = {'a': {('alice', 8000), ('bob', 8000)}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, nbytes, 'b')
    ('bob', 8000)

    Optionally provide restrictions of where jobs are allowed to occur

    >>> restrictions = {'b': {'alice', 'charile'}}
    >>> decide_worker(dependencies, stacks, who_has, restrictions, nbytes, 'b')
    ('alice', 8000)

    If the task requires data communication, then we choose to minimize the
    number of bytes sent between workers. This takes precedence over worker
    occupancy.

    >>> dependencies = {'c': {'a', 'b'}}
    >>> who_has = {'a': {('alice', 8000)}, 'b': {('bob', 8000)}}
    >>> nbytes = {'a': 1, 'b': 1000}
    >>> stacks = {('alice', 8000): [], ('bob', 8000): []}

    >>> decide_worker(dependencies, stacks, who_has, {}, nbytes, 'c')
    ('bob', 8000)
    """
    deps = dependencies[key]
    holders = (w for dep in deps for w in who_has[dep])
    # Fall back to all known workers when nobody holds a dependency.
    candidates = frequencies(holders) or stacks

    if key in restrictions:
        allowed = restrictions[key]
        # Restrictions are matched against the first element of the worker id.
        candidates = {w for w in candidates if w[0] in allowed}  # TODO: nonlinear
        if not candidates:
            candidates = {w for w in stacks if w[0] in allowed}
        if not candidates:
            raise ValueError("Task has no valid workers", key, allowed)

    if not (candidates and stacks):
        raise ValueError("No workers found")

    def bytes_to_transfer(w):
        # Size of every dependency that worker ``w`` does not already hold.
        return sum(nbytes[k] for k in deps if w not in who_has[k])

    commbytes = {w: bytes_to_transfer(w) for w in candidates}
    minbytes = min(commbytes.values())
    cheapest = {w for w, nb in commbytes.items() if nb == minbytes}

    # Break remaining ties by picking the least busy worker.
    return min(cheapest, key=lambda w: len(stacks[w]))
def daquar_qa_triples(
        path=None, train_or_test='train', keep_top_qa_pairs=0, **kwargs):
    """
    DAQUAR question answer pairs.

    In:
        path - path to DAQUAR root folder, if None then default path is chosen
            by default None
        train_or_test - switch between train and test set;
            value belongs to {'train', 'val', 'test'}; 'val' is read from
            the 'train' file because there is no established split;
            by default 'train'
        keep_top_qa_pairs - filter out question-answer pairs to the
            keep_top_qa_pairs most frequent answers if positive;
            by default 0
        kwargs - accepted and ignored

    Out:
        x - textual questions
        y - textual answers
        img_name - names of the images
        img_ind - image indices that correspond to x (the number found
            after 'image' in the image name, minus one)
        question_id - empty list as it is unused in DAQUAR
        end_of_question - end of question token
        end_of_answer - end of answer token
        answer_words_delimiter - delimiter for multiple word answers
    """
    if path is None:
        curr_dir = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(curr_dir, '..', '..', 'data', 'daquar')

    if train_or_test == 'val':
        # we don't have a well established split
        train_or_test = 'train'

    xy_list = file2list(
        os.path.join(path, 'qa.894.raw.' + train_or_test + '.format_triple'))

    # The file holds repeating triples: question, answer, image name.
    questions = xy_list[::3]
    answers = xy_list[1::3]
    image_names = xy_list[2::3]

    # Build the set of allowed answers.
    freq = frequencies(answers)
    if keep_top_qa_pairs <= 0:
        # No truncation requested: every answer is allowed, no sort needed.
        allowed_answers = set(freq)
    else:
        ranked = sorted(freq.items(), key=lambda x: x[1], reverse=True)
        allowed_answers = {
            answer for answer, _ in ranked[:keep_top_qa_pairs]}

    # Digits that directly follow the text 'image' in an image name.
    image_number = re.compile(r'(?<=image)[0-9]+')

    x_list = []
    y_list = []
    img_name_list = []
    img_ind_list = []
    for x, y, image_name in zip(questions, answers, image_names):
        if y not in allowed_answers:
            continue
        x_list.append(x)
        y_list.append(y)
        img_name_list.append(image_name)
        # An image name without 'image<digits>' raises AttributeError here.
        img_num = image_number.search(image_name).group(0)
        img_ind_list.append(int(img_num) - 1)

    return {'x': x_list,
            'y': y_list,
            'img_name': img_name_list,
            'img_ind': img_ind_list,
            'question_id': [],
            'end_of_question': '?',
            'end_of_answer': '',
            'answer_words_delimiter': ','}
def vqa_general(path=None, train_or_test='train', dataset_type='mscoco',
        task_type='OpenEnded', annotation_year='2014', question_year='2015',
        image_name_template='COCO_2014_{0:0=12}', answer_mode='single_random',
        keep_top_qa_pairs=0):
    """
    VT-Vision-Lab VQA question answer pairs. It is a general interface.

    In:
        path - path to VQA root folder, if None then default path is chosen;
            by default None
        train_or_test - switch between train and test set;
            value belongs to {'train', 'val', 'test', 'test_dev'}
            by default 'train'
        dataset_type - type of dataset; 'mscoco' or 'abstract_v002'
        task_type - type of the task; 'OpenEnded' or 'MultipleChoice'
        annotation_year - annotation year
        question_year - question year
        image_name_template - template for giving names to images;
            formatted with the image id
        answer_mode - possible answer modes:
            'single_random' - single answer, randomly chosen
            'single_confident' - single answer, randomly chosen among the
                confident; if there is no confident then randomly chosen
                (the same as single)
            'single_frequent' - the most frequent answer
            'all' - with one question all answers (not implemented)
            'all_repeat' - all answers by repeating the same question
            'all_repeat_confidentonly' - all answers that are confident
                (repeats the same question)
        keep_top_qa_pairs - if positive, only answers among the
            keep_top_qa_pairs most frequent ones are considered; otherwise
            every answer is considered; by default 0
    Out:
        dictionary with the following keys
        x - textual questions
        y - textual answers
        img_name - names of the images
        img_ind - image indices that correspond to x
        question_id - list of question indices
        end_of_question - end of question token
        end_of_answer - end of answer token
        answer_words_delimiter - delimiter for multiple word answers
        vqa_object - the object returned under 'vqa_object' by vqa_get_object
        anno_path - constructed path to annotations
        questions_path - constructed path to questions
    Raises:
        AssertionError - for an unsupported answer_mode, task_type or
            dataset_type, or when a question and its annotation disagree
            on the image id
        NotImplementedError - for answer_mode 'all'
    """
    def preprocess_question(q):
        # NOTE(review): encode() followed by comparisons against str
        # literals looks like Python 2 code; under Python 3 the value is
        # bytes and the comparisons below are always False - confirm the
        # target interpreter.
        q_tmp = q.strip().lower().encode('utf8')
        # slices instead of indices so that empty and one-character
        # questions do not raise IndexError
        if q_tmp[-1:] == '?' and q_tmp[-2:-1] != ' ':
            # separate word token from the question mark
            q_tmp = q_tmp[:-1] + ' ?'
        # remove question mark
        if q_tmp[-1:] == '?':
            q_tmp = q_tmp[:-1]
        return q_tmp

    assert answer_mode in ['single_random', 'single_confident',
            'single_frequent', 'all', 'all_repeat',
            'all_repeat_confidentonly']
    assert task_type in ['OpenEnded', 'MultipleChoice'], \
            "The task is either 'OpenEnded' or 'MultipleChoice'"
    assert dataset_type in ['mscoco', 'abstract_v002'], \
            "The type of dataset is either 'mscoco' or 'abstract_v002'"

    vqa_dict = vqa_get_object(
            path=path,
            train_or_test=train_or_test,
            dataset_type=dataset_type,
            task_type=task_type,
            annotation_year=annotation_year,
            question_year=question_year)
    vqa = vqa_dict['vqa_object']

    # questions can be filtered, e.g. by the question type
    ann_ids = vqa.getQuesIds()
    anns = vqa.loadQA(ann_ids)

    # process annotations
    question_id_list = []
    image_name_list = []
    image_id_list = []
    x_list = []
    y_list = []

    # return only questions if there are no annotations
    if anns == []:
        for ques in vqa.questions['questions']:
            image_id = ques['image_id']
            x_list.append(preprocess_question(ques['question']))
            question_id_list.append(ques['question_id'])
            image_name_list.append(image_name_template.format(image_id))
            image_id_list.append(image_id)

    # build the set of allowed answers
    all_answers = [x['answer'] for anno in anns for x in anno['answers']]
    freq = frequencies(all_answers)
    if keep_top_qa_pairs <= 0:
        # no filtering requested: every answer is allowed, no need to sort
        allowed_answers = set(freq)
    else:
        # sorted() is stable, so ties keep their first-seen order
        top_answers = sorted(
            freq.items(), key=lambda kv: kv[1], reverse=True)[:keep_top_qa_pairs]
        allowed_answers = {answer for answer, _ in top_answers}

    for anno in anns:
        image_id = anno['image_id']
        image_name = image_name_template.format(image_id)
        question_id = anno['question_id']
        question = preprocess_question(vqa.qqa[question_id]['question'])
        assert image_id == vqa.qqa[question_id]['image_id'], \
                'image id of the question and answer are different'

        # randomizing the answers list (on a copy, the annotation itself is
        # left untouched)
        randomized_answers = copy.deepcopy(anno['answers'])
        np.random.shuffle(randomized_answers)
        allowed = [x for x in randomized_answers
                   if x['answer'] in allowed_answers]
        if not allowed:
            continue

        if answer_mode == 'single_random':
            answers = [allowed[0]['answer'].encode('utf8')]
        elif answer_mode == 'single_confident':
            # if there is no confident answer, take a random one
            confident = [x for x in allowed
                         if x['answer_confidence'] == 'yes']
            answers = [(confident or allowed)[0]['answer'].encode('utf8')]
        elif answer_mode == 'single_frequent':
            counts = frequencies([x['answer'] for x in allowed])
            # max() returns the first maximal item, like a stable
            # descending sort followed by [0]
            most_frequent = max(counts.items(), key=lambda kv: kv[1])[0]
            answers = [most_frequent.encode('utf8')]
        elif answer_mode == 'all':
            raise NotImplementedError()
        elif answer_mode == 'all_repeat':
            answers = [x['answer'].encode('utf8') for x in allowed]
        elif answer_mode == 'all_repeat_confidentonly':
            # like repeat but consider only confident answers
            confident = [x for x in allowed
                         if x['answer_confidence'] == 'yes']
            if not confident:
                # we keep only confident qa pairs
                continue
            answers = [x['answer'].encode('utf8') for x in confident]
        else:
            raise NotImplementedError()

        # one output row per selected answer; the question and the image
        # information are repeated for every row
        num_answers = len(answers)
        x_list.extend([question] * num_answers)
        y_list.extend(answers)
        image_name_list.extend([image_name] * num_answers)
        image_id_list.extend([image_id] * num_answers)
        question_id_list.extend([question_id] * num_answers)

    return {'x': x_list,
            'y': y_list,
            'img_name': image_name_list,
            'img_ind': image_id_list,
            'question_id': question_id_list,
            'end_of_question': '?',
            'end_of_answer': '',
            'answer_words_delimiter': ' ',
            'vqa_object': vqa,
            'questions_path': vqa_dict['questions_path'],
            'anno_path': vqa_dict['anno_path']}
def parse(line):
    """Parse one ``"x, y"`` input line into an ``(x, y)`` tuple of ints.

    As a side effect the point is registered in the module-level ``cmap``
    under the label ``'A'``.
    """
    parts = line.split()
    x, y = int(parts[0].rstrip(',')), int(parts[1])
    cmap[(x, y)] = 'A'
    return x, y


# Blank lines (e.g. a trailing newline at the end of the input) are skipped
# instead of crashing parse() with an IndexError.
points = [parse(line) for line in sys.stdin if line.strip()]

# find corners
min_x = min(p[0] for p in points)
min_y = min(p[1] for p in points)
max_x = max(p[0] for p in points)
max_y = max(p[1] for p in points)

# Map every cell of the bounding box to the value returned by closest()
# and draw the box row by row; cells for which closest() returns a falsy
# value are left out of the grid and drawn as '.'.
# NOTE(review): presumably closest() yields the nearest input point and a
# falsy value on ties - confirm against its definition.
grid = {}
for y in range(min_y, max_y + 1):
    for x in range(min_x, max_x + 1):
        c = closest(x, y, points)
        if c:
            grid[(x, y)] = c
            print(cmap[c], end='')
        else:
            print('.', end='')
    print()

# Owners of at least one cell on the border of the bounding box. Kept in a
# set so the membership test below is O(1) per cell instead of a scan over
# a dict values view.
bad_points = {
    owner for (x, y), owner in grid.items()
    if x in (max_x, min_x) or y in (max_y, min_y)
}
grid = {cell: owner for cell, owner in grid.items()
        if owner not in bad_points}

# size of the largest region whose owner never touches the border
print(max(frequencies(grid.values()).values()))
def blockwise(func, out_ind, *args, **kwargs):
    """ Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``blockwise`` function applies an
    in-memory function across multiple blocks of multiple inputs in a variety
    of ways.  Many dask.array operations are special cases of blockwise
    including elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes.
        A value may also be an integer (used for every chunk along that
        index) or a tuple/list (used as the chunks along that index).
    new_axes : dict, keyword only
        New indexes and their dimension lengths.  ``None`` is treated like
        an empty dict.
    name : str, keyword only
        Name of the output.  If not given, a name is generated from
        ``token`` (or the function name) and a hash of the inputs.
    token : str, keyword only
        Prefix of the generated output name, used instead of the function
        name.
    align_arrays : bool, keyword only
        If True (default) ``unify_chunks`` is applied to the inputs.  If
        False the chunks are taken from the input with the longest index.

    Raises
    ------
    ValueError
        If ``out_ind`` has repeated elements, if it contains an index that
        appears neither in the inputs nor in ``new_axes``, if ``dtype`` is
        not given, or if an index does not match the number of dimensions
        of its array.
    NotImplementedError
        If an ``adjust_chunks`` value is not callable, an integer, a tuple
        or a list.

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')  # z = x + y  # doctest: +SKIP

    Outer product multiplying x by y, two 1-d vectors

    >>> z = blockwise(operator.mul, 'ij', x, 'i', y, 'j', dtype='f8')  # doctest: +SKIP

    z = x.T

    >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype)  # doctest: +SKIP

    The transpose case above is illustrative because it does same transposition
    both on each in-memory block by calling ``np.transpose`` and on the order
    of the blocks themselves, by switching the order of the index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')  # doctest: +SKIP

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.)  In the case of a contraction the passed
    function should expect an iterable of blocks on any array that holds that
    index.  To receive arrays concatenated along contracted dimensions instead
    pass ``concatenate=True``.

    Inner product multiplying x by y, two 1-d vectors

    >>> def sequence_dot(x_blocks, y_blocks):
    ...     result = 0
    ...     for x, y in zip(x_blocks, y_blocks):
    ...         result += x.dot(y)
    ...     return result

    >>> z = blockwise(sequence_dot, '', x, 'i', y, 'i', dtype='f8')  # doctest: +SKIP

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a single
    chunk.

    >>> def f(x):
    ...     return x[:, None] * np.ones((1, 5))

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': 5}, dtype=x.dtype)  # doctest: +SKIP

    New dimensions can also be multi-chunk by specifying a tuple of chunk
    sizes.  This has limited utility as is (because the chunks are all the
    same), but the resulting graph can be modified to achieve more useful
    results (see ``da.map_blocks``).

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype)  # doctest: +SKIP

    If the applied function changes the size of each chunk you can specify this
    with a ``adjust_chunks={...}`` dictionary holding a function for each index
    that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = blockwise(double, 'ij', x, 'ij',
    ...               adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)  # doctest: +SKIP

    Include literals by indexing with None

    >>> y = blockwise(add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)  # doctest: +SKIP
    """
    out = kwargs.pop('name', None)      # May be None at this point
    token = kwargs.pop('token', None)
    dtype = kwargs.pop('dtype', None)
    adjust_chunks = kwargs.pop('adjust_chunks', None)
    # An explicit new_axes=None is normalized to {} so that both the
    # validation and the .items() loop below can rely on a dict.
    new_axes = kwargs.pop('new_axes', None) or {}
    align_arrays = kwargs.pop('align_arrays', True)

    # Input Validation
    if len(set(out_ind)) != len(out_ind):
        raise ValueError("Repeated elements not allowed in output index",
                         [k for k, v in toolz.frequencies(out_ind).items() if v > 1])
    # every output index must come from an input index or from new_axes
    new = (set(out_ind)
           - {a for arg in args[1::2] if arg is not None for a in arg}
           - set(new_axes))
    if new:
        raise ValueError("Unknown dimension", new)

    from .core import Array, unify_chunks, normalize_arg

    if dtype is None:
        raise ValueError("Must specify dtype of output array")

    if align_arrays:
        chunkss, arrays = unify_chunks(*args)
    else:
        # without alignment the chunks are taken from the indexed argument
        # with the longest index
        arginds = [(a, i) for (a, i) in toolz.partition(2, args) if i is not None]
        if arginds:
            arg, ind = max(arginds, key=lambda ai: len(ai[1]))
            chunkss = dict(zip(ind, arg.chunks))
        else:
            chunkss = {}
        arrays = args[::2]

    # a new axis given as a bare length becomes a single chunk
    for k, v in new_axes.items():
        if not isinstance(v, tuple):
            v = (v,)
        chunkss[k] = v

    arginds = list(zip(arrays, args[1::2]))

    for arg, ind in arginds:
        if hasattr(arg, 'ndim') and hasattr(ind, '__len__') and arg.ndim != len(ind):
            raise ValueError("Index string %s does not match array dimension %d"
                             % (ind, arg.ndim))

    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}

    dependencies = []
    arrays = []

    # Normalize arguments: literals (index None) are unpacked, indexed
    # arguments are replaced by their names
    argindsstr = []
    for a, ind in arginds:
        if ind is None:
            a = normalize_arg(a)
            a, collections = unpack_collections(a)
            dependencies.extend(collections)
        else:
            arrays.append(a)
            a = a.name
        argindsstr.extend((a, ind))

    # Normalize keyword arguments
    kwargs2 = {}
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs2[k] = v

    # Finish up the name
    if not out:
        out = '%s-%s' % (token or utils.funcname(func).strip('_'),
                         base.tokenize(func, out_ind, argindsstr, dtype, **kwargs))

    graph = core_blockwise(func, out, out_ind, *argindsstr, numblocks=numblocks,
                           dependencies=dependencies, new_axes=new_axes, **kwargs2)
    graph = HighLevelGraph.from_collections(out, graph,
                                            dependencies=arrays + dependencies)

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    return Array(graph, out, chunks, dtype=dtype)
corpus = [dic.doc2bow(s) for s in sentences] lda = LdaModel(corpus=corpus, id2word=dic, num_topics=topic_num, alpha=alpha, random_state=1) doc_topics = [lda[c] for c in corpus] avg_doc_topics = mean([len(t) for t in doc_topics]) print(f"topics num of doc = {avg_doc_topics}") topic_freq = frequencies([t[0] for dt in doc_topics for t in dt]) print('----------') for i in range(topic_num): items = [(dic[t[0]], t[1]) for t in lda.get_topic_terms(i, topn=5)] freq = topic_freq[i] if i in topic_freq else 0 print(f"topic_id = {i}, freq = {freq}, items = {items}") print('----------') for i in range(len(corpus)): dts = lda.get_document_topics(corpus[i], per_word_topics=True) for dt in dts[2]: