def getVal(self, x, y):
    """Find the character at cell (x, y) that satisfies both crossing clues.

    Candidate strings are the permutations of ``self.characterSet`` whose
    length equals the number of vertical (resp. horizontal) clues.  The first
    pair whose vertical string fully matches ``self.vertClues[y]``, whose
    horizontal string fully matches ``self.horizClues[x]`` and which agree on
    the shared character (``hstr[x] == vstr[y]``) is the answer.

    Returns:
        ``"Solution (x,y) = <char>"`` for the first intersection found,
        otherwise ``"Solution (x,y) Not found"``.
    """
    characters = list(self.characterSet)
    horiz_len = len(self.horizClues)
    vert_len = len(self.vertClues)

    # Anchor the clues so they must match the whole candidate string.
    hre = re.compile('^' + self.horizClues[x] + '$')
    vre = re.compile('^' + self.vertClues[y] + '$')

    for vstr in itertools.permutations(characters, vert_len):
        # The vertical match does not depend on the horizontal candidate,
        # so test it once per vertical string instead of once per pair.
        if not vre.match("".join(vstr)):
            continue
        # Restarting permutations() is cheap and avoids the ever-growing
        # buffer that chained itertools.tee() copies would keep alive.
        for hstr in itertools.permutations(characters, horiz_len):
            if hstr[x] == vstr[y] and hre.match("".join(hstr)):
                return "Solution (" + str(x) + "," + str(y) + ") = " + str(hstr[x])

    # This should never happen for a valid puzzle, no intersection found
    return "Solution (" + str(x) + "," + str(y) + ") Not found"
def test_iterables(self):
    """Every itertools object must be its own iterator with a callable ``next``.

    Uses the Python 2 names (``ifilter``, ``imap``, ``izip``, ``.next``).
    """
    import itertools

    candidates = (
        itertools.chain(),
        itertools.count(),
        itertools.cycle([]),
        itertools.dropwhile(bool, []),
        itertools.groupby([]),
        itertools.ifilter(None, []),
        itertools.ifilterfalse(None, []),
        itertools.imap(None, []),
        itertools.islice([], 0),
        itertools.izip(),
        itertools.repeat(None),
        itertools.starmap(bool, []),
        itertools.takewhile(bool, []),
        itertools.tee([])[0],
        itertools.tee([])[1],
    )

    for candidate in candidates:
        # Iterator protocol: iter() returns the object itself.
        assert hasattr(candidate, '__iter__')
        assert iter(candidate) is candidate
        assert hasattr(candidate, 'next')
        assert callable(candidate.next)
def performance_evaluation(params, synthesizer=None, iter_train=None, iter_test=None):
    """Evaluate an estimator on true samples and on true + synthesized samples.

    ``params`` must provide 'shuffle_order', 'negative_shuffle_ratio' and
    'vectorizer_complexity'; all three are forwarded to ``fit_evaluate``.
    The training iterable is split three ways (plain fit, input to
    ``synthesizer.fit_sample``, and the true half of the mixed set) and the
    test iterable two ways, so each evaluation sees the full data.

    Returns the tuple ``(roc_t, apr_t, roc_s, apr_s)``: the two values from
    the true-samples run followed by the two from the mixed-samples run.
    """
    fit_options = {
        'shuffle_order': params['shuffle_order'],
        'negative_shuffle_ratio': params['negative_shuffle_ratio'],
        'vectorizer_complexity': params['vectorizer_complexity'],
    }

    # Independent copies of the one-shot input iterables.
    train_true, train_for_synth, train_for_mix = tee(iter_train, 3)
    test_true, test_mixed = tee(iter_test)

    # First run: original training data only.
    logger.info('Fit estimator on original data and evaluate the estimator.')
    roc_t, apr_t = fit_evaluate(iterable_train=train_true,
                                iterable_test=test_true,
                                **fit_options)

    # Second run: synthesized sequences followed by the true ones.
    mixed_train = chain(synthesizer.fit_sample(train_for_synth), train_for_mix)
    logger.info(
        'Fit estimator on original + sampled data and evaluate the estimator.')
    roc_s, apr_s = fit_evaluate(iterable_train=mixed_train,
                                iterable_test=test_mixed,
                                **fit_options)

    return roc_t, apr_t, roc_s, apr_s
def evalDeleteWhere(ctx, u):
    """
    http://www.w3.org/TR/sparql11-update/#deleteWhere

    Removes the triples/quads instantiated from ``u`` for every solution and
    returns a record of what was removed:
    ``{"type": "DELETEWHERE", "delta": {...}}`` where the delta is filled in
    through ``_append`` under 'default' or the named graph's identifier.
    """
    res = {"type": "DELETEWHERE", "delta": {}}

    # Solutions: default-graph pattern joined with each named-graph pattern.
    solutions = evalBGP(ctx, u.triples)
    for graph_name in u.quads:
        named_graph = ctx.dataset.get_context(graph_name)
        pushed_ctx = ctx.pushGraph(named_graph)
        solutions = _join(solutions, list(evalBGP(pushed_ctx, u.quads[graph_name])))

    for solution in solutions:
        # The template is instantiated once and tee'd: one copy is recorded
        # in the delta, the other is subtracted from the graph.
        default_graph = ctx.graph
        to_remove, to_record = tee(_fillTemplate(u.triples, solution))
        _append(res["delta"], 'default', 'removals', list(to_record))
        default_graph -= to_remove

        for graph_name in u.quads:
            target_graph = ctx.dataset.get_context(solution.get(graph_name))
            quads_to_remove, quads_to_record = tee(
                _fillTemplate(u.quads[graph_name], solution))
            _append(res["delta"], target_graph.identifier, 'removals',
                    list(quads_to_record))
            target_graph -= quads_to_remove

    return res
def get_processed_dialog_lines_and_index_to_token(corpus_path_en, corpus_path_de,
                                                  processed_corpus_path_en, processed_corpus_path_de,
                                                  token_index_path_en, token_index_path_de):
    """Load the processed EN/DE corpora and token indexes, computing them if needed.

    If all four cache files (processed corpus and token index for both
    languages) exist they are read from disk.  Otherwise both corpora are
    processed with ``process_corpus`` and the results are written to the four
    cache paths before being returned.

    Returns:
        ``(processed_dialog_lines_en, processed_dialog_lines_de,
        index_to_token_en, index_to_token_de)``
    """
    _logger.info('Loading corpus data...')

    cache_paths = (processed_corpus_path_en, token_index_path_en,
                   processed_corpus_path_de, token_index_path_de)

    if all(os.path.isfile(path) for path in cache_paths):
        _logger.info('%s and %s and %s and %s exist, loading files from disk',
                     *cache_paths)
        processed_dialog_lines_en = get_tokenized_dialog_lines_from_processed_corpus(
            IterableSentences(processed_corpus_path_en))
        processed_dialog_lines_de = get_tokenized_dialog_lines_from_processed_corpus(
            IterableSentences(processed_corpus_path_de))
        index_to_token_en = get_index_to_token(token_index_path_en)
        index_to_token_de = get_index_to_token(token_index_path_de)
        return (processed_dialog_lines_en, processed_dialog_lines_de,
                index_to_token_en, index_to_token_de)

    # At least one cache file is missing: recompute everything and save it.
    _logger.info('Cached files (%s, %s, %s, %s) are not all present, '
                 'compute and save them', *cache_paths)

    processed_dialog_lines_en, processed_dialog_lines_de, index_to_token_en, index_to_token_de = \
        process_corpus(corpus_path_en, corpus_path_de)

    # The line iterables are consumed by save_corpus, so keep a second copy
    # of each for the caller.
    processed_dialog_lines_en, processed_dialog_lines_for_save_en = tee(processed_dialog_lines_en)
    processed_dialog_lines_de, processed_dialog_lines_for_save_de = tee(processed_dialog_lines_de)

    save_index_to_tokens(index_to_token_en, token_index_path_en)
    save_index_to_tokens(index_to_token_de, token_index_path_de)
    save_corpus(processed_dialog_lines_for_save_en, processed_corpus_path_en)
    save_corpus(processed_dialog_lines_for_save_de, processed_corpus_path_de)

    return (processed_dialog_lines_en, processed_dialog_lines_de,
            index_to_token_en, index_to_token_de)
def __init__(self, startingVerts):
    """Build the Hanan grid graph spanned by ``startingVerts``.

    Each starting vertex (read through ``.X``, ``.Y`` and ``.name``) is added
    to the graph; every other crossing of a starting X with a starting Y is
    added as a vertex named ``"h<n>"``.  Neighbouring grid points are then
    connected with edges weighted by their coordinate distance.
    """
    self._graph = Graph()
    self._startingVerts = []
    self._stationWeightFunc = lambda g,v: 0

    startingPts = [(vert.X, vert.Y) for vert in startingVerts]
    self._startingVertCoords = startingPts

    Xs = set()
    Ys = set()
    for vert in startingVerts:
        added = self._graph.addVertex(Vertex(vert.X, vert.Y, vert.name))
        self._startingVerts.append(added)
        Xs.add(vert.X)
        Ys.add(vert.Y)

    # Grid crossings that are not themselves starting points.
    hananPts = product(*(zip(*startingPts)))
    for (hananID, (vertX, vertY)) in enumerate(set(hananPts).difference(startingPts)):
        self._graph.addVertex(Vertex(vertX, vertY, "h"+str(hananID)))

    ## NB: Make sure these all are in-scope, as we'll need their end-of-loop values below.
    x = rightX = y = upY = thisVertID = rightVertID = upVertID = rightUpVertID = None

    Xs = sorted(Xs)
    Ys = sorted(Ys)

    # Walk every grid cell (adjacent X pair x adjacent Y pair) and add its
    # four sides as edges.
    for (x, rightX) in zip(Xs, Xs[1:]):
        for (y, upY) in zip(Ys, Ys[1:]):
            thisVertID = self._graph.getVertexID(x, y)
            rightVertID = self._graph.getVertexID(rightX, y)
            upVertID = self._graph.getVertexID(x, upY)
            self._graph.addEdgeWithVertsAndWeight(thisVertID, rightVertID, abs(rightX-x))
            self._graph.addEdgeWithVertsAndWeight(thisVertID, upVertID, abs(upY-y))

            rightUpVertID = self._graph.getVertexID(rightX, upY)
            self._graph.addEdgeWithVertsAndWeight(rightVertID, rightUpVertID, abs(upY-y))
            self._graph.addEdgeWithVertsAndWeight(upVertID, rightUpVertID, abs(rightX-x))
def window(iter, pre_size=1, post_size=1):
    """
    Yield ``(pre, item, post)`` triples for each item of the iterable.

    ``pre`` holds the ``pre_size`` items before ``item`` and ``post`` the
    ``post_size`` items after it; positions that fall outside the iterable
    are filled with None, so both always have their full length.

    >>> example = window(range(10), pre_size=2)
    >>> pre, item, post = next(example)
    >>> pre
    (None, None)
    >>> post
    (1,)
    >>> next(example)
    ((None, 0), 1, (2,))
    >>> list(example)[-1]
    ((7, 8), 9, (None,))
    """
    before_src, items, after_src = itertools.tee(iter, 3)

    # Left-pad so the first item already has a full "pre" tuple.
    before = nwise(itertools.chain((None,) * pre_size, before_src), pre_size)

    # Right-pad so the last item still has a full "post" tuple, then skip
    # the first group, which starts at the item itself.
    after = nwise(itertools.chain(after_src, (None,) * post_size), post_size)
    next(after, None)

    return six.moves.zip(before, items, after)
def grid(self):
    '''
    Return the center-point of each bin.

    Useful for converting a histogram to a line or surface plot:

    1D histogram:
        x, y = hist.grid(), hist
        plot(x, y)

    2D histogram:
        x, y = hist.grid()
        z = hist
        surface_plot(x, y, z)

    Returns a 1D numpy array of bin centers for a 1D histogram, or the
    ``numpy.meshgrid`` of the two axes' bin centers for a 2D histogram.

    Raises Exception if the histogram is neither 1D nor 2D.
    '''
    dim = self.dim()
    # Compare by value: ``is`` on small ints only works by accident of
    # CPython's integer caching.
    if dim not in (1, 2):
        raise Exception('only 1D and 2D histograms can return a grid.')

    def centers(edges):
        # Midpoints of consecutive edges; empty when there are < 2 edges.
        lower, upper = itertools.tee(iter(edges))
        next(upper, None)
        return [0.5 * (lo + hi) for lo, hi in zip(lower, upper)]

    if dim == 1:
        return numpy.array(centers(self.edges[0]))
    return numpy.meshgrid(centers(self.edges[0]), centers(self.edges[1]))
def vdl(signal, times, delay, initial_value=0.0):
    """Variable delay line which delays `signal` at 'times' with 'delay'.

    :param signal: Signal to be delayed.
    :type signal: Iterator
    :param times: Time of each sample of `signal`.
    :type times: Iterator
    :param delay: Delay.
    :type delay: Iterator
    :param initial_value: Sample to yield before first actual sample is
        yielded due to initial delay. If ``None``, no initial samples are
        yielded and nothing is dropped from the interpolated output.

    .. note:: Times and delay should have the same unit, e.g. both in
        samples or both in seconds.
    """
    dt0, delay = cytoolz.peek(delay)
    times, _times = itertools.tee(times)

    # Yield initial value before interpolation kicks in
    # Note that this method, using tee, buffers all samples that will be discarded.
    # Therefore, room for optimization!
    n = 0
    if initial_value is not None:
        # A for-loop ends cleanly when `times` runs out; a bare next() would
        # raise StopIteration inside this generator, which Python 3.7+
        # turns into RuntimeError (PEP 479).
        for t in _times:
            if not t < dt0:
                break
            n += 1
            yield initial_value
    # Drop the lagging tee copy so it stops pinning every later sample.
    del _times

    times1, times2 = itertools.tee(times)
    interpolated = interpolate_linear(map(operator.add, times2, delay), signal, times1)
    yield from cytoolz.drop(n, interpolated)  # FIXME: move drop before interpolation, saves memory
def learn():
    """Train the NN model on the EN/DE dialog corpora.

    Loads (or computes) the processed dialog lines and token indexes, fits
    one word2vec model per language, builds the NN model sized by the DE
    vocabulary and hands everything to ``train_model``.
    """
    # preprocess the dialog and get index for its vocabulary
    corpus = get_processed_dialog_lines_and_index_to_token(
        CORPUS_PATH_EN, CORPUS_PATH_DE,
        PROCESSED_CORPUS_PATH_EN, PROCESSED_CORPUS_PATH_DE,
        TOKEN_INDEX_PATH_EN, TOKEN_INDEX_PATH_DE)
    lines_en, lines_de, index_to_token_en, index_to_token_de = corpus

    # Each line iterator is needed twice: once for word2vec, once for the NN.
    w2v_lines_en, nn_lines_en = tee(lines_en)
    w2v_lines_de, nn_lines_de = tee(lines_de)
    _logger.info('-----')

    # use gensim realisation of word2vec instead of keras embeddings due to extra flexibility
    w2v_model_en = w2v.get_dialogs_model(W2V_PARAMS_EN, w2v_lines_en)
    w2v_model_de = w2v.get_dialogs_model(W2V_PARAMS_DE, w2v_lines_de)
    _logger.info('-----')

    nn_model = get_nn_model(token_dict_size=len(index_to_token_de))
    _logger.info('-----')

    train_model(nn_model, w2v_model_en, w2v_model_de,
                nn_lines_en, nn_lines_de,
                index_to_token_en, index_to_token_de)
def process_corpus(corpus_path_en,corpus_path_de):
    """Tokenize both corpora, build their vocabularies and transform the lines.

    Returns ``(transformed_dialog_lines_en, transformed_dialog_lines_de,
    index_to_token_en, index_to_token_de)`` where each ``index_to_token_*``
    maps the enumeration index of a vocabulary token to that token.
    """
    sentences_en = IterableSentences(corpus_path_en)
    sentences_de = IterableSentences(corpus_path_de)

    tokenized_en = get_tokenized_dialog_lines(sentences_en)
    tokenized_de = get_tokenized_dialog_lines(sentences_de)

    # The tokenized lines are read twice: to collect the vocabulary and to
    # be transformed with it.
    lines_for_voc_en, lines_for_transform_en = tee(tokenized_en)
    lines_for_voc_de, lines_for_transform_de = tee(tokenized_de)

    tokens_voc_en = get_tokens_voc(lines_for_voc_en)
    tokens_voc_de = get_tokens_voc(lines_for_voc_de)

    transformed_dialog_lines_en = get_transformed_dialog_lines(lines_for_transform_en, tokens_voc_en)
    transformed_dialog_lines_de = get_transformed_dialog_lines(lines_for_transform_de, tokens_voc_de)

    index_to_token_en = {index: token for index, token in enumerate(tokens_voc_en)}
    index_to_token_de = {index: token for index, token in enumerate(tokens_voc_de)}

    return transformed_dialog_lines_en,transformed_dialog_lines_de, index_to_token_en,index_to_token_de
def aggregate_prefixes(
        iterables: List[Iterable[str]],
        delimiter: str = ' ') -> Iterable[Tuple[str, Optional[list]]]:
    """Aggregate iterables into nested tuples with shared prefixes.

    Each stream except the last is wrapped with ``iter_util.ensure_prefix``
    against a tee'd copy of the stream to its right; the entries of
    ``iterables`` are replaced in place with the wrapped streams.

    The iterable streams will be tee'd and consumed like so:

        0 1 2 3 4 ...
        | | | |/|
        | | |/| |
        | |/| | |
        |/| | | |
        x x x x x ...
    """
    if len(iterables) != 1:
        # Work right to left: `carry` is always a private copy of the
        # stream immediately to the right of the one being wrapped.
        carry, iterables[-1] = itertools.tee(iterables[-1])
        for position in range(len(iterables) - 2, 0, -1):
            wrapped = iter_util.ensure_prefix(
                iterables[position], carry, delimiter=delimiter)
            carry, iterables[position] = itertools.tee(wrapped)
        iterables[0] = iter_util.ensure_prefix(
            iterables[0], carry, delimiter=delimiter)

    yield from iter_util.iter_alphabetical_prefixes(
        iterables, delimiter=delimiter)
def iunzip(iterable, internal_length):  # pragma: no cover
    """Lazily unzip ``iterable`` into ``internal_length`` iterators.

    Same result as ``zip(*iterable)`` but each column is returned as an
    iterator instead of expanding the whole input up front.  Mostly used for
    large sequences.

    :param iterable: iterable of indexable items with at least
        ``internal_length`` entries each.
    :param internal_length: number of columns to produce.
    :return: generator yielding one iterator per column, in column order.
    """
    # One independent copy of the input per column.
    columns = itertools.tee(iterable, internal_length)
    return (map(operator.itemgetter(index), column)
            for index, column in enumerate(columns))
def threewise(iterable):
    """s -> (s0,s1,s2), (s1,s2,s3), (s2,s3,s4), ...

    Returns a lazy iterator of overlapping triples; it is empty when the
    input has fewer than three items.
    """
    # itertools.izip only exists on Python 2; on Python 3 the built-in zip
    # is already lazy.
    lazy_zip = getattr(itertools, 'izip', zip)

    a, b, c = itertools.tee(iterable, 3)
    # Offset the second and third copies by one and two items.
    next(b, None)
    next(c, None)
    next(c, None)
    return lazy_zip(a, b, c)
def _write_named_rows(outfile, named_rows):
    """Write one tab-separated line per (name, values) pair to *outfile*."""
    for row in named_rows:
        outfile.write("%s\t%s\n" % ("%s" % row[0], "\t".join(str(item) for item in row[1])))


def summarise_distributions(distributions, options):
    """Write a per-interval summary of several distributions to a file.

    Reads ``options["summary_type"]``, ``options["num_processes"]`` and
    ``options["output_filename"]``.  The measure is "unsigned_information"
    for the "zipfian" and "entropy" summary types, otherwise "frequency".

    * "entropy" / "frequency": one row per interval with each sample's
      measure, preceded by a header row of sample names.
    * "ranks" / "zipfian": a ranks section, a measures section and a
      distance matrix.
    * anything else: a warning is printed and the output file is left empty.

    The output file is always closed, including when an error occurs.
    """
    # itertools.izip only exists on Python 2; built-in zip is lazy on Python 3.
    lazy_zip = getattr(itertools, "izip", zip)

    measure = "frequency"
    if options["summary_type"] in ["zipfian","entropy"]:
        measure = "unsigned_information"

    kmer_intervals = Distribution.get_intervals(distributions, options["num_processes"])

    print("summarising %s , %d kmers across %s"%(measure, len(kmer_intervals), str(distributions)))

    sample_measures = Distribution.get_projections(distributions, kmer_intervals, measure, False, options["num_processes"])
    # Transpose to one row per interval and prepend the sample-name header.
    zsample_measures = lazy_zip(*sample_measures)
    sample_name_iter = [tuple([os.path.splitext(os.path.basename(distribution))[0] for distribution in distributions])]
    zsample_measures = itertools.chain(sample_name_iter, zsample_measures)
    interval_name_iter = itertools.chain(["kmer_pattern"], kmer_intervals)

    with open(options["output_filename"], "w") as outfile:
        if options["summary_type"] in ["entropy", "frequency"]:
            _write_named_rows(outfile, lazy_zip(interval_name_iter, zsample_measures))
        elif options["summary_type"] in ["ranks", "zipfian"]:
            # row names are needed twice (ranks section, measures section)
            interval_name_iter_dup = itertools.tee(interval_name_iter, 2)
            # triplicate zsample_measures (0 used to get ranks; 1 used to output measures; 2 used to get distances)
            zsample_measures_dup = itertools.tee(zsample_measures, 3)
            ranks = Distribution.get_rank_iter(zsample_measures_dup[0])
            # duplicate ranks (0 used to output; 1 used to get distances)
            ranks_dup = itertools.tee(ranks, 2)

            # output ranks
            outfile.write("*** ranks *** :\n")
            _write_named_rows(outfile, lazy_zip(interval_name_iter_dup[0], ranks_dup[0]))

            # output measures
            outfile.write("*** entropies *** :\n")
            _write_named_rows(outfile, lazy_zip(interval_name_iter_dup[1], zsample_measures_dup[1]))

            # get distances
            outfile.write("*** distances *** :\n")
            (distance_matrix, point_names_sorted) = Distribution.get_zipfian_distance_matrix(zsample_measures_dup[2], ranks_dup[1])
            Distribution.print_distance_matrix(distance_matrix, point_names_sorted, outfile)
        else:
            print("warning, unknown summary type %(summary_type)s, no summary available"%options)
def _median(iterable, s=sorted, d=truediv, int=int, count=count):
    """Yield the median of *iterable* as a single value.

    The keyword parameters only bind helpers as locals; callers normally
    pass just ``iterable``.

    NOTE(review): ``count`` and ``slice`` are resolved from the enclosing
    module. This assumes ``count(it)`` returns the number of items in an
    iterator and ``slice(it, p)`` returns the item at zero-based position
    ``p`` - confirm against their definitions.
    """
    i1, i2 = tee(s(iterable))
    # Fractional index of the middle: whole for an odd number of items,
    # x.5 for an even number.
    e = d(count(i1) - 1, 2)
    p = int(e)
    # Test for a whole number, not for an even one: checking ``e % 2`` sent
    # odd middles (3, 7, 11, ... items) to the averaging branch.
    if e == p:
        yield slice(i2, p)
    else:
        # Even number of items: average the two middle values.
        i3, i4 = tee(i2)
        yield truediv(slice(i3, p) + slice(i4, p + 1), 2)
def build_inputs(args, interval, loops, db_user, db_password, db_host, db_name, db_retries, data_sources):
    """Expand the requested data-source names and build the input objects.

    ``args`` is extended in place with one entry per line of the
    ``data_sources`` file (if given).  The built-in sources (timestamp,
    timer, counter, every iostat/vmstat/MySQL-status column and its
    ``rate.`` variant) are passed to ``parse_args`` first, followed by the
    entries of ``args``.  A vmstat / iostat / MySQL scanner is started only
    when ``parse_counters`` reports a non-zero count for it, and is tee'd
    into that many copies for ``make_data_inputs``.

    Returns the ``inputs`` list filled in by ``make_data_inputs``.
    """
    inputs = []
    devices = iostat_get_devices()
    parse_counters = { 'iostat' : 0, 'vmstat' : 0, 'my.status' : 0 }

    if data_sources:
        # Strip only the line terminator, so a final line without a newline
        # keeps its last character; the file is closed even on error.
        with open(data_sources) as f:
            args.extend(line.rstrip('\n') for line in f)

    expanded_args = []
    for arg in ['timestamp', 'timer', 'counter']:
        parse_args(arg, parse_counters, expanded_args, devices)

    for dev in devices:
        for col in iostat_cols:
            parse_args('iostat.%s.%s' % (dev, col), parse_counters, expanded_args, devices)
            parse_args('rate.iostat.%s.%s' % (dev, col), parse_counters, expanded_args, devices)

    for col in vmstat_cols:
        parse_args('vmstat.%s' % col, parse_counters, expanded_args, devices)
        parse_args('rate.vmstat.%s' % col, parse_counters, expanded_args, devices)

    for col in get_my_cols(db_user, db_password, db_host, db_name):
        parse_args('my.status.%s' % col, parse_counters, expanded_args, devices)
        parse_args('rate.my.status.%s' % col, parse_counters, expanded_args, devices)

    for arg in args:
        parse_args(arg, parse_counters, expanded_args, devices)

    # One shared scanner per source, fanned out to its consumers.
    tee_vmstat, tee_iostat, tee_mystat = None, None, None
    if parse_counters['vmstat']:
        scan_vmstat = ScanFork('vmstat -n %d %d' % (interval, loops+1), 2)
        tee_vmstat = itertools.tee(scan_vmstat, parse_counters['vmstat'])

    if parse_counters['iostat']:
        scan_iostat = ScanFork('iostat -kx %d %d' % (interval, loops+1), 0)
        tee_iostat = itertools.tee(scan_iostat, parse_counters['iostat'])

    if parse_counters['my.status']:
        scan_mystat = ScanMysql(db_user, db_password, db_host, db_name,
                                'SHOW GLOBAL STATUS', db_retries, 'Foo 0')
        tee_mystat = itertools.tee(scan_mystat, parse_counters['my.status'])

    source_counters = { 'iostat' : 0, 'vmstat' : 0, 'my.status' : 0 }
    for arg in expanded_args:
        make_data_inputs(arg, inputs, source_counters, interval, db_user, db_password,
                         db_host, db_name, db_retries, tee_vmstat, tee_iostat, tee_mystat)

    return inputs
def test_tee_bug1(self): import itertools a, b = itertools.tee('abcde') x = a.next() assert x == 'a' c, d = itertools.tee(a) x = c.next() assert x == 'b' x = d.next() assert x == 'b'
def test_tee_bug1(self): import itertools a, b = itertools.tee('abcde') x = next(a) assert x == 'a' c, d = itertools.tee(a) x = next(c) assert x == 'b' x = next(d) assert x == 'b'
def iunzip(iterable):
    """Lazily unzip ``iterable`` into one iterator per column.

    Same result as ``zip(*iterable)`` but returns iterators instead of
    expanding the input.  Mostly used for large sequences.  The number of
    columns is taken from the length of the first item.

    An empty ``iterable`` yields no columns (an empty iterator is returned)
    instead of leaking ``StopIteration`` to the caller.

    See <https://gist.github.com/1063340>.
    """
    from operator import itemgetter

    probe, iterable = itertools.tee(iterable, 2)
    missing = object()
    # builtin next() works on Python 2.6+ and 3, unlike the .next() method.
    first = next(probe, missing)
    if first is missing:
        return iter(())

    iters = itertools.tee(iterable, len(first))
    return (itermap(itemgetter(i), it) for i, it in enumerate(iters))
def analyze(self, keys, total=0):
    """
    Build the per-pattern statistics table for string keys.

    :param keys: mapping of match pattern to a list of records; each
        record's "name" entry is wrapped in a StringEntry
    :param total: total passed to the tqdm progress bar
    :return: ``["key stats", table]`` where ``table`` has "headers" and
        "data"; data rows are sorted by count (descending) and followed by
        a total row
    """
    rows = []
    progress = tqdm(total=total, mininterval=1, desc="Processing keys", leave=False)

    for pattern, data in keys.items():
        entries = progress_iterator((StringEntry(value=x["name"]) for x in data), progress)
        used_bytes_iter, aligned_iter, encoding_iter = tee(entries, 3)

        total_elements = len(data)
        if total_elements == 0:
            continue

        aligned = sum(obj.aligned for obj in aligned_iter)

        # The useful-byte stream feeds four independent aggregations.
        useful_iter, min_iter, max_iter, mean_iter = tee(
            (obj.useful_bytes for obj in used_bytes_iter), 4)

        prefered_encoding = pref_encoding((obj.encoding for obj in encoding_iter), redis_encoding_id_to_str)

        min_value = min(min_iter)
        # With a single element the minimum is used as the average.
        avg = min_value if total_elements < 2 else statistics.mean(mean_iter)
        used_user = sum(useful_iter)

        rows.append([
            pattern,
            total_elements,
            used_user,
            aligned,
            aligned / used_user,
            prefered_encoding,
            min_value,
            max(max_iter),
            avg,
        ])

    rows.sort(key=lambda x: x[1], reverse=True)
    rows.append(make_total_row(rows, ["Total:", sum, sum, sum, 0, "", 0, 0, 0]))

    progress.close()

    key_stat = {
        "headers": ["Match", "Count", "Useful", "Real", "Ratio", "Encoding", "Min", "Max", "Avg"],
        "data": rows,
    }
    return ["key stats", key_stat]
def triple_wise_periodic(iterable):
    """s -> (s0,s1,s2), (s1,s2,s3), ..., (sn,s0,s1)

    One triple per input item; the second and third positions wrap around
    to the start of the input.  Modified from the pairwise example in the
    itertools documentation.
    """
    leading, wrap_source = tee(iterable)
    middle, trailing = tee(cycle(wrap_source))

    # Offset the wrapping copies by one and two positions respectively.
    for skip, stream in ((1, middle), (2, trailing)):
        for _ in range(skip):
            next(stream, None)

    return zip(leading, middle, trailing)
def generate_all_graphs(full_graph, detailed=False, min_year=2004, max_year=2014):
    """Yield the graph after each step produced by ``generate_all_generators``.

    A single graph, obtained from ``get_empty_graph``, is updated by every
    step and yielded each time (the same object).

    Yields ``(users, busin, graph)``; with ``detailed=True`` each tuple also
    carries independent copies of that step's node and edge iterables:
    ``(users, busin, graph, curnodes_b, curedges_b)``.
    """
    graph = get_empty_graph()

    for curnodes, curedges, criterion in generate_all_generators(full_graph, min_year, max_year):
        extras = ()
        if detailed:
            # add_nodes_and_edges consumes its inputs, so keep a copy of
            # each for the caller.
            curnodes, curnodes_b = tee(curnodes)
            curedges, curedges_b = tee(curedges)
            extras = (curnodes_b, curedges_b)

        # updating graph
        users, busin = add_nodes_and_edges(full_graph, graph, criterion,
                                           curnodes=curnodes, curedges=curedges)
        yield (users, busin, graph) + extras
def _select_data_matrices(self, iterable_pos, iterable_neg,
                          n_active_learning_iterations=2,
                          size_positive=-1,
                          size_negative=100,
                          lower_bound_threshold_positive=-1,
                          upper_bound_threshold_positive=1,
                          lower_bound_threshold_negative=-1,
                          upper_bound_threshold_negative=1):
    """Build the training data matrix, optionally refining the selection.

    A size of -1 means "use every instance of that class". Otherwise the
    first ``size_*`` instances are used in the first round, and after each
    fit ``self._bounded_selection`` picks the ids for the next round using
    the lower/upper thresholds of that class.

    The loop runs ``n_active_learning_iterations`` times; the estimator is
    fitted after every round except the last one, whose matrices are
    returned.

    Returns:
        ``(data_matrix, y)`` as produced by ``self._assemble_data_matrix``.

    NOTE(review): with ``n_active_learning_iterations`` of 0 the loop body
    never runs and ``data_matrix`` is unbound at the return - confirm
    callers never pass 0.
    """
    # select the initial ids simply as the first occurrences
    if size_positive != -1:
        positive_ids = range(size_positive)
    if size_negative != -1:
        negative_ids = range(size_negative)
    # iterate: select instances according to current model and create novel
    # data matrix to fit the model in next round
    for i in range(n_active_learning_iterations):
        # make data matrix on selected instances
        # if this is the first iteration or we need to select positives
        if i == 0 or size_positive != -1:
            # three copies: one carried to the next round, one to build the
            # matrix now, one for the selection step after fitting
            iterable_pos, iterable_pos_, iterable_pos__ = tee(iterable_pos, 3)
            if size_positive == -1:  # if we take all positives
                data_matrix_pos = self._data_matrix(iterable_pos_, fit_vectorizer=self.fit_vectorizer)
            else:  # otherwise use selection
                data_matrix_pos = self._data_matrix(selection_iterator(iterable_pos_, positive_ids), fit_vectorizer=self.fit_vectorizer)
        # if this is the first iteration or we need to select negatives
        if i == 0 or size_negative != -1:
            # same three-way split as for the positives
            iterable_neg, iterable_neg_, iterable_neg__ = tee(iterable_neg, 3)
            if size_negative == -1:  # if we take all negatives
                data_matrix_neg = self._data_matrix(iterable_neg_, fit_vectorizer=False)
            else:  # otherwise use selection
                data_matrix_neg = self._data_matrix(selection_iterator(iterable_neg_, negative_ids), fit_vectorizer=False)
        # assemble data matrix
        data_matrix, y = self._assemble_data_matrix(data_matrix_pos, data_matrix_neg)
        # on the last iteration skip fitting and return data_matrix, y
        if i == n_active_learning_iterations - 1:
            break
        # fit the estimator on selected instances
        self.estimator.fit(data_matrix, y)
        # use the trained estimator to select the next instances
        if size_positive != -1:
            positive_ids = self._bounded_selection(iterable_pos__,
                                                   size=size_positive,
                                                   lower_bound_threshold=lower_bound_threshold_positive,
                                                   upper_bound_threshold=upper_bound_threshold_positive)
        if size_negative != -1:
            negative_ids = self._bounded_selection(iterable_neg__,
                                                   size=size_negative,
                                                   lower_bound_threshold=lower_bound_threshold_negative,
                                                   upper_bound_threshold=upper_bound_threshold_negative)
    return data_matrix, y
def build_nn_data(data, num_questions, use_correct=True, use_hints=False):
    """
    Build data ready for RNN input.

    Note that ``data`` is sorted in place by user and time.

    :param DataFrame data: User interactions for all users in DataFrame format as
        returned by loading functions in this package.
    :param int num_questions: number of questions in the full dataset
    :param bool use_correct: If True, records responses (before compression) as a
        2 * num_questions one-hot vector where one dimension corresponds to correct
        and one dimension corresponds to incorrect. If False, records responses
        (before compression) as a num_questions one-hot vector where each dimension
        corresponds to having *answered* a question, whether correctly or incorrectly.
    :param bool use_hints: If True, records responses ternarily: Correct, Wrong with
        No Hints, and Used a Hint. Implies ``use_correct``.
    :return: list of all users data ready for RNN input.
    :rtype: list[UserData]
    """
    # itertools.izip only exists on Python 2; built-in zip is lazy on Python 3.
    lazy_zip = getattr(its, 'izip', zip)

    all_users_data = []
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    data.sort_values([USER_IDX_KEY, TIME_IDX_KEY], inplace=True)
    # use_hints => use_correct
    use_correct = use_correct or use_hints

    for _, user in data.groupby(USER_IDX_KEY):
        x = []  # Input X denoting position for one hot
        y = []  # Mask Y to mask the probabilities all questions except the next one
        t = []  # The truth about the correctness of the next question

        # Pair every interaction with the one that follows it.
        xiter, yiter = its.tee(user[ITEM_IDX_KEY].values)
        next(yiter, None)
        this_correct_iter, next_correct_iter = its.tee(user[CORRECT_KEY].values)
        next(next_correct_iter, None)

        if use_hints:
            hints_iter = user[HINT_COUNT_KEY].values
        else:
            hints_iter = its.cycle([0])

        for this_skill, next_skill, this_correct, next_correct, hint in lazy_zip(
                xiter, yiter, this_correct_iter, next_correct_iter, hints_iter):
            # The first num_questions dimensions refer to incorrect responses, the
            # second num_questions dimensions to correct responses. *Unless*
            # use_correct is False, in which case, only num_questions dimensions
            # are used, one for answering (correctly or incorrectly) each question
            x.append(this_skill + num_questions * this_correct * (hint == 0) * use_correct +
                     2 * num_questions * (hint > 0) * use_hints)
            y.append(next_skill)
            t.append(next_correct)

        # Append it to a list
        all_users_data.append(UserData(length=len(x), history=x, next_answer=y, truth=t))

    return all_users_data
def test():
    """Check ``iterator_mergesort`` against ``sorted`` on a random list.

    Prints "All tests pass." on success.  On failure prints a diagnostic
    (a fresh random list sorted both ways) and exits with status 1 so that
    callers and CI can detect the failure.
    """
    size = 10
    randomiterator, backup = itertools.tee(rlist(size, 0, size))
    sortediterator = iterator_mergesort(randomiterator, size)

    # Explicit comparison: an ``assert`` would be stripped under ``-O``.
    if list(sortediterator) != sorted(backup):
        print("Test failed.")
        randomlist, randomlist1 = itertools.tee(rlist(10,0,10))
        print(list(iterator_mergesort(randomlist, 10)))
        print(sorted(randomlist1))
        sys.exit(1)

    print("All tests pass.")
def pairwise(iterable):
    """s -> (s0,s1), (s1,s2), (s2,s3), ...

    When the probe sees a first item that is not None and a second item that
    is None or missing, a single ``(s0, None)`` pair is produced so that at
    least one value is returned.
    """
    pair_source, probe_source = itertools.tee(iterable)
    a, b = itertools.tee(pair_source)
    probe_first, probe_second = itertools.tee(probe_source)
    next(b, None)

    # Peek at the first two items without disturbing `a` and `b`.
    first_present = next(probe_first, None) is not None
    if (first_present
            and next(probe_second, None) is not None
            and next(probe_second, None) is None):
        return py23_zip(a, [None])  # Return at least one value

    # Empty input gives an empty result; otherwise the regular pairs.
    return py23_zip(a, b)
def iter_qsort(it):
    """Lazily yield the items of *it* in ascending order (quicksort).

    The first item is used as the pivot; items ``<=`` the pivot are sorted
    recursively before it and items ``>`` the pivot after it.  Inputs with
    zero or one item are yielded as they are.
    """
    it, peek = tee(it)
    try:
        next(peek), next(peek)
    except StopIteration:
        # Fewer than two items: nothing to sort.  Looping (rather than
        # ``yield next(it)``) also covers the empty case, which would
        # otherwise raise StopIteration inside this generator and surface
        # as RuntimeError on Python 3.7+ (PEP 479).
        for item in it:
            yield item
        return
    # Drop the probe so it does not keep the tee buffer alive.
    del peek

    pivot = next(it)
    lesser, greater = tee(it)
    for n in iter_qsort(n for n in lesser if n <= pivot):
        yield n
    yield pivot
    for n in iter_qsort(n for n in greater if n > pivot):
        yield n
def test_tee_optimization(self):
    """tee() of a tee iterator reuses that iterator as its first result.

    The remaining iterators must be distinct objects, and each must still
    yield the complete input.
    """
    import itertools

    a, b = itertools.tee(iter('foobar'))
    c, d = itertools.tee(b)

    assert c is b
    for left, right in ((a, c), (a, d), (c, d)):
        assert left is not right

    expected = list('foobar')
    for branch in (a, c, d):
        res = list(branch)
        assert res == expected
def sine_cosine_series():
    """Return the pair ``(sinx_result, cosx_result)`` of mutually defined series.

    Each series is built from the other one: the sine stream is
    ``intgpower`` of the cosine stream, and the cosine stream is
    ``minuspower(constpower(1), intgpower(sine stream))``.  The circular
    definition works because each stream is only looked up when it is first
    iterated, by which time both are bound.
    """
    def deferred(lookup):
        # Resolve the target stream lazily, on first iteration.
        for term in lookup():
            yield term

    sinx_result, sinx_copy1 = tee(deferred(lambda: sinx_temp), 2)
    cosx_result, cosx_copy1 = tee(deferred(lambda: cosx_temp), 2)

    sinx_temp = intgpower(cosx_copy1)
    cosx_temp = minuspower(constpower(1), intgpower(sinx_copy1))

    return sinx_result, cosx_result
def __init__(self, network_params, weight_params, stdp_params, total_time, DoG_params=None,
             spike_times_learn=None, spike_times_train=None, spike_times_test=None,
             y_train=None, y_test=None, device='GPU', tau=5):
    """
    Initialisation of SDNN

    Input:
    - network_params: A list of dictionaries with the following keys:
        -'Type': A string specifying which kind of layer this is (either 'input', 'conv' and 'pool')
        -'num_filters': an int specifying the depth (number of filters) of this layer
        -'filter_size': an int specifying the height and width of the filter window for
                        the previous layer to this layer (only on 'conv' and 'pool')
        -'th': an np.float32 specifying the threshold of this layer (only on 'conv' and 'pool')
        -'stride': an int specifying the stride for this layer (only on 'pool')
        -'pad': an int specifying the pad for this layer (only on 'input')
        -'H_layer': an int specifying the height of this layer (only on 'input')
        -'W_layer': an int specifying the width of this layer (only on 'input')
    - weight_params: A dictionary with the following keys:
        - 'mean': the mean for initialising the weights
        - 'std': the std for initialising the weights
    - stdp_params: A dictionary with the following keys:
        - 'max_iter': an int specifying the maximum number of iterations allowed on learning
        - 'max_learn_iter': a list of ints specifying the maximum number of iterations allowed
                            for training each layer (len = number of layers)
        - 'stdp_per_layer': a list of ints specifying the maximum number of STDP updates
                            per layer (len = number of layers)
        - 'offset_STDP': a list of ints specifying the STDP offset per layer
                         (len = number of layers)
        - 'a_minus': an np.float32 numpy array specifying the learning rate when no causality
        - 'a_plus': an np.float32 numpy array specifying the learning rate when there is causality
    - total_time: An int specifying the number of time steps per image
    - spike_times_learn: A list of strings with a valid absolute or relative path to the folders
                         with the learning .jpg images
                         OR
                         An uint8 array with the learning spike times of shape
                         (N_lr, H_in, W_in, M_in). Axis 0 is each of the images
    - spike_times_train: A list of strings with a valid absolute or relative path to the folders
                         with the training .jpg images
                         OR
                         An uint8 array with the training spike times of shape
                         (N_tr, H_in, W_in, M_in). Axis 0 is each of the images
    - spike_times_test: A list of strings with a valid absolute or relative path to the folders
                        with the testing .jpg images
                        OR
                        An uint8 array with the testing spike times of shape
                        (N_ts, H_in, W_in, M_in). Axis 0 is each of the images
    - y_train, y_test: stored as given when DoG_params is None; otherwise they are
                       replaced by the labels returned by gen_iter_paths
    - DoG_params: None OR A dictionary with the following keys:
        -'img_size': A tuple of integers with the dimensions to which the images are to be resized
        -'DoG_size': An int with the size of the DoG filter window size
        -'std1': A float with the standard deviation 1 for the DoG filter
        -'std2': A float with the standard deviation 2 for the DoG filter
    - device: a string; 'GPU' additionally sets thds_per_dim
    - tau: For STDP window
    """

    # --------------------------- DoG Filter Parameters -------------------#
    # The presence of DoG_params selects image-path input (DoG filtering)
    # over precomputed spike-time arrays.
    if DoG_params is not None:
        self.DoG = True
        self.img_size = DoG_params['img_size']
        self.filt = DoG(DoG_params['DoG_size'], DoG_params['std1'], DoG_params['std2'])
    else:
        self.DoG = False

    # --------------------------- Network Initialisation -------------------#
    # Total time and number of layers
    self.num_layers = len(network_params)
    self.learnable_layers = []
    self.total_time = total_time

    # Layers Initialisation
    self.network_struc = []
    self.init_net_struc(network_params)
    self.layers = []
    self.init_layers()

    # Weights Initialisation
    self.weight_params = weight_params
    self.weights = []
    self.init_weights()

    # Dimension Check
    self.check_dimensions()

    # --------------------------- Learning Parameters -------------------#
    # Learning layer parameters
    self.max_iter = stdp_params['max_iter']
    # Learning starts at the first learnable layer; this indexing raises
    # IndexError if learnable_layers is still empty at this point.
    self.learning_layer = self.learnable_layers[0]
    self.max_learn_iter = stdp_params['max_learn_iter']
    self.curr_lay_idx = 0
    self.counter = 0
    self.curr_img = 0

    # STDP params
    self.stdp_per_layer = stdp_params['stdp_per_layer']
    self.stdp_a_minus = stdp_params['a_minus']
    self.stdp_a_plus = stdp_params['a_plus']
    self.offsetSTDP = stdp_params['offset_STDP']

    # --------------------------- CUDA Parameters -------------------#
    self.device = device
    if self.device == 'GPU':
        self.thds_per_dim = 8  # (Use 8 if doesn't work)

    # --------------------------- Input spike times -------------------#
    # Generate Iterators with the full path to the images in each set OR reference the spike times
    if self.DoG:
        self.spike_times_learn, self.y_learn = self.gen_iter_paths(spike_times_learn)
        self.spike_times_train, self.y_train = self.gen_iter_paths(spike_times_train)
        self.spike_times_test, self.y_test = self.gen_iter_paths(spike_times_test)
        self.num_img_learn = self.y_learn.size
        self.num_img_train = self.y_train.size
        self.num_img_test = self.y_test.size
        # Keep a second, independent copy of the training iterator.
        self.spike_times_train, self.learn_buffer = tee(self.spike_times_train)
    else:
        # Spike times are used as given; note that y_learn and learn_buffer
        # are not set on this path.
        self.spike_times_learn = spike_times_learn
        self.num_img_learn = spike_times_learn.shape[0]
        self.spike_times_train = spike_times_train
        self.num_img_train = spike_times_train.shape[0]
        self.spike_times_test = spike_times_test
        self.num_img_test = spike_times_test.shape[0]
        self.y_train = y_train
        self.y_test = y_test

    # --------------------------- Output features -------------------#
    self.features_train = []
    self.features_test = []
    self.tau = tau
def eachCons(iterable):
    """Pair every element of ``iterable`` with its immediate successor.

    Produces ``(s0, s1), (s1, s2), ...`` via ``izip``; an input with fewer
    than two elements produces no pairs.
    """
    current, upcoming = tee(iterable)
    # Shift the second copy forward by one element; the default keeps an
    # empty input from raising StopIteration here.
    next(upcoming, None)
    return izip(current, upcoming)
def _get_choices(self):
    """Return the stored choices without exhausting a one-shot iterator.

    If ``self._choices`` is an iterator, it is split with ``tee``: one copy
    is returned to the caller and the other is stored back on ``self`` so
    that later calls still see every choice. Anything else (list, tuple, ...)
    is returned as-is.
    """
    choices = self._choices
    # Python 2 iterators expose ``next``; Python 3 iterators expose
    # ``__next__``. Checking only ``next`` made Python 3 iterators fall
    # through and get consumed by the first caller.
    if hasattr(choices, '__next__') or hasattr(choices, 'next'):
        choices, self._choices = tee(choices)
    return choices
def consecutive_pairs(iterable):
    """Return an iterator over adjacent pairs: s -> (s0,s1), (s1,s2), ..."""
    copies = itertools.tee(iterable, 2)
    # Advance the second copy so it runs one element ahead of the first.
    next(copies[1], None)
    return zip(*copies)
def pipe(
    self,
    texts,
    as_tuples=False,
    n_threads=-1,
    batch_size=1000,
    disable=[],
    cleanup=False,
    component_cfg=None,
    n_process=1,
):
    """Process texts as a stream, and yield `Doc` objects in order.

    This is a generator: nothing runs until the result is iterated.

    texts (iterator): A sequence of texts to process.
    as_tuples (bool): If set to True, inputs should be a sequence of
        (text, context) tuples. Output will then be a sequence of
        (doc, context) tuples. Defaults to False.
    n_threads (int): Only compared against -1; any other value triggers
        `deprecation_warning(Warnings.W016)`. Not used otherwise here.
    batch_size (int): The number of texts to buffer.
    disable (list): Names of the pipeline components to disable. Only
        read (membership test), never mutated in this method.
    cleanup (bool): If True, unneeded strings are freed to control memory
        use. Experimental.
    component_cfg (dict): An optional dictionary with extra keyword arguments
        for specific components.
    n_process (int): Number of processors to process texts, only supported
        in Python3. If -1, set `multiprocessing.cpu_count()`. When
        `is_python2` is true, any value other than 1 emits
        `user_warning(Warnings.W023)` and is reset to 1.
    YIELDS (Doc): Documents in the order of the original text.

    DOCS: https://spacy.io/api/language#pipe
    """
    if is_python2 and n_process != 1:
        user_warning(Warnings.W023)
        n_process = 1
    if n_threads != -1:
        deprecation_warning(Warnings.W016)
    if n_process == -1:
        n_process = mp.cpu_count()
    if as_tuples:
        # Split the (text, context) stream in two so texts can be fed to the
        # pipeline while contexts are zipped back onto the resulting docs.
        text_context1, text_context2 = itertools.tee(texts)
        texts = (tc[0] for tc in text_context1)
        contexts = (tc[1] for tc in text_context2)
        # NOTE(review): `cleanup` is not forwarded to this recursive call, so
        # with as_tuples=True the cleanup branch further down never runs --
        # confirm this is intended.
        docs = self.pipe(
            texts,
            batch_size=batch_size,
            disable=disable,
            n_process=n_process,
            component_cfg=component_cfg,
        )
        for doc, context in izip(docs, contexts):
            yield (doc, context)
        return
    if component_cfg is None:
        component_cfg = {}

    pipes = (
        []
    )  # contains functools.partial objects to easily create multiprocess worker.
    for name, proc in self.pipeline:
        if name in disable:
            continue
        # NOTE: when `name` is present this is the caller's own dict, so the
        # setdefault() below writes "batch_size" into it.
        kwargs = component_cfg.get(name, {})
        # Allow component_cfg to overwrite the top-level kwargs.
        kwargs.setdefault("batch_size", batch_size)
        if hasattr(proc, "pipe"):
            f = functools.partial(proc.pipe, **kwargs)
        else:
            # Apply the function, but yield the doc
            f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
        pipes.append(f)

    if n_process != 1:
        docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
    else:
        # if n_process == 1, no processes are forked.
        docs = (self.make_doc(text) for text in texts)
        # Chain the components lazily: each one wraps the previous stream.
        for pipe in pipes:
            docs = pipe(docs)

    # Track weakrefs of "recent" documents, so that we can see when they
    # expire from memory. When they do, we know we don't need old strings.
    # This way, we avoid maintaining an unbounded growth in string entries
    # in the string store.
    recent_refs = weakref.WeakSet()
    old_refs = weakref.WeakSet()
    # Keep track of the original string data, so that if we flush old strings,
    # we can recover the original ones. However, we only want to do this if we're
    # really adding strings, to save up-front costs.
    original_strings_data = None
    nr_seen = 0
    for doc in docs:
        # The doc is handed to the consumer before any bookkeeping happens.
        yield doc
        if cleanup:
            recent_refs.add(doc)
            if nr_seen < 10000:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # Every doc of the previous window has been garbage collected:
                # rotate the windows and drop strings nobody references.
                old_refs, recent_refs = recent_refs, old_refs
                if original_strings_data is None:
                    # First rotation: only snapshot the current strings.
                    original_strings_data = list(self.vocab.strings)
                else:
                    keys, strings = self.vocab.strings._cleanup_stale_strings(
                        original_strings_data
                    )
                    self.vocab._reset_cache(keys, strings)
                    self.tokenizer._reset_cache(keys)
                nr_seen = 0
import itertools

# chain: walk several iterables as if they were one sequence.
it = itertools.chain([1, 2, 3], [4, 5, 6])
print([*it])

# repeat: emit the same value a fixed number of times.
it = itertools.repeat('안녕', 3)
print([*it])

# cycle: loop over the items forever; take only the first ten here.
it = itertools.cycle([1, 2])
result = list(itertools.islice(it, 10))
print(result)

# tee: split one iterable into independent iterators.
it1, it2, it3 = itertools.tee(['하나', '둘'], 3)
for teed in (it1, it2, it3):
    print(list(teed))

# zip stops at the shortest input; zip_longest pads the shorter one.
keys = ['하나', '둘', '셋']
values = [1, 2]
normal = [*zip(keys, values)]
print('zip:', normal)
it = itertools.zip_longest(keys, values, fillvalue='없음')
longest = [*it]
print('zip_longest:', longest)
def main(filename):
    """Print the answers to both questions for the boarding passes in ``filename``.

    The open file is split with ``itertools.tee`` so that each question
    gets its own pass over the lines.
    """
    with open(filename) as boarding_passes:
        first_pass, second_pass = itertools.tee(boarding_passes)
        highest = highest_seat_id(of_all_the_ids_from(first_pass))
        print(highest)
        ours = our_seat(of_all_the_ids_from(second_pass))
        print(ours)
def partition(pred, iterable):
    """Split ``iterable`` into ``(rejected, accepted)`` lists by ``pred``.

    Adapted from the itertools documentation. The first list holds the
    items for which ``pred`` is false, the second those for which it is true.
    """
    rejected_source, accepted_source = tee(iterable)
    rejected = list(filterfalse(pred, rejected_source))
    accepted = list(filter(pred, accepted_source))
    return rejected, accepted
def pairwise(iterable):
    """Return an iterator of overlapping pairs: s -> (s0,s1), (s1,s2), ..."""
    trailing, leading = itertools.tee(iterable, 2)
    # Put ``leading`` one element ahead; None default covers empty input.
    next(leading, None)
    pairs = zip(trailing, leading)
    return pairs
def prev_this_next(it):
    """Return an iterator of ``(previous, current, next)`` triples over ``it``.

    The first triple has ``None`` as its previous item and the last triple
    has ``None`` as its next item. An empty input yields no triples.
    """
    before, current, after = itertools.tee(it, 3)
    # Advance the look-ahead copy by one. The default prevents a
    # StopIteration from escaping to the caller when ``it`` is empty.
    next(after, None)
    return zip(
        itertools.chain([None], before),
        current,
        itertools.chain(after, [None]),
    )
def _pairwise(iterable):
    """Return an iterator of adjacent pairs: s -> (s0,s1), (s1,s2), ..."""
    streams = tee(iterable, 2)
    # Drop the first element of the second stream to offset it by one.
    next(streams[-1], None)
    return zip(streams[0], streams[1])
def result_key_iters(self):
    """Return one ``ResultKeyIterator`` per entry in ``self.result_keys``.

    ``self`` is split with ``tee`` into as many copies as there are result
    keys, and each copy is paired with its key.
    """
    copies = tee(self, len(self.result_keys))
    iterators = []
    for teed, key in zip(copies, self.result_keys):
        iterators.append(ResultKeyIterator(teed, key))
    return iterators
def lookahead(it):
    """Return an iterator of ``(item, next_item)`` pairs over ``it``.

    The last pair has ``None`` as its ``next_item`` (the ``zip_longest``
    fill value). An empty input yields no pairs.
    """
    current, ahead = tee(iter(it))
    # Advance the look-ahead copy by one. The default prevents a
    # StopIteration from escaping to the caller when ``it`` is empty.
    next(ahead, None)
    return zip_longest(current, ahead)
def fill_filekind(self, fk):
    """Write the HTTP response (status line, headers, body) into ``fk``.

    Headers start from a default set unless ``self.forcing_headers`` is
    set; ``self.adding_headers`` are merged in, and a callable body may
    replace status, headers and body. ``fk`` is rewound to position 0 at
    the end.
    """
    timestamp = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
    defaults = {
        'status': self.status,
        'date': timestamp,
        'server': 'Python/HTTPretty',
        'connection': 'close',
    }
    headers = self.forcing_headers if self.forcing_headers else defaults

    if self.adding_headers:
        headers.update(self.normalize_headers(self.adding_headers))

    headers = self.normalize_headers(headers)
    status = headers.get('status', self.status)

    if self.body_is_callable:
        status, headers, self.body = self.callable_body(
            self.request, self.info.full_url(), headers)
        headers = self.normalize_headers(headers)
        # TODO: document this behavior:
        if 'content-length' not in headers:
            headers.update({'content-length': len(self.body)})

    lines = ['HTTP/1.1 %d %s' % (status, STATUSES[status])]

    if 'date' in headers:
        lines.append('date: %s' % headers.pop('date'))

    if not self.forcing_headers:
        # These headers are emitted in a fixed order and removed from the
        # dict so the generic loop below does not repeat them.
        content_type = headers.pop('content-type', 'text/plain; charset=utf-8')
        content_length = headers.pop('content-length', self.body_length)
        lines.append('content-type: %s' % content_type)
        if not self.streaming:
            lines.append('content-length: %s' % content_length)
        server = headers.pop('server', None)
        if server:
            lines.append('server: %s' % server)

    lines.extend('{0}: {1}'.format(k, v) for k, v in headers.items())

    for line in lines:
        fk.write(utf8(line) + b'\n')
    fk.write(b'\r\n')

    if not self.streaming:
        fk.write(utf8(self.body))
    else:
        # Keep one copy of the stream on self so the body can be replayed.
        self.body, chunks = itertools.tee(self.body)
        for chunk in chunks:
            fk.write(utf8(chunk))

    fk.seek(0)
def pairwise(iterable):
    """Return an iterator of adjacent pairs.

    For a sentence w1, w2, ..., wn this gives (w1, w2), (w2, w3), ...
    """
    first_words, second_words = itertools.tee(iterable)
    # Consume one item from the second copy so it runs one word ahead.
    for _ in second_words:
        break
    return zip(first_words, second_words)
def pairwise(iterable):
    """Return an iterator of adjacent pairs: s -> (s0,s1), (s1,s2), ..."""
    from itertools import tee

    behind, ahead = tee(iterable, 2)
    # Offset the second copy by one; the default tolerates empty input.
    next(ahead, None)
    return zip(behind, ahead)
def train(self, data_iterator):
    '''
    Train a pytorch model on a worker and send asynchronous updates to parameter server.

    ``data_iterator`` yields ``(x, y)`` pairs, which are collected into two
    numpy arrays. This is a generator: it returns without yielding when no
    samples are supplied, otherwise it yields a single empty list once
    training has finished.

    ``self.frequency`` selects the synchronisation scheme:

    - ``'epoch'``: server weights are pulled at the start of every epoch.
    - ``'batch'``: server weights are pulled before every batch.

    In both schemes the weight delta is pushed to the server each time the
    internal counter reaches ``self.frequency_num``.
    '''
    print(self.master_url)
    print(self.optimizer_config)

    data_all, target_all = tee(data_iterator, 2)
    x_train = np.asarray([x for x, y in data_all])
    y_train = np.asarray([y for x, y in target_all])
    if x_train.size == 0:
        return

    model = self.network
    epoch_num = self.train_config['epoch']
    batch_size = self.train_config['batch_size']
    sample_num = x_train.shape[0]
    # NOTE(review): the trailing "- 5" skips the last five batches (and
    # yields no batches at all for small inputs) -- confirm it is intended.
    batch_num = int(np.ceil(sample_num / batch_size)) - 5
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        model.cuda()

    def _batch_bounds(idx):
        # Slice covering batch ``idx``, clipped to the number of samples.
        return slice(idx * batch_size, min((idx + 1) * batch_size, sample_num))

    def _as_variable(array, to_tensor):
        # Convert the numpy batch exactly once, then move it to the GPU if
        # one is used. The previous code converted to a CUDA tensor first
        # and then passed that tensor to the numpy converter again, which
        # fails whenever a GPU is available.
        tensor = to_tensor(array)
        if use_gpu:
            tensor = tensor.cuda()
        return Variable(tensor)

    def _push_updates(epoch, data, target, state_dict_before_training):
        # Report the loss on the given batch and send the weight delta
        # accumulated since ``state_dict_before_training`` to the server.
        eval_output = model(data)
        eval_loss = get_loss(self.loss_function, eval_output, target,
                             use_gpu)
        print(epoch, '------------', eval_loss)
        state_dict_after_training = model.state_dict()
        updates = compute_updates(state_dict_before_training,
                                  state_dict_after_training)
        self.get_post.post_updates_to_server(updates)

    # Granularity of updates: frequency_num controls a finer grain of
    # asynchronous training; left for future work.
    cnt = 0
    if self.frequency == 'epoch':
        for epoch in range(epoch_num):
            state_dict_before_training = \
                self.get_post.get_server_state_dict()
            print('get_server_state_dict')
            model.load_state_dict(state_dict_before_training)
            optimizer = get_optimizer(self.worker_optimizer,
                                      self.optimizer_config,
                                      model.parameters())
            model.train()
            for idx in range(batch_num):
                bounds = _batch_bounds(idx)
                # from_numpy keeps the dtype of the numpy arrays.
                data = _as_variable(x_train[bounds], torch.from_numpy)
                target = _as_variable(y_train[bounds], torch.from_numpy)
                optimizer.zero_grad()
                output = model(data)
                loss = get_loss(self.loss_function, output, target, use_gpu)
                loss.backward()
                optimizer.step()
            cnt = cnt + 1
            if cnt == self.frequency_num:
                _push_updates(epoch, data, target,
                              state_dict_before_training)
                cnt = 0
    elif self.frequency == 'batch':
        for epoch in range(epoch_num):
            for idx in range(batch_num):
                state_dict_before_training = \
                    self.get_post.get_server_state_dict()
                model.load_state_dict(state_dict_before_training)
                optimizer = get_optimizer(self.worker_optimizer,
                                          self.optimizer_config,
                                          model.parameters())
                model.train()
                bounds = _batch_bounds(idx)
                # torch.Tensor(...) is kept for this branch, as before.
                data = _as_variable(x_train[bounds], torch.Tensor)
                target = _as_variable(y_train[bounds], torch.Tensor)
                optimizer.zero_grad()
                output = model(data)
                loss = get_loss(self.loss_function, output, target, use_gpu)
                loss.backward()
                optimizer.step()
                cnt = cnt + 1
                if cnt == self.frequency_num:
                    _push_updates(epoch, data, target,
                                  state_dict_before_training)
                    cnt = 0
    else:
        print('please choose the frequency of training')
    yield []
def fit(self, subjects, y=None):
    """Compute cross-validated group-sparse precisions.

    Parameters
    ----------
    subjects : list of numpy.ndarray with shapes (n_samples, n_features)
        input subjects. Each subject is a 2D array, whose columns contain
        signals. Sample number can vary from subject to subject, but all
        subjects must have the same number of features (i.e. of columns.)

    y : ignored
        Not used by this method.

    Returns
    -------
    self: GroupSparseCovarianceCV
        the object instance itself.
    """
    # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Fall back to the old location on Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence

    # Empirical covariances
    emp_covs, n_samples = \
        empirical_covariances(subjects, assume_centered=False)
    n_subjects = emp_covs.shape[2]

    # One cv generator per subject must be created, because each subject
    # can have a different number of samples from the others.
    cv = []
    for k in range(n_subjects):
        cv.append(check_cv(
            self.cv, np.ones(subjects[k].shape[0]),
            classifier=False
        ).split(subjects[k]))
    path = list()  # List of (alpha, scores, covs)
    n_alphas = self.alphas

    if isinstance(n_alphas, Sequence):
        # Explicit grid of alphas: use it as-is, without refinement.
        alphas = list(self.alphas)
        n_alphas = len(alphas)
        n_refinements = 1
    else:
        # ``self.alphas`` is a count: build a log-spaced decreasing grid
        # between alpha_max and alpha_max / 100.
        n_refinements = self.n_refinements
        alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
        alpha_0 = 1e-2 * alpha_1
        alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
                             n_alphas)[::-1]

    covs_init = itertools.repeat(None)

    # Copying the cv generators to use them n_refinements times.
    cv_ = izip(*cv)

    for i, (this_cv) in enumerate(itertools.tee(cv_, n_refinements)):
        # Compute the cross-validated loss on the current grid
        train_test_subjs = []
        for train_test in this_cv:
            assert(len(train_test) == n_subjects)
            train_test_subjs.append(list(zip(*[(subject[train, :],
                                                subject[test, :])
                                               for subject, (train, test)
                                               in zip(subjects, train_test)])))
        if self.early_stopping:
            probes = [EarlyStopProbe(test_subjs,
                                     verbose=max(0, self.verbose - 1))
                      for _, test_subjs in train_test_subjs]
        else:
            probes = itertools.repeat(None)

        this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(group_sparse_covariance_path)(
                train_subjs, alphas, test_subjs=test_subjs,
                max_iter=self.max_iter_cv, tol=self.tol_cv,
                verbose=max(0, self.verbose - 1), debug=self.debug,
                # Warm restart is useless with early stopping.
                precisions_init=None if self.early_stopping else prec_init,
                probe_function=probe)
            for (train_subjs, test_subjs), prec_init, probe
            in zip(train_test_subjs, covs_init, probes))

        # this_path[i] is a tuple (precisions_list, scores)
        # - scores: scores obtained with the i-th folding, for each value
        #   of alpha.
        # - precisions_list: corresponding precisions matrices, for each
        #   value of alpha.
        precisions_list, scores = list(zip(*this_path))
        # now scores[i][j] is the score for the i-th folding, j-th value of
        # alpha (analoguous for precisions_list)
        precisions_list = list(zip(*precisions_list))
        scores = [np.mean(sc) for sc in zip(*scores)]
        # scores[i] is the mean score obtained for the i-th value of alpha.

        path.extend(list(zip(alphas, scores, precisions_list)))
        path = sorted(path, key=operator.itemgetter(0), reverse=True)

        # Find the maximum score (avoid using the built-in 'max' function
        # to have a fully-reproducible selection of the smallest alpha in
        # case of equality)
        best_score = -np.inf
        last_finite_idx = 0
        for index, (alpha, this_score, _) in enumerate(path):
            # ``np.float`` was removed in NumPy 1.24; it was an alias of the
            # builtin ``float``, so the threshold is unchanged.
            if this_score >= .1 / np.finfo(float).eps:
                this_score = np.nan
            if np.isfinite(this_score):
                last_finite_idx = index
            if this_score >= best_score:
                best_score = this_score
                best_index = index

        # Refine the grid
        if best_index == 0:
            # We do not need to go back: we have chosen
            # the highest value of alpha for which there are
            # non-zero coefficients
            alpha_1 = path[0][0]
            alpha_0 = path[1][0]
            covs_init = path[0][2]
        elif (best_index == last_finite_idx
                and not best_index == len(path) - 1):
            # We have non-converged models on the upper bound of the
            # grid, we need to refine the grid there
            alpha_1 = path[best_index][0]
            alpha_0 = path[best_index + 1][0]
            covs_init = path[best_index][2]
        elif best_index == len(path) - 1:
            alpha_1 = path[best_index][0]
            alpha_0 = 0.01 * path[best_index][0]
            covs_init = path[best_index][2]
        else:
            alpha_1 = path[best_index - 1][0]
            alpha_0 = path[best_index + 1][0]
            covs_init = path[best_index - 1][2]
        alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
                             len(alphas) + 2)
        # Drop both end points: they were already evaluated.
        alphas = alphas[1:-1]
        if n_refinements > 1:
            logger.log("[GroupSparseCovarianceCV] Done refinement "
                       "% 2i out of %i" % (i + 1, n_refinements),
                       verbose=self.verbose)

    path = list(zip(*path))
    cv_scores_ = list(path[1])
    alphas = list(path[0])

    self.cv_scores_ = np.array(cv_scores_)
    self.alpha_ = alphas[best_index]
    self.cv_alphas_ = alphas

    # Finally, fit the model with the selected alpha
    logger.log("Final optimization", verbose=self.verbose)
    self.covariances_ = emp_covs
    self.precisions_ = _group_sparse_covariance(
        emp_covs, n_samples, self.alpha_, tol=self.tol,
        max_iter=self.max_iter, verbose=max(0, self.verbose - 1),
        debug=self.debug)
    return self
from itertools import chain, tee

# chain: iterate through several iterables/iterators one after another.
# tee: obtain several iterators over one underlying iterator.

l1 = (i * i for i in range(4))
l2 = (i * i for i in range(4, 8))
l3 = (i * i for i in range(8, 12))

# The nested-loop equivalent of the chain call below would be:
#     for gen in l1, l2, l3:
#         for item in gen:
#             print(item)

l = [l1, l2, l3]
for item in chain.from_iterable(l):
    print(item)


def squares(n):
    """Yield the squares of 0 .. n-1."""
    i = 0
    while i < n:
        yield i ** 2
        i += 1


s = squares(10)
print(list(s))
# ``s`` is exhausted at this point; printing it shows the generator object.
print(s)
iters = tee(s, 5)
# Prints the five tee objects themselves, not their contents.
print(list(iters))
def pairwise(iterable):
    """Return an iterator of adjacent pairs: s -> (s0,s1), (s1,s2), ...

    Recipe from https://docs.python.org/3/library/itertools.html
    """
    teed = tee(iterable)
    lagging, leading = teed
    # Step the leading copy once so each pair is (item, following item).
    next(leading, None)
    return zip(lagging, leading)
def minmax(seq, *, key=lambda x: x):
    """Return the minimum and maximum values in the input.

    ``seq`` is split with ``tee`` so that ``min`` and ``max`` each get
    their own pass; ``key`` is applied in both.
    """
    for_min, for_max = tee(seq)
    smallest = min(for_min, key=key)
    largest = max(for_max, key=key)
    return MinMax(smallest, largest)
def fill_filekind(self, fk, sent_data):
    """Write the HTTP response (status line, headers, body) into ``fk``.

    ``fk`` is a writable, seekable binary file-like object; it is rewound
    to position 0 once the response has been written. ``sent_data`` is
    passed to ``self.body`` when the body is callable.
    """
    now = datetime.utcnow()

    # Default headers; replaced wholesale below when forcing_headers is set.
    headers = {
        'status': self.status,
        'date': now.strftime('%a, %d %b %Y %H:%M:%S GMT'),
        'server': 'Python/HTTPretty',
        'connection': 'close',
    }

    if callable(self.body):
        _resp = self.body(sent_data)
        # A non-str result of length 2 is treated as (body, extra_headers).
        if not isinstance(_resp, str) and len(_resp) == 2:
            body = _resp[0]
            headers.update(_resp[1])
        else:
            body = _resp
        self.body_length = len(body)
    else:
        body = self.body

    if self.forcing_headers:
        # NOTE(review): ``headers`` now aliases self.forcing_headers, so the
        # update() below mutates that attribute in place -- confirm this is
        # acceptable.
        headers = self.forcing_headers

    if self.adding_headers:
        headers.update(self.normalize_headers(self.adding_headers))

    headers = self.normalize_headers(headers)
    status = headers.get('status', self.status)

    string_list = [
        'HTTP/1.1 %d %s' % (status, STATUSES[status]),
    ]

    if 'date' in headers:
        string_list.append('date: %s' % headers.pop('date'))

    if not self.forcing_headers:
        # These headers are emitted in a fixed order and popped so the
        # generic loop below does not write them a second time.
        content_type = headers.pop('content-type',
                                   'text/plain; charset=utf-8')
        content_length = headers.pop('content-length', self.body_length)
        string_list.append('content-type: %s' % content_type)
        if not self.streaming:
            string_list.append('content-length: %s' % content_length)
        string_list.append('server: %s' % headers.pop('server'))

    # Whatever is left in the dict is written as "key: value".
    for k, v in headers.items():
        string_list.append('{0}: {1}'.format(k, v), )

    # Each header line ends with '\n'; the header section is closed with a
    # single '\r\n'.
    for item in string_list:
        fk.write(utf8(item) + b'\n')

    fk.write(b'\r\n')

    if self.streaming:
        # The chunks are read through a tee copy; the rebound ``body`` is
        # not used again in this method.
        body, b = itertools.tee(body)
        for chunk in b:
            fk.write(utf8(chunk))
    else:
        fk.write(utf8(body))

    fk.seek(0)
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
         disable=[], cleanup=False):
    """Process texts as a stream, and yield `Doc` objects in order.

    This is a generator: nothing runs until the result is iterated.

    texts (iterator): A sequence of texts to process.
    as_tuples (bool): If set to True, inputs should be a sequence of
        (text, context) tuples. Output will then be a sequence of
        (doc, context) tuples. Defaults to False.
    n_threads (int): Currently inactive. Passed through unchanged to each
        component's `pipe` method.
    batch_size (int): The number of texts to buffer.
    disable (list): Names of the pipeline components to disable. Only
        read (membership test), never mutated in this method.
    cleanup (bool): If True, unneeded strings are freed,
        to control memory use. Experimental.
    YIELDS (Doc): Documents in the order of the original text.

    EXAMPLE:
        >>> texts = [u'One document.', u'...', u'Lots of documents']
        >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
        >>>     assert doc.is_parsed
    """
    if as_tuples:
        # Split the (text, context) stream in two so texts can be fed to the
        # pipeline while contexts are zipped back onto the resulting docs.
        text_context1, text_context2 = itertools.tee(texts)
        texts = (tc[0] for tc in text_context1)
        contexts = (tc[1] for tc in text_context2)
        # NOTE(review): `cleanup` is not forwarded to this recursive call, so
        # with as_tuples=True the cleanup branch further down never runs --
        # confirm this is intended.
        docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
                         disable=disable)
        for doc, context in izip(docs, contexts):
            yield (doc, context)
        return
    docs = (self.make_doc(text) for text in texts)
    # Chain the enabled components lazily: each wraps the previous stream.
    for name, proc in self.pipeline:
        if name in disable:
            continue
        if hasattr(proc, 'pipe'):
            docs = proc.pipe(docs, n_threads=n_threads,
                             batch_size=batch_size)
        else:
            # Apply the function, but yield the doc
            docs = _pipe(proc, docs)
    # Track weakrefs of "recent" documents, so that we can see when they
    # expire from memory. When they do, we know we don't need old strings.
    # This way, we avoid maintaining an unbounded growth in string entries
    # in the string store.
    recent_refs = weakref.WeakSet()
    old_refs = weakref.WeakSet()
    # Keep track of the original string data, so that if we flush old strings,
    # we can recover the original ones. However, we only want to do this if we're
    # really adding strings, to save up-front costs.
    original_strings_data = None
    nr_seen = 0
    for doc in docs:
        # The doc is handed to the consumer before any bookkeeping happens.
        yield doc
        if cleanup:
            recent_refs.add(doc)
            if nr_seen < 10000:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # Every doc of the previous window has been garbage collected:
                # rotate the windows and drop strings nobody references.
                old_refs, recent_refs = recent_refs, old_refs
                if original_strings_data is None:
                    # First rotation: only snapshot the current strings.
                    original_strings_data = list(self.vocab.strings)
                else:
                    keys, strings = self.vocab.strings._cleanup_stale_strings(
                        original_strings_data)
                    self.vocab._reset_cache(keys, strings)
                    self.tokenizer._reset_cache(keys)
                nr_seen = 0
def pairwise(iterable):
    """Return an iterator of adjacent pairs: s -> (s0,s1), (s1,s2), ..."""
    left, right = tee(iterable)
    # Discard the first element of ``right`` (if any) to shift it by one.
    for _ in right:
        break
    return zip(left, right)
def exec(self):
    """Demonstrate ``itertools.starmap``, ``takewhile`` and ``tee``."""
    # -----------------------------------------------
    # itertools.starmap()
    # ----------------------
    # Applies a function to an iterable whose items are already grouped,
    # for example the result of zip().
    #
    # That is:
    #
    #   l1 = [1, 2, 3]
    #   l2 = [9, 8, 7]
    #   l3 = list(zip(l1, l2))   ==> [(1,9), (2,8), (3,7)]
    #
    #   list(itertools.starmap(operator.add, l3))   ==> [10, 10, 10]
    #
    # which is the same as calling operator.add(*item) for each item of
    # l3 -- hence the name "starmap".
    # -----------------------------------------------
    hr('it.starmap()')

    minuends = [9, 8, 7]
    subtrahends = [1, 2, 3]
    paired = list(zip(minuends, subtrahends))

    differences = list(it.starmap(ope.sub, paired))
    pr('it.starmap', differences)

    widened = list(zip(minuends, subtrahends, *paired))
    totals = list(it.starmap(lambda *args: sum(args), widened))
    pr('it.starmap', totals)

    # -----------------------------------------------
    # itertools.takewhile()
    # ----------------------
    # Returns elements for as long as the condition holds; the opposite
    # of dropwhile().
    #
    # Once the condition fails, nothing more is returned, even if later
    # values would satisfy it again.
    # -----------------------------------------------
    hr('it.takewhile()')

    ordered = sorted(it.chain(minuends, subtrahends))
    pr('list05', ordered)

    below_five = list(it.takewhile(lambda x: x < 5, ordered))
    pr('it.takewhile', below_five)

    # -----------------------------------------------
    # itertools.tee()
    # ----------------------
    # Returns several independent iterators over the given iterable.
    # With n=2 you get a tuple of two iterators over the same data.
    #
    # As the official documentation says, the original iterable shares
    # internal state with the tee objects and should not be used anywhere
    # else after the split:
    #
    #   Once tee() has made a split,
    #   the original iterable should not be used anywhere else;
    #   otherwise, the iterable could get advanced
    #   without the tee objects being informed.
    # -----------------------------------------------
    hr('it.tee()')

    letters = list('helloworld')
    forward, second_copy = it.tee(letters, 2)
    backward = reversed(list(second_copy))

    for it01, it02 in zip(forward, backward):
        pr('it.tee', f'{it01}, {it02}')
def pairwise(iterable):
    """Return a list of adjacent pairs: s -> [(s0,s1), (s1,s2), ...]"""
    from itertools import tee

    earlier, later = tee(iterable)
    # Shift ``later`` one element ahead of ``earlier``.
    next(later, None)
    pairs = zip(earlier, later)
    return list(pairs)
def _make_pair_range(N):
    """Return an iterator of consecutive pairs (-1, 0), (0, 1), ..., (N-2, N-1)."""
    from itertools import tee

    lower, upper = tee(range(-1, N))
    # Consume one value from ``upper`` so it runs one step ahead.
    for _ in upper:
        break
    return zip(lower, upper)
def window(seq, n):
    """Return an iterator of sliding windows of width ``n`` over ``seq``.

    ``seq`` is split into ``n`` copies with ``tee``; copy number ``k`` is
    advanced by ``k`` elements and the copies are zipped together.
    """
    copies = tee(seq, n)
    for offset, shifted in enumerate(copies):
        skipped = 0
        while skipped < offset:
            # Default avoids StopIteration when seq is shorter than n.
            next(shifted, None)
            skipped += 1
    return zip(*copies)
def concat_niimgs(niimgs, dtype=np.float32, ensure_ndim=None,
                  memory=Memory(cachedir=None), memory_level=0,
                  auto_resample=False, verbose=0):
    """Concatenate a list of 3D/4D niimgs of varying lengths.

    The niimgs list can contain niftis/paths to images of varying dimensions
    (i.e., 3D or 4D) as well as different 3D shapes and affines, as they
    will be matched to the first image in the list if auto_resample=True.

    Parameters
    ----------
    niimgs: iterable of Niimg-like objects or glob pattern
        See http://nilearn.github.io/manipulating_images/input_output.html
        Niimgs to concatenate.

    dtype: numpy dtype, optional
        the dtype of the returned image. If None, the dtype of the first
        image's data is used.

    ensure_ndim: integer, optional
        Indicate the dimensionality of the expected niimg. An error is raised
        if the niimg is of another dimensionality.

    auto_resample: boolean
        Converts all images to the space of the first one.

    verbose: int
        Controls the amount of verbosity (0 means no messages).

    memory : instance of joblib.Memory or string
        Used to cache the resampling process.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    memory_level : integer, optional
        Rough estimator of the amount of memory used by caching. Higher value
        means more memory for caching.

    Returns
    -------
    concatenated: nibabel.Nifti1Image
        A single image.

    Raises
    ------
    TypeError
        If ``niimgs`` is empty, or if the images are neither 3D nor 4D.

    See Also
    --------
    nilearn.image.index_img

    """
    from ..image import new_img_like  # avoid circular imports

    target_fov = 'first' if auto_resample else None

    # We remove one to the dimensionality because of the list is one dimension.
    ndim = None
    if ensure_ndim is not None:
        ndim = ensure_ndim - 1

    # If niimgs is a string, use glob to expand it to the matching filenames.
    niimgs = _resolve_globbing(niimgs)

    # First niimg is extracted to get information and for new_img_like
    first_niimg = None

    # Two passes are needed: ``literator`` is consumed first to validate the
    # images and collect their lengths, ``iterator`` then loads the data.
    iterator, literator = itertools.tee(iter(niimgs))
    try:
        first_niimg = check_niimg(next(literator), ensure_ndim=ndim)
    except StopIteration:
        raise TypeError('Cannot concatenate empty objects')
    except DimensionError as exc:
        # Keep track of the additional dimension in the error
        exc.increment_stack_counter()
        raise

    # If no particular dimensionality is asked, we force consistency wrt the
    # first image
    if ndim is None:
        ndim = len(first_niimg.shape)

    if ndim not in [3, 4]:
        raise TypeError('Concatenated images must be 3D or 4D. You gave a '
                        'list of %dD images' % ndim)

    lengths = [first_niimg.shape[-1] if ndim == 4 else 1]
    for niimg in literator:
        # We check the dimensionality of the niimg
        try:
            niimg = check_niimg(niimg, ensure_ndim=ndim)
        except DimensionError as exc:
            # Keep track of the additional dimension in the error
            exc.increment_stack_counter()
            raise
        lengths.append(niimg.shape[-1] if ndim == 4 else 1)

    target_shape = first_niimg.shape[:3]
    # Identity check: ``==`` is delegated to the dtype object's own
    # comparison, which is not a reliable test for None.
    if dtype is None:
        dtype = first_niimg.get_data().dtype
    data = np.ndarray(target_shape + (sum(lengths), ),
                      order="F", dtype=dtype)
    cur_4d_index = 0
    for index, (size, niimg) in enumerate(izip(lengths, _iter_check_niimg(
            iterator, atleast_4d=True, target_fov=target_fov,
            memory=memory, memory_level=memory_level))):

        if verbose > 0:
            if isinstance(niimg, _basestring):
                nii_str = "image " + niimg
            else:
                nii_str = "image #" + str(index)
            print("Concatenating {0}: {1}".format(index + 1, nii_str))

        # Copy this image into its slot along the last axis.
        data[..., cur_4d_index:cur_4d_index + size] = niimg.get_data()
        cur_4d_index += size

    return new_img_like(first_niimg, data, first_niimg.affine,
                        copy_header=True)
else: selected_proteome = accs query_accs = random.sample(selected_proteome, size) fastas500 = list([records[el] for el in query_accs]) subject_accs = set(selected_proteome).difference(set(query_accs)) train_fastas = list([records[el] for el in subject_accs]) subsample100_accs = random.sample(query_accs, 100) subsample50_accs = random.sample(subsample100_accs, 50) fastas100 = list([records[el] for el in subsample100_accs]) fastas50 = list([records[el] for el in subsample50_accs]) fastas = [train_fastas, fastas500, fastas100, fastas50] print ("working", cnt) for pos, selection in enumerate(fastas): text = "" for s in selection: trigrams = list(zip(*(islice(seq, index, None) for index, seq in enumerate(tee(s, 3))))) trigram_protein = "|".join(["".join(el) for el in trigrams]) + "|" text += trigram_protein store.execute("INSERT INTO {} VALUES (?, ?)".format(tables[pos]), (id, text[0:-1])) map_benchmark[id] = {"test500":query_accs, "test100":subsample100_accs, "test50":subsample50_accs} conn.commit() cnt += 1 print(cnt) docs_above1000.append(id) print ("texts extracted") store.execute("CREATE INDEX taxon_idx_test500 ON texts_test500(taxon_id)") store.execute("CREATE INDEX taxon_idx_test100 ON texts_test100(taxon_id)") store.execute("CREATE INDEX taxon_idx_test50 ON texts_test50(taxon_id)") store.execute("CREATE INDEX taxon_idx_model ON texts_train(taxon_id)") conn.commit() pickle.dump(map_benchmark, open("benchmark_accs2id.p", "wb"))