Example #1
    def getVal(self, x, y):
        
        # Create generators over every candidate string (permutations of the character set) of the required length
        verticalPerms = itertools.permutations(list(self.characterSet), len(self.vertClues))
        horizontalPerms = itertools.permutations(list(self.characterSet), len(self.horizClues))
        
        # Compile the regex for this square
        hre = re.compile('^' + self.horizClues[x] + '$')
        vre = re.compile('^' + self.vertClues[y] + '$')

        # Make a copy of the generators so we can iterate over them multiple times
        hp, hp_b = itertools.tee(horizontalPerms)
        vp, _ = itertools.tee(verticalPerms)     
                
        # For every possible combination of strings in vertical and horizontal perms
        for vstr in vp:
            hp, hp_b = itertools.tee(hp)
            for hstr in hp_b:
            
                # If the strings agree at the intersection and both match their regexes,
                # return the character at (x, y) as the answer
                if hstr[x] == vstr[y] and hre.match("".join(hstr)) and vre.match("".join(vstr)):
                    retVal = "Solution (" + str(x) + "," + str(y) + ") = " + str(hstr[x])
                    return retVal
                    
        # This should never happen for a valid puzzle, no intersection found
        return "Solution (" + str(x) + "," + str(y) + ") Not found"
Example #2
 def test_iterables(self):
     import itertools
 
     iterables = [
         itertools.chain(),
         itertools.count(),
         itertools.cycle([]),
         itertools.dropwhile(bool, []),
         itertools.groupby([]),
         itertools.ifilter(None, []),
         itertools.ifilterfalse(None, []),
         itertools.imap(None, []),
         itertools.islice([], 0),
         itertools.izip(),
         itertools.repeat(None),
         itertools.starmap(bool, []),
         itertools.takewhile(bool, []),
         itertools.tee([])[0],
         itertools.tee([])[1],
         ]
 
     for it in iterables:
         assert hasattr(it, '__iter__')
         assert iter(it) is it
         assert hasattr(it, 'next')
         assert callable(it.next)
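The test above targets Python 2 itertools, where ifilter, ifilterfalse, imap and izip still exist and iterators expose a next method. A rough Python 3 analogue, offered here only as a hedged sketch (not part of the original test suite), drops the removed names and checks __next__ instead:

import itertools

def test_iterables_py3():
    iterables = [
        itertools.chain(),
        itertools.count(),
        itertools.cycle([]),
        itertools.dropwhile(bool, []),
        itertools.groupby([]),
        itertools.islice([], 0),
        itertools.repeat(None),
        itertools.starmap(bool, []),
        itertools.takewhile(bool, []),
        itertools.tee([])[0],
        itertools.tee([])[1],
        ]
    for it in iterables:
        assert hasattr(it, '__iter__')
        assert iter(it) is it
        assert callable(it.__next__)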
Example #3
def performance_evaluation(params, synthesizer=None, iter_train=None, iter_test=None):
    """
    """
    shuffle_order = params['shuffle_order']
    negative_shuffle_ratio = params['negative_shuffle_ratio']
    vectorizer_complexity = params['vectorizer_complexity']

    # Copy training sample iterable for sequence synthesis and producing mixed sample set.
    iter_train, iter_train_syn, iter_seq_true = tee(iter_train, 3)

    # Copy test sample iterable used for evaluation.
    iter_test, iter_test_ = tee(iter_test)

    # Train TrueSamplesModel classifier. Evaluate.
    logger.info('Fit estimator on original data and evaluate the estimator.')
    roc_t, apr_t = fit_evaluate(iterable_train=iter_train, iterable_test=iter_test,
                                negative_shuffle_ratio=negative_shuffle_ratio,
                                shuffle_order=shuffle_order, vectorizer_complexity=vectorizer_complexity)

    # Produce the synthesized-sequence generator.
    iterable_seq_syn = synthesizer.fit_sample(iter_train_syn)

    # Mix synthesized and true samples.
    iterable_seq_mixed = chain(iterable_seq_syn, iter_seq_true)

    # Train MixedSamplesModel classifier. Evaluate.
    logger.info(
        'Fit estimator on original + sampled data and evaluate the estimator.')
    roc_s, apr_s = fit_evaluate(iterable_train=iterable_seq_mixed, iterable_test=iter_test_,
                                negative_shuffle_ratio=negative_shuffle_ratio,
                                shuffle_order=shuffle_order, vectorizer_complexity=vectorizer_complexity)

    return roc_t, apr_t, roc_s, apr_s
Example #4
def evalDeleteWhere(ctx, u):
    """
    http://www.w3.org/TR/sparql11-update/#deleteWhere
    """

    res = {}
    res["type"] = "DELETEWHERE"
    res["delta"] = {}

    _res = evalBGP(ctx, u.triples)
    for g in u.quads:
        cg = ctx.dataset.get_context(g)
        c = ctx.pushGraph(cg)
        _res = _join(_res, list(evalBGP(c, u.quads[g])))

    for c in _res:
        g = ctx.graph
        filled, filled_delta = tee(_fillTemplate(u.triples, c))
        _append(res["delta"], 'default', 'removals', list(filled_delta))
        g -= filled

        for g in u.quads:
            cg = ctx.dataset.get_context(c.get(g))
            filledq, filledq_delta = tee(_fillTemplate(u.quads[g], c))
            _append(res["delta"], cg.identifier, 'removals', list(filledq_delta))
            cg -= filledq

    return res
Example #5
def get_processed_dialog_lines_and_index_to_token(corpus_path_en,corpus_path_de, processed_corpus_path_en,processed_corpus_path_de, token_index_path_en, token_index_path_de):
    _logger.info('Loading corpus data...')

    if os.path.isfile(processed_corpus_path_en) and os.path.isfile(token_index_path_en) and os.path.isfile(processed_corpus_path_de) and os.path.isfile(token_index_path_de):
        _logger.info(processed_corpus_path_en + ' and ' + token_index_path_en + ' and ' + processed_corpus_path_de + ' and ' + token_index_path_de + ' exist, loading files from disk')
        processed_dialog_lines_en = IterableSentences(processed_corpus_path_en)
        processed_dialog_lines_de = IterableSentences(processed_corpus_path_de)
        processed_dialog_lines_en = get_tokenized_dialog_lines_from_processed_corpus(processed_dialog_lines_en)
        processed_dialog_lines_de = get_tokenized_dialog_lines_from_processed_corpus(processed_dialog_lines_de)

        index_to_token_en = get_index_to_token(token_index_path_en)
        index_to_token_de = get_index_to_token(token_index_path_de)
        return processed_dialog_lines_en,processed_dialog_lines_de, index_to_token_en, index_to_token_de

    # continue here if processed corpus and token index are not stored on the disk
    _logger.info(processed_corpus_path_en + ' and ' + token_index_path_en + " don't exist, compute and save it")
    _logger.info(processed_corpus_path_de + ' and ' + token_index_path_de + " don't exist, compute and save it")
    processed_dialog_lines_en, processed_dialog_lines_de, index_to_token_en, index_to_token_de = process_corpus(corpus_path_en,corpus_path_de)
    
    processed_dialog_lines_en, processed_dialog_lines_for_save_en = tee(processed_dialog_lines_en)
    processed_dialog_lines_de, processed_dialog_lines_for_save_de = tee(processed_dialog_lines_de)

    save_index_to_tokens(index_to_token_en, token_index_path_en)
    save_index_to_tokens(index_to_token_de, token_index_path_de)

    save_corpus(processed_dialog_lines_for_save_en, processed_corpus_path_en)
    save_corpus(processed_dialog_lines_for_save_de, processed_corpus_path_de)

    return processed_dialog_lines_en,processed_dialog_lines_de, index_to_token_en, index_to_token_de
Example #6
	def __init__(self, startingVerts):
		self._graph = Graph()
		Xs = Ys = set()

		self._startingVerts = []
		self._stationWeightFunc = lambda g,v: 0
		self._startingVertCoords = startingPts = [(vert.X, vert.Y) for vert in startingVerts]
		for vert in startingVerts:
			self._startingVerts.append(self._graph.addVertex(Vertex(vert.X, vert.Y, vert.name)))
			Xs = Xs.union([vert.X])
			Ys = Ys.union([vert.Y])
		hananPts = product(*(zip(*startingPts)))
		for (hananID, (vertX, vertY)) in enumerate(set(hananPts).difference(startingPts)):
			self._graph.addVertex(Vertex(vertX, vertY, "h"+str(hananID)))

		## NB: Make sure these all are in-scope, as we'll need their end-of-loop values below.
		x = rightX = y = upY = thisVertID = rightVertID = upVertID = rightUpVertID = None

		Xs = sorted(Xs)
		Ys = sorted(Ys)
		XsCurr, XsRight = tee(Xs)
		next(XsRight, None)
		for (x, rightX) in zip(XsCurr, XsRight):
			YsCurr, YsUp = tee(Ys)
			next(YsUp, None)
			for (y, upY) in zip(YsCurr, YsUp):
				thisVertID = self._graph.getVertexID(x, y)
				rightVertID = self._graph.getVertexID(rightX, y)
				upVertID = self._graph.getVertexID(x, upY)
				self._graph.addEdgeWithVertsAndWeight(thisVertID, rightVertID, abs(rightX-x))
				self._graph.addEdgeWithVertsAndWeight(thisVertID, upVertID, abs(upY-y))
				rightUpVertID = self._graph.getVertexID(rightX, upY)
				self._graph.addEdgeWithVertsAndWeight(rightVertID, rightUpVertID, abs(upY-y))
			self._graph.addEdgeWithVertsAndWeight(upVertID, rightUpVertID, abs(rightX-x))
Example #7
def window(iter, pre_size=1, post_size=1):
	"""
	Given an iterable, return a new iterable which yields triples of
	(pre, item, post), where pre and post are the items preceding and
	following the item (or None if no such item is appropriate). pre
	and post will always be pre_size and post_size in length.

	>>> example = window(range(10), pre_size=2)
	>>> pre, item, post = next(example)
	>>> pre
	(None, None)
	>>> post
	(1,)
	>>> next(example)
	((None, 0), 1, (2,))
	>>> list(example)[-1]
	((7, 8), 9, (None,))
	"""
	pre_iter, iter = itertools.tee(iter)
	pre_iter = itertools.chain((None,) * pre_size, pre_iter)
	pre_iter = nwise(pre_iter, pre_size)
	post_iter, iter = itertools.tee(iter)
	post_iter = itertools.chain(post_iter, (None,) * post_size)
	post_iter = nwise(post_iter, post_size)
	next(post_iter, None)
	return six.moves.zip(pre_iter, iter, post_iter)
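window() assumes a helper nwise(iterable, n) that yields overlapping n-tuples of consecutive items, which is not shown above. A minimal sketch of what such a helper might look like, built on tee (an assumption about its behaviour, not the original implementation):

import itertools

def nwise(iterable, n):
    # Yield overlapping n-tuples: nwise('ABCD', 2) -> ('A','B'), ('B','C'), ('C','D')
    iters = itertools.tee(iterable, n)
    for offset, it in enumerate(iters):
        # Advance the k-th copy by k positions.
        for _ in range(offset):
            next(it, None)
    return zip(*iters)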
Example #8
    def grid(self):
        '''
        returns an array (1 or 2 dimensional) of the
        center-point of each bin.

        Useful for converting a histogram to a line or
        surface plot:

        1D histogram:
            x, y = hist.grid(), hist
            plot(x, y)

        2D histogram:
            x, y = hist.grid()
            z = hist
            surface_plot(x, y, z)
        '''
        if self.dim() not in [1, 2]:
            raise Exception('only 1D and 2D histograms can return a grid.')
        if self.dim() == 1:
            it1, it2 = itertools.tee(iter(self.edges[0]))
            next(it2)
            return numpy.array([0.5 * (i1 + i2) for i1, i2 in itertools.izip(it1, it2)])
        if self.dim() == 2:
            grid = [[],[]]
            for d in [0, 1]:
                it1, it2 = itertools.tee(iter(self.edges[d]))
                next(it2)
                for i1, i2 in itertools.izip(it1, it2):
                    grid[d] += [0.5 * (i1 + i2)]
            return numpy.meshgrid(grid[0], grid[1])
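Stripped of the histogram class and numpy, the core of grid() is the usual tee-based pairwise pattern, used here to take midpoints of consecutive bin edges. A minimal, self-contained sketch of that step:

import itertools

edges = [0.0, 1.0, 2.0, 4.0]
lo, hi = itertools.tee(iter(edges))
next(hi)  # shift the second copy forward by one edge
centers = [0.5 * (a + b) for a, b in zip(lo, hi)]
print(centers)  # [0.5, 1.5, 3.0]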
Example #9
def vdl(signal, times, delay, initial_value=0.0):
    """Variable delay line which delays `signal` at 'times' with 'delay'.

    :param signal: Signal to be delayed.
    :type signal: Iterator
    :param delay: Delay.
    :type delay: Iterator
    :param initial_value: Sample to yield before first actual sample is yielded due to initial delay.

    .. note:: Times and delay should have the same unit, e.g. both in samples or both in seconds.
    """
    dt0, delay = cytoolz.peek(delay)
    times, _times = itertools.tee(times)

    # Yield initial value before interpolation kicks in
    # Note that this method, using tee, buffers all samples that will be discarded.
    # Therefore, room for optimization!
    n = 0
    if initial_value is not None:
        while next(_times) < dt0:
            n += 1
            yield initial_value

    times1, times2 = itertools.tee(times)
    interpolated = interpolate_linear(map(operator.add, times2, delay), signal, times1)
    yield from cytoolz.drop(n, interpolated)  # FIXME: move drop before interpolation, saves memory
Example #10
def learn():
    # preprocess the dialog and get index for its vocabulary
    # processed_dialog_lines, index_to_token = \
    #     get_processed_dialog_lines_and_index_to_token(CORPUS_PATH, PROCESSED_CORPUS_PATH, TOKEN_INDEX_PATH)

    processed_dialog_lines_en, processed_dialog_lines_de, index_to_token_en, index_to_token_de = \
        get_processed_dialog_lines_and_index_to_token(CORPUS_PATH_EN, CORPUS_PATH_DE, PROCESSED_CORPUS_PATH_EN, PROCESSED_CORPUS_PATH_DE, TOKEN_INDEX_PATH_EN, TOKEN_INDEX_PATH_DE)

    # dualize iterator
    # dialog_lines_for_w2v, dialog_lines_for_nn = tee(processed_dialog_lines)

    dialog_lines_for_w2v_en, dialog_lines_for_nn_en = tee(processed_dialog_lines_en)
    dialog_lines_for_w2v_de, dialog_lines_for_nn_de = tee(processed_dialog_lines_de)
    _logger.info('-----')

    # use the gensim realisation of word2vec instead of keras embeddings due to extra flexibility
    w2v_model_en = w2v.get_dialogs_model(W2V_PARAMS_EN, dialog_lines_for_w2v_en)
    w2v_model_de = w2v.get_dialogs_model(W2V_PARAMS_DE, dialog_lines_for_w2v_de)

    _logger.info('-----')

    nn_model = get_nn_model(token_dict_size=len(index_to_token_de))
    _logger.info('-----')

    train_model(nn_model, w2v_model_en, w2v_model_de, dialog_lines_for_nn_en,dialog_lines_for_nn_de, index_to_token_en, index_to_token_de)
Example #11
def process_corpus(corpus_path_en,corpus_path_de):
    iterable_dialog_lines_en = IterableSentences(corpus_path_en)
    iterable_dialog_lines_de = IterableSentences(corpus_path_de)

    tokenized_dialog_lines_en = get_tokenized_dialog_lines(iterable_dialog_lines_en)
    tokenized_dialog_lines_de = get_tokenized_dialog_lines(iterable_dialog_lines_de)

    tokenized_dialog_lines_for_voc_en, tokenized_dialog_lines_for_transform_en = tee(tokenized_dialog_lines_en)
    tokenized_dialog_lines_for_voc_de, tokenized_dialog_lines_for_transform_de = tee(tokenized_dialog_lines_de)

    tokens_voc_en = get_tokens_voc(tokenized_dialog_lines_for_voc_en)
    tokens_voc_de = get_tokens_voc(tokenized_dialog_lines_for_voc_de)

    transformed_dialog_lines_en = get_transformed_dialog_lines(tokenized_dialog_lines_for_transform_en, tokens_voc_en)
    transformed_dialog_lines_de = get_transformed_dialog_lines(tokenized_dialog_lines_for_transform_de, tokens_voc_de)

    # transformed_dialog_lines_en=list(transformed_dialog_lines_en)
    # transformed_dialog_lines_de=list(transformed_dialog_lines_de)


    # transformed_dialog_lines_de_sorted= [transformed_dialog_lines_de[i] for i in list(zip(*sorted(zip(transformed_dialog_lines_en, range(len(transformed_dialog_lines_en))), key=lambda x:len(x[0]))))[1]]
    # transformed_dialog_lines_en_sorted = sorted(transformed_dialog_lines_en, lambda x,y: 1 if len(x)>len(y) else -1 if len(x)<len(y) else 0)

    # for i in range(len(transformed_dialog_lines_de_sorted)):
    #     print "en sentence length: ",len(transformed_dialog_lines_de_sorted[i])
    #     print "de sentence length: ",len(transformed_dialog_lines_de_sorted[i])
    # _logger.info('Token voc size en = ' + str(len(tokens_voc_en)) + 'Token voc size en = ' + str(len(tokens_voc_de)))
    
    index_to_token_en = dict(enumerate(tokens_voc_en))
    index_to_token_de = dict(enumerate(tokens_voc_de))


    return transformed_dialog_lines_en,transformed_dialog_lines_de, index_to_token_en,index_to_token_de
Example #12
def aggregate_prefixes(
    iterables: List[Iterable[str]],
    delimiter: str = ' ') -> Iterable[Tuple[str, Optional[list]]]:
  """Aggregate iterables into nested tuples with shared prefixes.

  The iterable streams will be tee'd and consumed like so:
  0 1 2 3 4 ...
  | | | |/|
  | | |/| |
  | |/| | |
  |/| | | |
  x x x x x ...
  """
  n_iterables = len(iterables)
  if n_iterables == 1:
    yield from iter_util.iter_alphabetical_prefixes(
        iterables, delimiter=delimiter)
    return
  carry, iterables[-1] = itertools.tee(iterables[-1])
  for i in reversed(range(1, n_iterables - 1)):
    carry, iterables[i] = itertools.tee(
        iter_util.ensure_prefix(iterables[i], carry, delimiter=delimiter))
  iterables[0] = iter_util.ensure_prefix(
      iterables[0], carry, delimiter=delimiter)
  yield from iter_util.iter_alphabetical_prefixes(
      iterables, delimiter=delimiter)
Example #13
File: core.py Project: Oge77/dedupe
def iunzip(iterable, internal_length): # pragma: no cover
    """Iunzip is the same as zip(*iter) but returns iterators, instead of 
    expand the iterator. Mostly used for large sequence"""

    _tmp, iterable = itertools.tee(iterable, 2)
    iters = itertools.tee(iterable, internal_length)
    return (map(operator.itemgetter(i), it) for i, it in enumerate(iters))
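A small usage sketch of the helper above (Python 3, where map already returns an iterator); the function is repeated so the sketch runs on its own:

import itertools
import operator

def iunzip(iterable, internal_length):
    _tmp, iterable = itertools.tee(iterable, 2)
    iters = itertools.tee(iterable, internal_length)
    return (map(operator.itemgetter(i), it) for i, it in enumerate(iters))

pairs = [(1, 'a'), (2, 'b'), (3, 'c')]
numbers, letters = iunzip(pairs, 2)
print(list(numbers))  # [1, 2, 3]
print(list(letters))  # ['a', 'b', 'c']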
Example #14
 def threewise(iterable):
     a,_ = itertools.tee(iterable)
     b,c = itertools.tee(_)
     next(b, None)
     next(c, None)
     next(c, None)
     return itertools.izip(a,b,c)
Example #15
def summarise_distributions(distributions, options):

    measure = "frequency"
    if options["summary_type"] in ["zipfian","entropy"]:
        measure = "unsigned_information"

    kmer_intervals = Distribution.get_intervals(distributions, options["num_processes"])

    #print "summarising %s , %s across %s"%(measure, str(kmer_intervals), str(distributions))
    print "summarising %s , %d kmers across %s"%(measure, len(kmer_intervals), str(distributions))


    sample_measures = Distribution.get_projections(distributions, kmer_intervals, measure, False, options["num_processes"])
    zsample_measures = itertools.izip(*sample_measures)
    sample_name_iter = [tuple([os.path.splitext(os.path.basename(distribution))[0] for distribution in distributions])]
    zsample_measures = itertools.chain(sample_name_iter, zsample_measures)
    interval_name_iter = itertools.chain([("kmer_pattern")],kmer_intervals)
    
    outfile=open(options["output_filename"], "w")

    if options["summary_type"] in ["entropy", "frequency"]:
        zsample_measures_with_rownames = itertools.izip(interval_name_iter, zsample_measures)
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_measure[0], string.join((str(item) for item in interval_measure[1]),"\t"))
        outfile.close()
    elif options["summary_type"] in ["ranks", "zipfian"]:
        # duplicate interval_name_iter - needed 3 times
        interval_name_iter_dup = itertools.tee(interval_name_iter, 3)

        # triplicate zsample_measures (0 used to get ranks; 1 used to output measures; 2 used to get distances)
        zsample_measures_dup = itertools.tee(zsample_measures,3)
        ranks = Distribution.get_rank_iter(zsample_measures_dup[0])

        # duplicate ranks (0 used to output; 1 used to get distances)
        ranks_dup = itertools.tee(ranks, 2)
        ranks_with_rownames = itertools.izip(interval_name_iter_dup[0], ranks_dup[0])

        # output ranks
        print >> outfile , "*** ranks *** :"
        for interval_rank in ranks_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_rank[0], string.join((str(item) for item in interval_rank[1]),"\t"))

        # output measures
        print >> outfile , "*** entropies *** :"
        zsample_measures_with_rownames = itertools.izip(interval_name_iter_dup[1], zsample_measures_dup[1])
        for interval_measure in zsample_measures_with_rownames:
            print >> outfile, "%s\t%s"%("%s"%interval_measure[0], string.join((str(item) for item in interval_measure[1]),"\t"))

        # get distances
        print >> outfile , "*** distances *** :"
        (distance_matrix, point_names_sorted) = Distribution.get_zipfian_distance_matrix(zsample_measures_dup[2], ranks_dup[1])
        Distribution.print_distance_matrix(distance_matrix, point_names_sorted, outfile)
    else:
        print "warning, unknown summary type %(summary_type)s, no summary available"%options

    outfile.close()
Example #16
 def _median(iterable, s=sorted, d=truediv, int=int, count=count):
     i1, i2 = tee(s(iterable))
     e = d(count(i1) - 1, 2)
     p = int(e)
     if e % 2 == 0:
         yield slice(i2, p)
     else:
         i3, i4 = tee(i2)
         yield truediv(slice(i3, p) + slice(i4, p + 1), 2)
Example #17
def build_inputs(args, interval, loops, db_user, db_password, db_host,
                 db_name, db_retries, data_sources):
  scan_vmstat = None
  scan_iostat = None
  inputs = []
  devices = iostat_get_devices()
  parse_counters = { 'iostat' : 0, 'vmstat' : 0, 'my.status' : 0 }

  if data_sources:
    f = open(data_sources)
    args.extend([l[:-1] for l in f.xreadlines()])

  expanded_args = []

  for arg in ['timestamp', 'timer', 'counter']:
    parse_args(arg, parse_counters, expanded_args, devices)

  for dev in devices:
    for col in iostat_cols:
      parse_args('iostat.%s.%s' % (dev, col), parse_counters, expanded_args, devices)
      parse_args('rate.iostat.%s.%s' % (dev, col), parse_counters, expanded_args, devices)

  for col in vmstat_cols:
    parse_args('vmstat.%s' % col, parse_counters, expanded_args, devices)
    parse_args('rate.vmstat.%s' % col, parse_counters, expanded_args, devices)

  for col in get_my_cols(db_user, db_password, db_host, db_name):
    parse_args('my.status.%s' % col, parse_counters, expanded_args, devices)
    parse_args('rate.my.status.%s' % col, parse_counters, expanded_args, devices)

  for arg in args:
    parse_args(arg, parse_counters, expanded_args, devices)

  tee_vmstat, tee_iostat, tee_mystat = None, None, None

  if parse_counters['vmstat']:
    scan_vmstat = ScanFork('vmstat -n %d %d' % (interval, loops+1), 2)
    tee_vmstat = itertools.tee(scan_vmstat, parse_counters['vmstat'])

  if parse_counters['iostat']:
    scan_iostat = ScanFork('iostat -kx %d %d' % (interval, loops+1), 0)
    tee_iostat = itertools.tee(scan_iostat, parse_counters['iostat'])

  if parse_counters['my.status']:
    scan_mystat = ScanMysql(db_user, db_password, db_host, db_name,
                            'SHOW GLOBAL STATUS', db_retries, 'Foo 0')
    tee_mystat = itertools.tee(scan_mystat, parse_counters['my.status'])

  # print expanded_args

  source_counters = { 'iostat' : 0, 'vmstat' : 0, 'my.status' : 0 }
  for arg in expanded_args:
    make_data_inputs(arg, inputs, source_counters, interval, db_user,
                     db_password, db_host, db_name, db_retries,
                     tee_vmstat, tee_iostat, tee_mystat)

  return inputs
Example #18
 def test_tee_bug1(self):
     import itertools
     a, b = itertools.tee('abcde')
     x = a.next()
     assert x == 'a'
     c, d = itertools.tee(a)
     x = c.next()
     assert x == 'b'
     x = d.next()
     assert x == 'b'
Example #19
 def test_tee_bug1(self):
     import itertools
     a, b = itertools.tee('abcde')
     x = next(a)
     assert x == 'a'
     c, d = itertools.tee(a)
     x = next(c)
     assert x == 'b'
     x = next(d)
     assert x == 'b'
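Both versions of this test exercise tee-ing an iterator that is itself a tee copy. A related caveat from the itertools documentation is worth keeping next to them: once tee() has been called, advancing the original iterator bypasses the tee buffers, so the copies silently miss those items. A small sketch:

import itertools

src = iter('abcde')
a, b = itertools.tee(src)
next(src)       # consume 'a' from the original iterator directly
print(next(a))  # 'b' -- the tee copies never see 'a'
print(next(b))  # 'b'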
Example #20
def iunzip(iterable):
    """Iunzip is the same as zip(*iter) but returns iterators, instead of
    expand the iterator. Mostly used for large sequence

    See <https://gist.github.com/1063340>.
    """
    from operator import itemgetter
    _tmp, iterable = itertools.tee(iterable, 2)
    iters = itertools.tee(iterable, len(_tmp.next()))
    return (itermap(itemgetter(i), it) for i, it in enumerate(iters))
Example #21
    def analyze(self, keys, total=0):
        """

        :param keys:
        :param progress:
        :return:
        """
        key_stat = {
            "headers": ["Match", "Count", "Useful", "Real", "Ratio", "Encoding", "Min", "Max", "Avg"],
            "data": [],
        }

        progress = tqdm(total=total, mininterval=1, desc="Processing keys", leave=False)

        for pattern, data in keys.items():
            used_bytes_iter, aligned_iter, encoding_iter = tee(
                progress_iterator((StringEntry(value=x["name"]) for x in data), progress), 3
            )

            total_elements = len(data)
            if total_elements == 0:
                continue

            aligned = sum(obj.aligned for obj in aligned_iter)
            used_bytes_generator = (obj.useful_bytes for obj in used_bytes_iter)
            useful_iter, min_iter, max_iter, mean_iter = tee(used_bytes_generator, 4)

            preferred_encoding = pref_encoding((obj.encoding for obj in encoding_iter), redis_encoding_id_to_str)
            min_value = min(min_iter)
            if total_elements < 2:
                avg = min_value
            else:
                avg = statistics.mean(mean_iter)

            used_user = sum(useful_iter)

            stat_entry = [
                pattern,
                total_elements,
                used_user,
                aligned,
                aligned / used_user,
                preferred_encoding,
                min_value,
                max(max_iter),
                avg,
            ]
            key_stat["data"].append(stat_entry)

        key_stat["data"].sort(key=lambda x: x[1], reverse=True)
        key_stat["data"].append(make_total_row(key_stat["data"], ["Total:", sum, sum, sum, 0, "", 0, 0, 0]))

        progress.close()

        return ["key stats", key_stat]
Example #22
def triple_wise_periodic(iterable):
    """s -> (s0,s1,s2), (s1,s2,s3), ..., (sn,s0,s1)

    modified from example in documentation
    """
    a, _b = tee(iterable)
    b, c = tee(cycle(_b))
    next(b, None)
    next(c, None)
    next(c, None)

    return zip(a, b, c)
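A small usage sketch of the periodic triple window; the recipe is repeated with its imports so it runs on its own:

from itertools import cycle, tee

def triple_wise_periodic(iterable):
    a, _b = tee(iterable)
    b, c = tee(cycle(_b))
    next(b, None)
    next(c, None)
    next(c, None)
    return zip(a, b, c)

print(list(triple_wise_periodic([0, 1, 2, 3])))
# [(0, 1, 2), (1, 2, 3), (2, 3, 0), (3, 0, 1)]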
Example #23
def generate_all_graphs(full_graph, detailed=False, min_year=2004, max_year=2014):
    graph = get_empty_graph()
    for curnodes, curedges, criterion in generate_all_generators(full_graph, min_year, max_year):
        # updating graph
        if not detailed:
            users, busin = add_nodes_and_edges(full_graph, graph, criterion, curnodes=curnodes, curedges=curedges)
            yield users, busin, graph
        else:
            curnodes, curnodes_b = tee(curnodes)
            curedges, curedges_b = tee(curedges)
            users, busin = add_nodes_and_edges(full_graph, graph, criterion, curnodes=curnodes, curedges=curedges)
            yield users, busin, graph, curnodes_b, curedges_b
Example #24
 def _select_data_matrices(self, iterable_pos, iterable_neg,
                           n_active_learning_iterations=2,
                           size_positive=-1,
                           size_negative=100,
                           lower_bound_threshold_positive=-1,
                           upper_bound_threshold_positive=1,
                           lower_bound_threshold_negative=-1,
                           upper_bound_threshold_negative=1):
     # select the initial ids simply as the first occurrences
     if size_positive != -1:
         positive_ids = range(size_positive)
     if size_negative != -1:
         negative_ids = range(size_negative)
     # iterate: select instances according to current model and create novel
     # data matrix to fit the model in next round
     for i in range(n_active_learning_iterations):
         # make data matrix on selected instances
         # if this is the first iteration or we need to select positives
         if i == 0 or size_positive != -1:
             iterable_pos, iterable_pos_, iterable_pos__ = tee(iterable_pos, 3)
             if size_positive == -1:  # if we take all positives
                 data_matrix_pos = self._data_matrix(iterable_pos_, fit_vectorizer=self.fit_vectorizer)
             else:  # otherwise use selection
                 data_matrix_pos = self._data_matrix(selection_iterator(iterable_pos_, positive_ids),
                                                     fit_vectorizer=self.fit_vectorizer)
         # if this is the first iteration or we need to select negatives
         if i == 0 or size_negative != -1:
             iterable_neg, iterable_neg_, iterable_neg__ = tee(iterable_neg, 3)
             if size_negative == -1:  # if we take all negatives
                 data_matrix_neg = self._data_matrix(iterable_neg_, fit_vectorizer=False)
             else:  # otherwise use selection
                 data_matrix_neg = self._data_matrix(selection_iterator(iterable_neg_, negative_ids),
                                                     fit_vectorizer=False)
         # assemble data matrix
         data_matrix, y = self._assemble_data_matrix(data_matrix_pos, data_matrix_neg)
         # stop the fitting procedure at the last-1 iteration and return data_matrix,y
         if i == n_active_learning_iterations - 1:
             break
         # fit the estimator on selected instances
         self.estimator.fit(data_matrix, y)
         # use the trained estimator to select the next instances
         if size_positive != -1:
             positive_ids = self._bounded_selection(iterable_pos__,
                                                    size=size_positive,
                                                    lower_bound_threshold=lower_bound_threshold_positive,
                                                    upper_bound_threshold=upper_bound_threshold_positive)
         if size_negative != -1:
             negative_ids = self._bounded_selection(iterable_neg__,
                                                    size=size_negative,
                                                    lower_bound_threshold=lower_bound_threshold_negative,
                                                    upper_bound_threshold=upper_bound_threshold_negative)
     return data_matrix, y
Example #25
def build_nn_data(data, num_questions, use_correct=True, use_hints=False):
    """
    Build data ready for RNN input.

    :param DataFrame data: User interactions for all users in DataFrame format as
        returned by loading functions in this package.
    :param int num_questions: number of questions in the full dataset
    :param bool use_correct: If True, records responses (before compression) as a
        2 * num_questions one-hot vector where one dimension corresponds to correct
        and one dimension corresponds to incorrect. If False, records responses
        (before compression) as a num_questions one-hot vector where each dimension
        corresponds to having *answered* a question, whether correctly or incorrectly.
    :param bool use_hints: If True, records responses ternarily: Correct, Wrong with
        No Hints, and Used a Hint.
    :return: list of all users data ready for RNN input.
    :rtype: list[UserData]
    """
    all_users_data = []
    data.sort([USER_IDX_KEY, TIME_IDX_KEY], inplace=True)

    # use_hints => use_correct
    use_correct = use_correct or use_hints

    for user_id, user in data.groupby(USER_IDX_KEY):

        x = []  # Input X denoting position for one hot
        y = []  # Mask Y to mask the probabilities all questions except the next one
        t = []  # The truth about the correctness of the next question

        xiter, yiter = its.tee(user[ITEM_IDX_KEY].values)
        next(yiter, None)
        this_correct_iter, next_correct_iter = its.tee(user[CORRECT_KEY].values)
        next(next_correct_iter, None)
        if use_hints:
            hints_iter = user[HINT_COUNT_KEY].values
        else:
            hints_iter = its.cycle([0])
        for this_skill, next_skill, this_correct, next_correct, hint in its.izip(
                xiter, yiter, this_correct_iter, next_correct_iter, hints_iter):
            # The first num_questions dimensions refer to incorrect responses, the
            # second num_questions dimensions to correct responses. *Unless*
            # use_correct is False, in which case, only num_questions dimensions
            # are used, one for answering (correctly or incorrectly) each question
            x.append(this_skill + num_questions * this_correct * (hint == 0) * use_correct +
                     2 * num_questions * (hint > 0) * use_hints)
            y.append(next_skill)
            t.append(next_correct)

        # Append it to a list
        all_users_data.append(UserData(length=len(x), history=x, next_answer=y, truth=t))

    return all_users_data
Example #26
def test():
    size = 10
    randomiterator, backup = itertools.tee(rlist(size, 0, size))
    sortediterator = iterator_mergesort(randomiterator, size)
    try:
        assert list(sortediterator) == sorted(backup)
    except AssertionError as e:
        print "Test failed."
        randomlist, randomlist1 = itertools.tee(rlist(10,0,10))
        print list(iterator_mergesort(randomlist, 10))
        print sorted(randomlist1)
        sys.exit(0)
    print "All tests pass."
Example #27
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2,s3), ..."
    split1, split2 = itertools.tee(iterable)
    a, b = itertools.tee(split1)
    test1, test2 = itertools.tee(split2)
    next(b, None)
    if next(test1, None) is None:
        ret = py23_zip(a, b)  # Returns empty list
    elif next(test2, None) is not None and next(test2, None) is None:
        ret = py23_zip(a, [None])  # Return at least one value
    else:
        ret = py23_zip(a, b)
    del test1, test2, split2
    return ret
Example #28
File: qsort.py Project: daineX/Nim
def iter_qsort(it):
    it, peek = tee(it)
    try:
        next(peek), next(peek)
    except StopIteration:
        # Zero or one element left: yield whatever remains. (Yielding next(it) on an
        # empty iterator would let StopIteration escape the generator, which Python
        # 3.7+ turns into a RuntimeError whenever a partition is empty.)
        for n in it:
            yield n
    else:
        pivot = next(it)
        lesser, greater = tee(it)
        for n in iter_qsort(n for n in lesser if n <= pivot):
            yield n
        yield pivot
        for n in iter_qsort(n for n in greater if n > pivot):
            yield n
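A quick usage sketch; the generator is repeated (with the same empty-partition guard as above) so the sketch runs on its own:

from itertools import tee

def iter_qsort(it):
    it, peek = tee(it)
    try:
        next(peek), next(peek)
    except StopIteration:
        for n in it:  # zero or one element left
            yield n
    else:
        pivot = next(it)
        lesser, greater = tee(it)
        for n in iter_qsort(n for n in lesser if n <= pivot):
            yield n
        yield pivot
        for n in iter_qsort(n for n in greater if n > pivot):
            yield n

print(list(iter_qsort(iter([3, 1, 4, 1, 5, 9, 2, 6]))))
# [1, 1, 2, 3, 4, 5, 6, 9]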
Example #29
    def test_tee_optimization(self):
        import itertools

        a, b = itertools.tee(iter('foobar'))
        c, d = itertools.tee(b)
        assert c is b
        assert a is not c
        assert a is not d
        assert c is not d
        res = list(a)
        assert res == list('foobar')
        res = list(c)
        assert res == list('foobar')
        res = list(d)
        assert res == list('foobar')
Example #30
def sine_cosine_series():
    def deferred_sin():
        for i in sinx_temp:
            yield i
    def deferred_cos():
        for i in cosx_temp:
            yield i

    sinx_result, sinx_copy1 = tee(deferred_sin(), 2)
    cosx_result, cosx_copy1 = tee(deferred_cos(), 2)

    sinx_temp = intgpower(cosx_copy1)
    cosx_temp = minuspower(constpower(1), intgpower(sinx_copy1))

    return sinx_result, cosx_result
Example #31
    def __init__(self, network_params, weight_params, stdp_params, total_time, DoG_params=None,
                 spike_times_learn=None, spike_times_train=None, spike_times_test=None,
                 y_train=None, y_test=None, device='GPU', tau=5):
        """
            Initialisation of SDNN

            Input:            
            - network_params: A list of dictionaries with the following keys:                
                -'Type': A string specifying which kind of layer this is (either 'input', 'conv' and 'pool')
                -'num_filters': an int specifying the depth (number of filters) of this layer
                -'filter_size': an int specifying the height and width of the filter window for 
                                the previous layer to this layer (only on 'conv' and  'pool')
                -'th': an np.float32 specifying the threshold of this layer (only on 'conv' and  'pool')
                -'stride': an int specifying the stride for this layer (only on 'pool')
                -'pad': an int specifying the pad for this layer (only on 'input')
                -'H_layer': an int specifying the height of this layer (only on 'input')
                -'W_layer': an int specifying the width of this layer (only on 'input') 
            - weight_params: A dictionary with the following keys:                
                - 'mean': the mean for initialising the weights
                - 'std': the std for initialising the weights
            - stdp_params: A dictionary with the following keys:                                
                - 'max_iter': an int specifying the maximum number of iterations allowed on learning
                - 'max_learn_iter': a list of ints specifying the maximum number of iterations allowed for training each layer (len = number of layers)
                - 'stdp_per_layer': a list of ints specifying the maximum number of STDP updates per layer (len = number of layers)
                - 'offset_STDP': a list of ints specifying the STDP offset per layer (len = number of layers)
                - 'a_minus': an np.float32 numpy array specifying the learning rate when no causality 
                - 'a_plus': an np.float32 numpy array specifying the learning rate when there is causality 
            - total_time: An int specifying the number of time steps per image
            - spike_times_learn: A list of strings with a valid absolute or relative path to the folders with 
                                 the learning .jpg images OR 
                                 An uint8 array with the learning spike times of shape (N_lr, H_in, W_in, M_in). 
                                 Axis 0 is each of the images
            - spike_times_train: A list of strings with a valid absolute or relative path to the folders with 
                                 the training .jpg images OR 
                                 An uint8 array with the training spike times of shape (N_tr, H_in, W_in, M_in). 
                                 Axis 0 is each of the images
            - spike_times_test: A list of strings with a valid absolute or relative path to the folders with 
                                 the testing .jpg images OR 
                                 An uint8 array with the testing spike times of shape (N_ts, H_in, W_in, M_in). 
                                 Axis 0 is each of the images   
            - DoG_params: None OR A dictionary with the following keys:
                -'img_size': A tuple of integers with the dimensions to which the images are to be resized 
                -'DoG_size': An int with the size of the DoG filter window size
                -'std1': A float with the standard deviation 1 for the DoG filter
                -'std2': A float with the standard deviation 2 for the DoG filter                  
            - tau: For STDP window
        """

        # --------------------------- DoG Filter Parameters -------------------#
        if DoG_params is not None:
            self.DoG = True
            self.img_size = DoG_params['img_size']
            self.filt = DoG(DoG_params['DoG_size'], DoG_params['std1'], DoG_params['std2'])
        else:
            self.DoG = False

        # --------------------------- Network Initialisation -------------------#
        # Total time and number of layers
        self.num_layers = len(network_params)
        self.learnable_layers = []
        self.total_time = total_time

        # Layers Initialisation
        self.network_struc = []
        self.init_net_struc(network_params)
        self.layers = []
        self.init_layers()

        # Weights Initialisation
        self.weight_params = weight_params
        self.weights = []
        self.init_weights()

        # Dimension Check
        self.check_dimensions()

        # --------------------------- Learning Parameters -------------------#
        # Learning layer parameters
        self.max_iter = stdp_params['max_iter']
        self.learning_layer = self.learnable_layers[0]
        self.max_learn_iter = stdp_params['max_learn_iter']
        self.curr_lay_idx = 0
        self.counter = 0
        self.curr_img = 0

        #STDP params
        self.stdp_per_layer = stdp_params['stdp_per_layer']
        self.stdp_a_minus = stdp_params['a_minus']
        self.stdp_a_plus = stdp_params['a_plus']
        self.offsetSTDP = stdp_params['offset_STDP']

        # --------------------------- CUDA Parameters -------------------#
        self.device = device
        if self.device == 'GPU':
            self.thds_per_dim = 8  # (Use 8 if doesn't work)

        # --------------------------- Input spike times -------------------#
        # Generate Iterators with the full path to the images in each set OR reference the spike times
        if self.DoG:
            self.spike_times_learn, self.y_learn = self.gen_iter_paths(spike_times_learn)
            self.spike_times_train, self.y_train = self.gen_iter_paths(spike_times_train)
            self.spike_times_test, self.y_test = self.gen_iter_paths(spike_times_test)
            self.num_img_learn = self.y_learn.size
            self.num_img_train = self.y_train.size
            self.num_img_test = self.y_test.size
            self.spike_times_train, self.learn_buffer = tee(self.spike_times_train)
        else:
            self.spike_times_learn = spike_times_learn
            self.num_img_learn = spike_times_learn.shape[0]
            self.spike_times_train = spike_times_train
            self.num_img_train = spike_times_train.shape[0]
            self.spike_times_test = spike_times_test
            self.num_img_test = spike_times_test.shape[0]
            self.y_train = y_train
            self.y_test = y_test

        # --------------------------- Output features -------------------#
        self.features_train = []
        self.features_test = []

        self.tau = tau
Example #32
def eachCons(iterable):
    a, b = tee(iterable)
    next(b, None)
    return izip(a, b)
Example #33
 def _get_choices(self):
     if hasattr(self._choices, 'next'):
         choices, self._choices = tee(self._choices)
         return choices
     else:
         return self._choices
Example #34
File: gsea.py Project: jamesdj/gsea
def consecutive_pairs(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)
Example #35
    def pipe(
        self,
        texts,
        as_tuples=False,
        n_threads=-1,
        batch_size=1000,
        disable=[],
        cleanup=False,
        component_cfg=None,
        n_process=1,
    ):
        """Process texts as a stream, and yield `Doc` objects in order.

        texts (iterator): A sequence of texts to process.
        as_tuples (bool): If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        batch_size (int): The number of texts to buffer.
        disable (list): Names of the pipeline components to disable.
        cleanup (bool): If True, unneeded strings are freed to control memory
            use. Experimental.
        component_cfg (dict): An optional dictionary with extra keyword
            arguments for specific components.
        n_process (int): Number of processors to process texts, only supported
            in Python3. If -1, set `multiprocessing.cpu_count()`.
        YIELDS (Doc): Documents in the order of the original text.

        DOCS: https://spacy.io/api/language#pipe
        """
        if is_python2 and n_process != 1:
            user_warning(Warnings.W023)
            n_process = 1
        if n_threads != -1:
            deprecation_warning(Warnings.W016)
        if n_process == -1:
            n_process = mp.cpu_count()
        if as_tuples:
            text_context1, text_context2 = itertools.tee(texts)
            texts = (tc[0] for tc in text_context1)
            contexts = (tc[1] for tc in text_context2)
            docs = self.pipe(
                texts,
                batch_size=batch_size,
                disable=disable,
                n_process=n_process,
                component_cfg=component_cfg,
            )
            for doc, context in izip(docs, contexts):
                yield (doc, context)
            return
        if component_cfg is None:
            component_cfg = {}

        pipes = (
            []
        )  # contains functools.partial objects to easily create multiprocess worker.
        for name, proc in self.pipeline:
            if name in disable:
                continue
            kwargs = component_cfg.get(name, {})
            # Allow component_cfg to overwrite the top-level kwargs.
            kwargs.setdefault("batch_size", batch_size)
            if hasattr(proc, "pipe"):
                f = functools.partial(proc.pipe, **kwargs)
            else:
                # Apply the function, but yield the doc
                f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
            pipes.append(f)

        if n_process != 1:
            docs = self._multiprocessing_pipe(texts, pipes, n_process,
                                              batch_size)
        else:
            # if n_process == 1, no processes are forked.
            docs = (self.make_doc(text) for text in texts)
            for pipe in pipes:
                docs = pipe(docs)

        # Track weakrefs of "recent" documents, so that we can see when they
        # expire from memory. When they do, we know we don't need old strings.
        # This way, we avoid maintaining an unbounded growth in string entries
        # in the string store.
        recent_refs = weakref.WeakSet()
        old_refs = weakref.WeakSet()
        # Keep track of the original string data, so that if we flush old strings,
        # we can recover the original ones. However, we only want to do this if we're
        # really adding strings, to save up-front costs.
        original_strings_data = None
        nr_seen = 0
        for doc in docs:
            yield doc
            if cleanup:
                recent_refs.add(doc)
                if nr_seen < 10000:
                    old_refs.add(doc)
                    nr_seen += 1
                elif len(old_refs) == 0:
                    old_refs, recent_refs = recent_refs, old_refs
                    if original_strings_data is None:
                        original_strings_data = list(self.vocab.strings)
                    else:
                        keys, strings = self.vocab.strings._cleanup_stale_strings(
                            original_strings_data)
                        self.vocab._reset_cache(keys, strings)
                        self.tokenizer._reset_cache(keys)
                    nr_seen = 0
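The as_tuples branch above uses tee to split one stream of (text, context) pairs into a text stream and a context stream that are consumed in lockstep. A minimal, spaCy-free sketch of that pattern:

import itertools

pairs = [("first text", {"id": 1}), ("second text", {"id": 2})]
left, right = itertools.tee(pairs)
texts = (text for text, _ in left)
contexts = (context for _, context in right)

for text, context in zip(texts, contexts):
    print(context["id"], text.upper())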
Example #36
import itertools

it = itertools.chain([1, 2, 3], [4, 5, 6])
print(list(it))

#
it = itertools.repeat('안녕', 3)
print(list(it))

#
it = itertools.cycle([1, 2])
result = [next(it) for _ in range(10)]
print(result)

#
it1, it2, it3 = itertools.tee(['하나', '둘'], 3)
print(list(it1))
print(list(it2))
print(list(it3))

#
keys = ['하나', '둘', '셋']
values = [1, 2]

normal = list(zip(keys, values))
print('zip:', normal)

it = itertools.zip_longest(keys, values, fillvalue='없음')
longest = list(it)
print('zip_longest:', longest)
Example #37
def main(filename):
    with open(filename) as boarding_passes:
        data_for_question_1, data_for_question_2 = itertools.tee(
            boarding_passes)
        print(highest_seat_id(of_all_the_ids_from(data_for_question_1)))
        print(our_seat(of_all_the_ids_from(data_for_question_2)))
Example #38
 def partition(pred, iterable):
     """From itertools documentation"""
     t1, t2 = tee(iterable)
     return list(filterfalse(pred, t1)), list(filter(pred, t2))
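A small self-contained usage sketch of the recipe above; note the order of the return values, with items failing the predicate first:

from itertools import filterfalse, tee

def partition(pred, iterable):
    """From itertools documentation"""
    t1, t2 = tee(iterable)
    return list(filterfalse(pred, t1)), list(filter(pred, t2))

evens, odds = partition(lambda n: n % 2, range(10))
print(evens)  # [0, 2, 4, 6, 8]
print(odds)   # [1, 3, 5, 7, 9]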
Example #39
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)
Example #40
def prev_this_next(it):
    a, b, c = itertools.tee(it,3)
    next(c)
    return zip(itertools.chain([None], a), b, itertools.chain(c, [None]))
Example #41
def _pairwise(iterable):
    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
Example #42
 def result_key_iters(self):
     teed_results = tee(self, len(self.result_keys))
     return [ResultKeyIterator(i, result_key) for i, result_key
             in zip(teed_results, self.result_keys)]
Example #43
def lookahead(it):
    it1, it2 = tee(iter(it))
    next(it2)
    return zip_longest(it1, it2)
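A small self-contained check of the helper above, including its behaviour at the end of the sequence; note that next(it2) has no default here, so an empty input raises StopIteration:

from itertools import tee, zip_longest

def lookahead(it):
    it1, it2 = tee(iter(it))
    next(it2)
    return zip_longest(it1, it2)

print(list(lookahead([1, 2, 3])))  # [(1, 2), (2, 3), (3, None)]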
Example #44
    def fill_filekind(self, fk):
        now = datetime.utcnow()

        headers = {
            'status': self.status,
            'date': now.strftime('%a, %d %b %Y %H:%M:%S GMT'),
            'server': 'Python/HTTPretty',
            'connection': 'close',
        }

        if self.forcing_headers:
            headers = self.forcing_headers

        if self.adding_headers:
            headers.update(self.normalize_headers(self.adding_headers))

        headers = self.normalize_headers(headers)
        status = headers.get('status', self.status)
        if self.body_is_callable:
            status, headers, self.body = self.callable_body(
                self.request, self.info.full_url(), headers)
            headers = self.normalize_headers(headers)
            # TODO: document this behavior:
            if 'content-length' not in headers:
                headers.update({'content-length': len(self.body)})

        string_list = [
            'HTTP/1.1 %d %s' % (status, STATUSES[status]),
        ]

        if 'date' in headers:
            string_list.append('date: %s' % headers.pop('date'))

        if not self.forcing_headers:
            content_type = headers.pop('content-type',
                                       'text/plain; charset=utf-8')

            content_length = headers.pop('content-length', self.body_length)

            string_list.append('content-type: %s' % content_type)
            if not self.streaming:
                string_list.append('content-length: %s' % content_length)

            server = headers.pop('server', None)
            if server:
                string_list.append('server: %s' % server)

        for k, v in headers.items():
            string_list.append('{0}: {1}'.format(k, v), )

        for item in string_list:
            fk.write(utf8(item) + b'\n')

        fk.write(b'\r\n')

        if self.streaming:
            self.body, body = itertools.tee(self.body)
            for chunk in body:
                fk.write(utf8(chunk))
        else:
            fk.write(utf8(self.body))

        fk.seek(0)
Example #45
 def pairwise(iterable):
     # generates tuples of words in order, i.e Sentence = w1,w2,w3,...wn --> pairwise(sentence) = (w1,w2),(w2,w3),(w3,w4),...
     a, b = itertools.tee(iterable)
     next(b, None)
     return zip(a, b)
Example #46
 def pairwise(iterable):
     from itertools import tee
     a, b = tee(iterable)
     next(b, None)
     return zip(a, b)
Example #47
    def train(self, data_iterator):
        '''
        Train a pytorch model on a worker and send asynchronous updates
        to parameter server
        '''
        print(self.master_url)
        print(self.optimizer_config)
        data_all, target_all = tee(data_iterator, 2)
        x_train = np.asarray([x for x, y in data_all])
        y_train = np.asarray([y for x, y in target_all])
        # print(self.frequency)
        # print('-------worker open----')
        # f = open('model.pkl', 'wb')
        # print(len(self.serialized_network))
        # f.write(self.serialized_network)
        # f.close()
        # print('-----close f')
        # print(self.serialized_network.state_dict())
        # print('`````````'

        if x_train.size == 0:
            return
        # print('picke load model')
        # model = pickle.loads(self.serialized_network)
        # print('picke load model hhh')
        # model = torch.load('model.pkl')
        # model = nn.Sequential(OrderedDict([
        #   ('conv1', nn.Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))),
        #   ('conv2', nn.Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))),
        #   ('conv2_drop', nn.Dropout2d(p=0.5)),
        #   ('fc1', nn.Linear(in_features=320, out_features=50, bias=True)),
        #   ('fc2', nn.Linear(in_features=50, out_features=10, bias=True))

        # ]))
        model = self.network
        epoch_num = self.train_config['epoch']
        batch_size = self.train_config['batch_size']
        sample_num = x_train.shape[0]
        batch_num = int(np.ceil(sample_num / batch_size)) - 5

        use_gpu = torch.cuda.is_available()
        if use_gpu:
            model.cuda()
        '''grained of updates, frequency_num controls more concise grain of asyn training, leave for future work.'''
        cnt = 0
        if self.frequency == 'epoch':
            for epoch in range(epoch_num):
                state_dict_before_training = self.get_post.get_server_state_dict(
                )
                print('get_server_state_dict')
                # print(state_dict_before_training)
                model.load_state_dict(state_dict_before_training)
                optimizer = get_optimizer(self.worker_optimizer,
                                          self.optimizer_config,
                                          model.parameters())
                model.train()
                for idx in range(batch_num):
                    data = x_train[idx *
                                   batch_size:min((idx + 1) *
                                                  batch_size, sample_num)]
                    target = y_train[idx *
                                     batch_size:min((idx + 1) *
                                                    batch_size, sample_num)]
                    # print(target)
                    # print(type(target))
                    if use_gpu:
                        data = torch.from_numpy(data).cuda()
                        target = torch.from_numpy(target).cuda()
                    else:
                        data = torch.from_numpy(data)
                        target = torch.from_numpy(target)

                    data = Variable(data)
                    target = Variable(target)
                    # print(data.size())
                    # print(target.size())
                    optimizer.zero_grad()
                    # print(optimizer)
                    output = model(data)
                    # print(output)
                    # print(target)
                    loss = get_loss(self.loss_function, output, target,
                                    use_gpu)
                    # loss = F.nll_loss(output, target)
                    # print(idx, '     ',loss)
                    loss.backward()
                    optimizer.step()
                    # optimizer.zero_grad()

                cnt = cnt + 1
                if cnt == self.frequency_num:
                    eval_output = model(data)
                    eval_loss = get_loss(self.loss_function, eval_output,
                                         target, use_gpu)
                    print(epoch, '------------', eval_loss)
                    state_dict_after_training = model.state_dict()
                    # print(state_dict_after_training, 'AAAAAAAAAAAAAAAAAAAAAAAAA')
                    updates = compute_updates(state_dict_before_training,
                                              state_dict_after_training)
                    # print(updates, 'update delta to parameter server~~')
                    self.get_post.post_updates_to_server(updates)
                    cnt = 0

        elif self.frequency == 'batch':
            for epoch in range(epoch_num):
                for idx in range(batch_num):
                    state_dict_before_training = self.get_post.get_server_state_dict(
                    )
                    model.load_state_dict(state_dict_before_training)
                    optimizer = get_optimizer(self.worker_optimizer,
                                              self.optimizer_config,
                                              model.parameters())
                    model.train()
                    data = x_train[idx *
                                   batch_size:min((idx + 1) *
                                                  batch_size, sample_num)]
                    target = y_train[idx *
                                     batch_size:min((idx + 1) *
                                                    batch_size, sample_num)]
                    if use_gpu:
                        data = torch.from_numpy(data).cuda()
                        target = torch.from_numpy(target).cuda()
                    else:
                        data = torch.from_numpy(data)
                        target = torch.from_numpy(target)
                    data = Variable(data)
                    target = Variable(target)
                    optimizer.zero_grad()
                    output = model(data)
                    loss = get_loss(self.loss_function, output, target,
                                    use_gpu)
                    loss.backward()
                    optimizer.step()
                    cnt = cnt + 1
                    if cnt == self.frequency_num:
                        eval_output = model(data)
                        eval_loss = get_loss(self.loss_function, eval_output,
                                             target, use_gpu)
                        print(epoch, '------------', eval_loss)
                        state_dict_after_training = model.state_dict()
                        updates = compute_updates(state_dict_before_training,
                                                  state_dict_after_training)
                        self.get_post.post_updates_to_server(updates)
                        cnt = 0
        else:
            print('please choose the frequency of training')

        yield []
Example #48
    def fit(self, subjects, y=None):
        """Compute cross-validated group-sparse precisions.

        Parameters
        ----------
        subjects : list of numpy.ndarray with shapes (n_samples, n_features)
            Input subjects. Each subject is a 2D array whose columns contain
            signals. The number of samples can vary between subjects, but all
            subjects must have the same number of features (i.e. columns).

        Returns
        -------
        self: GroupSparseCovarianceCV
            the object instance itself.
        """
        # Empirical covariances
        emp_covs, n_samples = \
                  empirical_covariances(subjects, assume_centered=False)
        n_subjects = emp_covs.shape[2]

        # One cv generator per subject must be created, because each subject
        # can have a different number of samples from the others.
        cv = []
        for k in range(n_subjects):
            cv.append(check_cv(
                    self.cv, np.ones(subjects[k].shape[0]),
                    classifier=False
                    ).split(subjects[k])
                      )
        path = list()  # List of (alpha, scores, covs)
        n_alphas = self.alphas

        if isinstance(n_alphas, collections.Sequence):
            alphas = list(self.alphas)
            n_alphas = len(alphas)
            n_refinements = 1
        else:
            n_refinements = self.n_refinements
            alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
            alpha_0 = 1e-2 * alpha_1
            alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
                                 n_alphas)[::-1]

        covs_init = itertools.repeat(None)

        # Copying the cv generators to use them n_refinements times.
        cv_ = izip(*cv)

        for i, (this_cv) in enumerate(itertools.tee(cv_, n_refinements)):
            # Compute the cross-validated loss on the current grid
            train_test_subjs = []
            for train_test in this_cv:
                assert(len(train_test) == n_subjects)
                train_test_subjs.append(list(zip(*[(subject[train, :],
                                                    subject[test, :])
                                             for subject, (train, test)
                                             in zip(subjects, train_test)])))
            if self.early_stopping:
                probes = [EarlyStopProbe(test_subjs,
                                         verbose=max(0, self.verbose - 1))
                          for _, test_subjs in train_test_subjs]
            else:
                probes = itertools.repeat(None)

            this_path = Parallel(n_jobs=self.n_jobs,
                                 verbose=self.verbose)(
                delayed(group_sparse_covariance_path)(
                    train_subjs, alphas, test_subjs=test_subjs,
                    max_iter=self.max_iter_cv, tol=self.tol_cv,
                    verbose=max(0, self.verbose - 1), debug=self.debug,
                    # Warm restart is useless with early stopping.
                    precisions_init=None if self.early_stopping else prec_init,
                    probe_function=probe)
                for (train_subjs, test_subjs), prec_init, probe
                in zip(train_test_subjs, covs_init, probes))

            # this_path[i] is a tuple (precisions_list, scores)
            # - scores: scores obtained with the i-th folding, for each value
            #   of alpha.
            # - precisions_list: corresponding precisions matrices, for each
            #   value of alpha.
            precisions_list, scores = list(zip(*this_path))
            # now scores[i][j] is the score for the i-th folding, j-th value of
            # alpha (analogous for precisions_list)
            precisions_list = list(zip(*precisions_list))
            scores = [np.mean(sc) for sc in zip(*scores)]
            # scores[i] is the mean score obtained for the i-th value of alpha.

            path.extend(list(zip(alphas, scores, precisions_list)))
            path = sorted(path, key=operator.itemgetter(0), reverse=True)

            # Find the maximum score (avoid using the built-in 'max' function
            # to have a fully-reproducible selection of the smallest alpha in
            # case of equality)
            best_score = -np.inf
            last_finite_idx = 0
            for index, (alpha, this_score, _) in enumerate(path):
                if this_score >= .1 / np.finfo(np.float64).eps:
                    this_score = np.nan
                if np.isfinite(this_score):
                    last_finite_idx = index
                if this_score >= best_score:
                    best_score = this_score
                    best_index = index

            # Refine the grid
            if best_index == 0:
                # We do not need to go back: we have chosen
                # the highest value of alpha for which there are
                # non-zero coefficients
                alpha_1 = path[0][0]
                alpha_0 = path[1][0]
                covs_init = path[0][2]
            elif (best_index == last_finite_idx
                    and not best_index == len(path) - 1):
                # We have non-converged models on the upper bound of the
                # grid, we need to refine the grid there
                alpha_1 = path[best_index][0]
                alpha_0 = path[best_index + 1][0]
                covs_init = path[best_index][2]
            elif best_index == len(path) - 1:
                alpha_1 = path[best_index][0]
                alpha_0 = 0.01 * path[best_index][0]
                covs_init = path[best_index][2]
            else:
                alpha_1 = path[best_index - 1][0]
                alpha_0 = path[best_index + 1][0]
                covs_init = path[best_index - 1][2]
            alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
                                 len(alphas) + 2)
            alphas = alphas[1:-1]
            if n_refinements > 1:
                logger.log("[GroupSparseCovarianceCV] Done refinement "
                           "% 2i out of %i" % (i + 1, n_refinements),
                           verbose=self.verbose)

        path = list(zip(*path))
        cv_scores_ = list(path[1])
        alphas = list(path[0])

        self.cv_scores_ = np.array(cv_scores_)
        self.alpha_ = alphas[best_index]
        self.cv_alphas_ = alphas

        # Finally, fit the model with the selected alpha
        logger.log("Final optimization", verbose=self.verbose)
        self.covariances_ = emp_covs
        self.precisions_ = _group_sparse_covariance(
            emp_covs, n_samples, self.alpha_, tol=self.tol,
            max_iter=self.max_iter,
            verbose=max(0, self.verbose - 1), debug=self.debug)
        return self
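# A minimal usage sketch, assuming nilearn is installed: fit the estimator on a few
# random subject matrices that share the same number of features.
import numpy as np
from nilearn.connectome import GroupSparseCovarianceCV

rng = np.random.RandomState(0)
subjects = [rng.randn(40, 10) for _ in range(3)]  # 3 subjects, 10 features each

gsc = GroupSparseCovarianceCV(alphas=4, verbose=0)
gsc.fit(subjects)
print(gsc.alpha_, gsc.precisions_.shape)  # selected alpha, (10, 10, 3)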
Example #49
0
from itertools import chain, tee

# chaining is used when iterating through multiple iterables/iterators
# teeing is used when we need to iterate over an iterator multiple times

l1 = (i**2 for i in range(4))
l2 = (i**2 for i in range(4, 8))
l3 = (i**2 for i in range(8, 12))

# for gen in l1, l2, l3:
#     for item in gen:
#         print(item)

l = [l1, l2, l3]

for item in chain(*l):
    print(item)
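
# A minimal alternative sketch: chain.from_iterable() takes the list of generators
# directly, without star-unpacking. Fresh generators are built here because the
# ones above are already exhausted by the loop.
gens = [(i**2 for i in range(4)), (i**2 for i in range(4, 8)), (i**2 for i in range(8, 12))]

for item in chain.from_iterable(gens):
    print(item)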


def squares(n):
    for i in range(n):
        yield i**2


s = squares(10)

print(list(s))

print(s)
iters = tee(s, 5)
print(list(iters))
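
# Note: `s` above is already exhausted by print(list(s)), so the five tee copies
# are empty, and print(list(iters)) only shows the tee objects themselves.
# A minimal sketch with tee applied to a fresh, unconsumed generator:
s2 = squares(5)
copies = tee(s2, 3)
print([list(c) for c in copies])  # three identical lists: [0, 1, 4, 9, 16]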
Example #50
0
def pairwise(iterable):
    # from https://docs.python.org/3/library/itertools.html
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
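
# A minimal usage sketch of the recipe above (assumes `from itertools import tee`
# is in scope for pairwise):
print(list(pairwise('abcd')))  # [('a', 'b'), ('b', 'c'), ('c', 'd')]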
Example #51
0
def minmax(seq, *, key=lambda x: x):
    """Return the minimum and maximum values in the input."""
    iterator1, iterator2 = tee(seq)

    return MinMax(min(iterator1, key=key), max(iterator2, key=key))
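
# A minimal sketch of the assumptions around the snippet above: MinMax is treated
# here as a simple namedtuple, and tee comes from itertools; neither definition is
# shown in the original snippet.
from collections import namedtuple
from itertools import tee

MinMax = namedtuple('MinMax', ['min', 'max'])

print(minmax([3, 1, 4, 1, 5]))               # MinMax(min=1, max=5)
print(minmax(['bb', 'a', 'ccc'], key=len))   # MinMax(min='a', max='ccc')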
Example #52
0
    def fill_filekind(self, fk, sent_data):
        now = datetime.utcnow()

        headers = {
            'status': self.status,
            'date': now.strftime('%a, %d %b %Y %H:%M:%S GMT'),
            'server': 'Python/HTTPretty',
            'connection': 'close',
        }

        if callable(self.body):
            _resp = self.body(sent_data)
            if not isinstance(_resp, str) and len(_resp) == 2:
                body = _resp[0]
                headers.update(_resp[1])
            else:
                body = _resp
            self.body_length = len(body)
        else:
            body = self.body

        if self.forcing_headers:
            headers = self.forcing_headers

        if self.adding_headers:
            headers.update(self.normalize_headers(self.adding_headers))

        headers = self.normalize_headers(headers)

        status = headers.get('status', self.status)
        string_list = [
            'HTTP/1.1 %d %s' % (status, STATUSES[status]),
        ]

        if 'date' in headers:
            string_list.append('date: %s' % headers.pop('date'))

        if not self.forcing_headers:
            content_type = headers.pop('content-type',
                                       'text/plain; charset=utf-8')

            content_length = headers.pop('content-length', self.body_length)

            string_list.append('content-type: %s' % content_type)
            if not self.streaming:
                string_list.append('content-length: %s' % content_length)

            string_list.append('server: %s' % headers.pop('server'))

        for k, v in headers.items():
            string_list.append('{0}: {1}'.format(k, v), )

        for item in string_list:
            fk.write(utf8(item) + b'\n')

        fk.write(b'\r\n')

        if self.streaming:
            body, b = itertools.tee(body)
            for chunk in b:
                fk.write(utf8(chunk))
        else:
            fk.write(utf8(body))

        fk.seek(0)
Example #53
0
    def pipe(self,
             texts,
             as_tuples=False,
             n_threads=2,
             batch_size=1000,
             disable=[],
             cleanup=False):
        """Process texts as a stream, and yield `Doc` objects in order.

        texts (iterator): A sequence of texts to process.
        as_tuples (bool):
            If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        n_threads (int): Currently inactive.
        batch_size (int): The number of texts to buffer.
        disable (list): Names of the pipeline components to disable.
        cleanup (bool): If True, unneeded strings are freed,
            to control memory use. Experimental.
        YIELDS (Doc): Documents in the order of the original text.

        EXAMPLE:
            >>> texts = [u'One document.', u'...', u'Lots of documents']
            >>>     for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
            >>>         assert doc.is_parsed
        """
        if as_tuples:
            text_context1, text_context2 = itertools.tee(texts)
            texts = (tc[0] for tc in text_context1)
            contexts = (tc[1] for tc in text_context2)
            docs = self.pipe(texts,
                             n_threads=n_threads,
                             batch_size=batch_size,
                             disable=disable)
            for doc, context in izip(docs, contexts):
                yield (doc, context)
            return
        docs = (self.make_doc(text) for text in texts)
        for name, proc in self.pipeline:
            if name in disable:
                continue
            if hasattr(proc, 'pipe'):
                docs = proc.pipe(docs,
                                 n_threads=n_threads,
                                 batch_size=batch_size)
            else:
                # Apply the function, but yield the doc
                docs = _pipe(proc, docs)
        # Track weakrefs of "recent" documents, so that we can see when they
        # expire from memory. When they do, we know we don't need old strings.
        # This way, we avoid maintaining an unbounded growth in string entries
        # in the string store.
        recent_refs = weakref.WeakSet()
        old_refs = weakref.WeakSet()
        # Keep track of the original string data, so that if we flush old strings,
        # we can recover the original ones. However, we only want to do this if we're
        # really adding strings, to save up-front costs.
        original_strings_data = None
        nr_seen = 0
        for doc in docs:
            yield doc
            if cleanup:
                recent_refs.add(doc)
                if nr_seen < 10000:
                    old_refs.add(doc)
                    nr_seen += 1
                elif len(old_refs) == 0:
                    old_refs, recent_refs = recent_refs, old_refs
                    if original_strings_data is None:
                        original_strings_data = list(self.vocab.strings)
                    else:
                        keys, strings = self.vocab.strings._cleanup_stale_strings(
                            original_strings_data)
                        self.vocab._reset_cache(keys, strings)
                        self.tokenizer._reset_cache(keys)
                    nr_seen = 0
Example #54
0
def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
Example #55
0
    def exec(self):
        # -----------------------------------------------
        # itertools.starmap()
        # ----------------------
        # Applies a function to an iterable whose items are already grouped,
        # e.g. applying a function to the result of a zip().
        #
        # In other words, something like this:
        #
        # l1 = [1, 2, 3]
        # l2 = [9, 8, 7]
        # l3 = list(zip(l1, l2)) ==> [(1,9), (2,8), (3,7)]
        #
        # starmap operator.add over l3:
        # list(itertools.starmap(operator.add, l3))
        #
        # The result is [10, 10, 10], which is the same as doing:
        #
        # for item in l3:
        #     operator.add(*item)
        #
        # Hence the name: star-unpacking + map = starmap.
        # -----------------------------------------------
        hr('it.starmap()')

        list01 = [9, 8, 7]
        list02 = [1, 2, 3]
        list03 = list(zip(list01, list02))

        starmap = it.starmap(ope.sub, list03)
        pr('it.starmap', list(starmap))

        list04 = list(zip(list01, list02, *list03))
        pr('it.starmap', list(it.starmap(lambda *args: sum(args), list04)))

        # -----------------------------------------------
        # itertools.takewhile()
        # ----------------------
        # Yields elements as long as the given predicate is satisfied.
        # The opposite of dropwhile().
        #
        # Once an element fails the predicate, nothing further is yielded,
        # even if later elements would satisfy it again.
        # -----------------------------------------------
        hr('it.takewhile()')

        list05 = sorted(it.chain(list01, list02))
        pr('list05', list05)

        takewhile = it.takewhile(lambda x: x < 5, list05)
        pr('it.takewhile', list(takewhile))

        # -----------------------------------------------
        # itertools.tee()
        # ----------------------
        # Splits the given iterable into several independent iterators.
        # With n=2 you get two iterators that replicate the original
        # iterable, returned as a tuple: (iterator, iterator).
        #
        # As noted in the official documentation, once tee() has split an
        # iterable, the original should not be used anywhere else, because
        # the tee objects share its internal state.
        #
        # Quote:
        # Once tee() has made a split,
        # the original iterable should not be used anywhere else;
        # otherwise, the iterable could get advanced
        # without the tee objects being informed.
        # -----------------------------------------------
        hr('it.tee()')

        list06 = list('helloworld')

        it_tee = it.tee(list06, 2)
        it_asc, it_desc = it_tee[0], reversed(list(it_tee[-1]))
        for it01, it02 in zip(it_asc, it_desc):
            pr('it.tee', f'{it01}, {it02}')
Example #56
0
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    from itertools import tee
    a, b = tee(iterable)
    next(b, None)
    return list(zip(a, b))
Example #57
0
def _make_pair_range(N):
    from itertools import tee
    i, j = tee(range(-1, N))
    next(j, None)
    return zip(i, j)
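
# A minimal usage sketch: the leading -1 makes the first pair start before index 0.
print(list(_make_pair_range(3)))  # [(-1, 0), (0, 1), (1, 2)]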
Example #58
0
def window(seq, n):
    els = tee(seq, n)
    for i, el in enumerate(els):
        for _ in range(i):
            next(el, None)
    return zip(*els)
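
# A minimal usage sketch (assumes `from itertools import tee` is in scope): each tee
# copy is advanced by its index, so zip yields overlapping windows of length n.
print(list(window([1, 2, 3, 4, 5], 3)))  # [(1, 2, 3), (2, 3, 4), (3, 4, 5)]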
Example #59
0
def concat_niimgs(niimgs,
                  dtype=np.float32,
                  ensure_ndim=None,
                  memory=Memory(cachedir=None),
                  memory_level=0,
                  auto_resample=False,
                  verbose=0):
    """Concatenate a list of 3D/4D niimgs of varying lengths.

    The niimgs list can contain niftis/paths to images of varying dimensions
    (i.e., 3D or 4D) as well as different 3D shapes and affines, as they
    will be matched to the first image in the list if auto_resample=True.

    Parameters
    ----------
    niimgs: iterable of Niimg-like objects or glob pattern
        See http://nilearn.github.io/manipulating_images/input_output.html
        Niimgs to concatenate.

    dtype: numpy dtype, optional
        the dtype of the returned image

    ensure_ndim: integer, optional
        Indicate the dimensionality of the expected niimg. An
        error is raised if the niimg is of another dimensionality.

    auto_resample: boolean
        Converts all images to the space of the first one.

    verbose: int
        Controls the amount of verbosity (0 means no messages).

    memory : instance of joblib.Memory or string
        Used to cache the resampling process.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    memory_level : integer, optional
        Rough estimator of the amount of memory used by caching. Higher value
        means more memory for caching.

    Returns
    -------
    concatenated: nibabel.Nifti1Image
        A single image.

    See Also
    --------
    nilearn.image.index_img

    """
    from ..image import new_img_like  # avoid circular imports

    target_fov = 'first' if auto_resample else None

    # We reduce the dimensionality by one because the list itself accounts for one dimension.
    ndim = None
    if ensure_ndim is not None:
        ndim = ensure_ndim - 1

    # If niimgs is a string, use glob to expand it to the matching filenames.
    niimgs = _resolve_globbing(niimgs)

    # First niimg is extracted to get information and for new_img_like
    first_niimg = None

    iterator, literator = itertools.tee(iter(niimgs))
    try:
        first_niimg = check_niimg(next(literator), ensure_ndim=ndim)
    except StopIteration:
        raise TypeError('Cannot concatenate empty objects')
    except DimensionError as exc:
        # Keep track of the additional dimension in the error
        exc.increment_stack_counter()
        raise

    # If no particular dimensionality is asked, we force consistency wrt the
    # first image
    if ndim is None:
        ndim = len(first_niimg.shape)

    if ndim not in [3, 4]:
        raise TypeError('Concatenated images must be 3D or 4D. You gave a '
                        'list of %dD images' % ndim)

    lengths = [first_niimg.shape[-1] if ndim == 4 else 1]
    for niimg in literator:
        # We check the dimensionality of the niimg
        try:
            niimg = check_niimg(niimg, ensure_ndim=ndim)
        except DimensionError as exc:
            # Keep track of the additional dimension in the error
            exc.increment_stack_counter()
            raise
        lengths.append(niimg.shape[-1] if ndim == 4 else 1)

    target_shape = first_niimg.shape[:3]
    if dtype is None:
        dtype = first_niimg.get_data().dtype
    data = np.ndarray(target_shape + (sum(lengths), ), order="F", dtype=dtype)
    cur_4d_index = 0
    for index, (size, niimg) in enumerate(
            izip(
                lengths,
                _iter_check_niimg(iterator,
                                  atleast_4d=True,
                                  target_fov=target_fov,
                                  memory=memory,
                                  memory_level=memory_level))):

        if verbose > 0:
            if isinstance(niimg, _basestring):
                nii_str = "image " + niimg
            else:
                nii_str = "image #" + str(index)
            print("Concatenating {0}: {1}".format(index + 1, nii_str))

        data[..., cur_4d_index:cur_4d_index + size] = niimg.get_data()
        cur_4d_index += size

    return new_img_like(first_niimg,
                        data,
                        first_niimg.affine,
                        copy_header=True)
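
# A minimal usage sketch, assuming nibabel is installed and the surrounding nilearn
# helpers used by concat_niimgs are importable: two 3D images with identical shapes
# and affines are concatenated into a single 4D image.
import numpy as np
import nibabel as nib

affine = np.eye(4)
img_a = nib.Nifti1Image(np.zeros((4, 4, 4), dtype=np.float32), affine)
img_b = nib.Nifti1Image(np.ones((4, 4, 4), dtype=np.float32), affine)

combined = concat_niimgs([img_a, img_b])
print(combined.shape)  # expected: (4, 4, 4, 2)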
    		else:
    			selected_proteome = accs
    		query_accs = random.sample(selected_proteome, size)
    		fastas500 = list([records[el] for el in query_accs])
    		subject_accs = set(selected_proteome).difference(set(query_accs))
    		train_fastas = list([records[el] for el in subject_accs])
    		subsample100_accs = random.sample(query_accs, 100)
    		subsample50_accs = random.sample(subsample100_accs, 50)
    		fastas100 = list([records[el] for el in subsample100_accs])
    		fastas50 = list([records[el] for el in subsample50_accs])
    		fastas = [train_fastas, fastas500, fastas100, fastas50]
    		print ("working", cnt)
    		for pos, selection in enumerate(fastas):
    			text = ""
    			for s in selection:
    				trigrams = list(zip(*(islice(seq, index, None) for index, seq in enumerate(tee(s, 3)))))
    				trigram_protein = "|".join(["".join(el) for el in trigrams]) + "|"
    				text += trigram_protein
    			store.execute("INSERT INTO {} VALUES (?, ?)".format(tables[pos]), (id, text[0:-1]))
    			map_benchmark[id] = {"test500":query_accs, "test100":subsample100_accs, "test50":subsample50_accs}
    		conn.commit()
    		cnt += 1
    		print(cnt)
    		docs_above1000.append(id)
	print ("texts extracted")
	store.execute("CREATE INDEX taxon_idx_test500 ON texts_test500(taxon_id)")
	store.execute("CREATE INDEX taxon_idx_test100 ON texts_test100(taxon_id)")
	store.execute("CREATE INDEX taxon_idx_test50 ON texts_test50(taxon_id)")
	store.execute("CREATE INDEX taxon_idx_model ON texts_train(taxon_id)")
	conn.commit()
	pickle.dump(map_benchmark, open("benchmark_accs2id.p", "wb"))