Example #1
 def run_reduce(self):
     self.stopped_received = 0
     self.merged_files = []
     merged_iterator = None
     while True:
         # Iterate and merge files until all jobs are processed
         files = self.get_next_file()
         # files = itertools.islice(files, self.reduce_max_files)
         all_files = list(files)
         iterables = [self.iter_on_file(file) for file in all_files]
         merged_iterator = heapq.merge(*iterables)
         if self.stopped_received < self.numprocs:
             if self.debug:
                 debug_print("Performing intermediate merge on %u files" % len(iterables))
             f = TemporaryFile()
             self.merged_files.append(f)
             for m in merged_iterator:
                 cPickle.dump(m, f, cPickle.HIGHEST_PROTOCOL)
             f.flush()
             f.seek(0)
         else:
             break
     if len(self.merged_files) > 0:
         if self.debug:
             debug_print("Final merge")
         # Final merge if required
         merged_iterator = heapq.merge(
             *([self.iter_on_file(stream) for stream in self.merged_files] + [merged_iterator])
         )
     if self.debug:
         debug_print("Reduce loop")
     result = self.reduce_loop(merged_iterator)
     return result
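
The intermediate merges above lean on pickled temporary files plus heapq.merge over per-file iterators. A minimal standalone sketch of that pattern, with illustrative helper names rather than the class's own methods:

import heapq
import pickle
from tempfile import TemporaryFile

def iter_pickles(f):
    # Stream records back out of a pickle-filled temporary file.
    f.seek(0)
    while True:
        try:
            yield pickle.load(f)
        except EOFError:
            return

def spill(sorted_items):
    # Dump one pre-sorted chunk into its own temporary file.
    f = TemporaryFile()
    for item in sorted_items:
        pickle.dump(item, f, pickle.HIGHEST_PROTOCOL)
    f.flush()
    return f

chunks = [spill(sorted(chunk)) for chunk in ([3, 1, 2], [6, 4, 5])]
print(list(heapq.merge(*(iter_pickles(f) for f in chunks))))  # [1, 2, 3, 4, 5, 6]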
Example #2
def main():
    parser = argparse.ArgumentParser(description='Merge multiple sorted files')
    parser.add_argument('-c', '--col', type=int, default=-1, help='column index (starts from 0)')
    parser.add_argument('-n', '--numeric', action='store_true', help='numeric sort')
    parser.add_argument('--sep', type=str, default=',', help='column separator')
    parser.add_argument('files', metavar='files', type=str, nargs='+', help='input files')

    args = parser.parse_args()
    col = args.col
    numeric = args.numeric
    sep = six.u(args.sep)

    files = [open(filename, 'r') for filename in args.files]

    if col != -1:
        if numeric:
            conv = float
        else:
            conv = str

        tuples = (
            ((conv(line.split(sep)[col]), line) for line in file)
            for file in files
        )
        merged = heapq.merge(*tuples)
        for key, line in merged:
            sys.stdout.write(line)
    else:
        merged = heapq.merge(*files)
        for line in merged:
            sys.stdout.write(line)
Example #3
def imerge(its, key=None):
  '''
  Generator to efficiently merge sorted iterables with optional key
  function.  Input sequences must be in natural order if no key function is
  specified, or in key order if a key function is specified.

  Equivalent, but much more efficient than:

    iter(sorted(chain(*its), key=key))

  so long as the following invariant holds:

    for it in its:
      it = list(it)
      assert it == sorted(it,key=key)

  @param  its: list of sorted input sequences
  @type   its: sequence
  @param  key: optional key comparison function
  @type   key: binary function
  @return:     sequence of sorted values
  @rtype:      iterator

  >>> list(imerge([range(5),range(5,11)]))
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  >>> list(imerge([reversed(range(5)),reversed(range(5,11))],key=lambda x: -x))
  [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
  '''
  if key is None:
    return merge(*its)

  its_d = [ ((key(item),item) for item in it) for it in its ]

  return imap(itemgetter(1), merge(*its_d))
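
On Python 3.5 and later, heapq.merge accepts a key= argument itself, so the decorate/undecorate step above is only needed on older interpreters. A quick sketch:

import heapq

a = [(1, 'a'), (3, 'c')]
b = [(2, 'b'), (4, 'd')]
print(list(heapq.merge(a, b, key=lambda t: t[0])))
# [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]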
Example #4
def exp_time():
    part1 = [(0, frozenset())]
    for i in range(n/2):
        part1 = [t for t in merge(part1,
                                  [(val+act[i][1], s.union({act[i][0]}))
                                   for (val,s) in part1])]      
    part2 = [(0, frozenset())]
    for i in range(n/2, n):
        part2 = [t for t in merge(part2,
                                  [(val+act[i][1], s.union({act[i][0]}))
                                   for (val,s) in part2])]
    i = 0
    j = len(part2) - 1
    while i < len(part1) and j >= 0:
        sum = part1[i][0] + part2[j][0]
        if (sum > 0):
            j -= 1
        elif (sum < 0):
            i += 1
        elif len(part1[i][1]) == 0 and len(part2[j][1]) == 0:
            i += 1 # or j -= 1, it's indifferent
        else:
            for x in part1[i][1].union(part2[j][1]):
                print x
            exit(0)
    def get_req_data(self, req_type):
        if req_type is None:
            successes = [successes for (successes, _) in self.data_by_type.values()]
            failures = [failures for (_, failures) in self.data_by_type.values()]
            return (
                list(heapq.merge(*successes)),
                list(heapq.merge(*failures)),
            )

        return self.data_by_type[req_type]
import heapq
from collections import namedtuple


def merge(key=None, *iterables):
    if key is None:
        for element in heapq.merge(*iterables):
            yield element
    else:
        Keyed = namedtuple("Keyed", ["key", "obj"])
        keyed_iterables = [(Keyed(key(obj), obj) for obj in iterable)
                           for iterable in iterables]
        for element in heapq.merge(*keyed_iterables):
            yield element.obj
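
A small usage sketch for the wrapper above (the key function goes in the first positional slot because of the (key=None, *iterables) signature; the sample data is made up):

people_a = [{'name': 'Ann'}, {'name': 'Zoe'}]
people_b = [{'name': 'Bob'}]
for person in merge(lambda p: p['name'], people_a, people_b):
    print(person['name'])  # Ann, Bob, Zoe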
Example #7
def merge_temps(temp_files, out_path, keyfunc=key_func):
	"""This method opens all the temp files and merges them together,
	adding them to the final file that contains the entire lexicon. 
	NOTE: TAKEN FROM: http://stackoverflow.com/questions/1001569/python-class-to-merge-sorted-files-how-can-this-be-improved
	and uses the merge from heapq."""

	incrementalMergeFiles = []

	if len(temp_files)>200:
		#First merge in groups of 50 into temp files
		for group in grouper(temp_files, 50):
			current_term = ''
			combined_posting_list = ''
			files = map(open, group)
			os_temp, temp_file_name = tempfile.mkstemp()
			for line in heapq.merge(*[decorated_file(f, keyfunc) for f in files]):
				if current_term == '':
					current_term = line[0]
					combined_posting_list = line[1]
				elif line[0] == current_term:
					combined_posting_list += line[1]
				elif line[0] != current_term:
					os.write(os_temp, (current_term + '>' + combined_posting_list + '\n'))
					current_term = line[0]
					combined_posting_list = line[1]
			for openfile in files:
				openfile.close()
			incrementalMergeFiles.append(temp_file_name)
			os.close(os_temp)
	else:
		incrementalMergeFiles = temp_files

	files = map(open, incrementalMergeFiles)
	current_term = ''
	combined_posting_list = ''
	
	with open(out_path, 'w') as outfile:
		#now merge the files into the file path
		for line in heapq.merge(*[decorated_file(f, keyfunc) for f in files]):
			if current_term == '':
				current_term = line[0]
				combined_posting_list = line[1]
			elif line[0] == current_term:
				combined_posting_list += line[1]
			elif line[0] != current_term:
				outfile.write(current_term + '>' + combined_posting_list + '\n')
				current_term = line[0]
				combined_posting_list = line[1]

	for openfile in files:
		openfile.close()
	for temp in temp_files:
		os.remove(temp)
	del temp_files
Example #8
import heapq
from collections import namedtuple

Keyed = namedtuple("Keyed", ["key", "obj"])


def merge(key=None, *iterables):
    # based on code posted by Scott David Daniels in c.l.p.
    # http://groups.google.com/group/comp.lang.python/msg/484f01f1ea3c832d

    if key is None:
        for element in heapq.merge(*iterables):
            yield element
    else:
        keyed_iterables = [(Keyed(key(obj), obj) for obj in iterable)
                        for iterable in iterables]
        for element in heapq.merge(*keyed_iterables):
            yield element.obj
Example #9
def _heapqmergesorted(key=None, *iterables):
    """Return a single iterator over the given iterables, sorted by the
    given `key` function, assuming the input iterables are already sorted by
    the same function. (I.e., the merge part of a general merge sort.) Uses
    :func:`heapq.merge` for the underlying implementation."""

    if key is None:
        keyed_iterables = iterables
        for element in heapq.merge(*keyed_iterables):
            yield element
    else:
        keyed_iterables = [(_Keyed(key(obj), obj) for obj in iterable)
                           for iterable in iterables]
        for element in heapq.merge(*keyed_iterables):
            yield element.obj
Example #10
def lines_and_annotations(lines, htmlifiers):
    """Collect all the annotations for each line into a list, and yield a tuple
    of (line of HTML, annotations list) for each line.

    :arg lines: An iterable of Markup objects, each representing a line of
        HTMLified source code

    """
    def non_sparse_annotations(annotations):
        """De-sparsify the annotations iterable so we can just zip it together
        with the HTML lines.

        Return an iterable of annotations iterables, one for each line.

        """
        next_unannotated_line = 0
        for line, annotations in groupby(annotations, itemgetter(0)):
            for next_unannotated_line in xrange(next_unannotated_line,
                                                line - 1):
                yield []
            yield [data for line_num, data in annotations]
            next_unannotated_line = line
    return izip_longest(lines,
                        non_sparse_annotations(merge(*[h.annotations() for h in
                                                       htmlifiers])),
                        fillvalue=[])
  def map_within(self, other):
    """'other' should be a subset of 'self'.  Returns a RangeSet
    representing what 'other' would get translated to if the integers
    of 'self' were translated down to be contiguous starting at zero.

    >>> RangeSet("0-9").map_within(RangeSet("3-4"))
    <RangeSet("3-4")>
    >>> RangeSet("10-19").map_within(RangeSet("13-14"))
    <RangeSet("3-4")>
    >>> RangeSet("10-19 30-39").map_within(RangeSet("17-19 30-32"))
    <RangeSet("7-12")>
    >>> RangeSet("10-19 30-39").map_within(RangeSet("12-13 17-19 30-32"))
    <RangeSet("2-3 7-12")>
    """

    out = []
    offset = 0
    start = None
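    # Sweep the merged boundary stream: the -5/+5 sentinels bracket the ranges
    # of 'self' (offset accumulates the total length of the ranges already
    # passed), while -1/+1 mark the endpoints of 'other', each remapped to
    # offset + (p - start) within the current 'self' range.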
    for p, d in heapq.merge(zip(self.data, itertools.cycle((-5, +5))),
                            zip(other.data, itertools.cycle((-1, +1)))):
      if d == -5:
        start = p
      elif d == +5:
        offset += p-start
        start = None
      else:
        out.append(offset + p - start)
    return RangeSet(data=out)
Example #12
def batch_sort(input_iterator, output_path, buffer_size=32000, output_class=None):
    """batch sort helper with temporary files, supports sorting large stuff"""
    if not output_class:
        output_class = input_iterator.__class__

    chunks = []
    try:
        while True:
            current_chunk = list(islice(input_iterator, buffer_size))
            if not current_chunk:
                break

            current_chunk.sort()
            fd, filepath = tempfile.mkstemp()
            os.close(fd)
            output_chunk = output_class(filepath)
            chunks.append(output_chunk)

            for elem in current_chunk:
                output_chunk.write(elem.obj)
            output_chunk.close()

        output_file = output_class(output_path)
        for elem in heapq.merge(*chunks):
            output_file.write(elem.obj)
        output_file.close()
    finally:
        for chunk in chunks:
            try:
                chunk.close()
                os.remove(chunk.name)
            except Exception:
                pass
Example #13
def collate(*iterables, **kwargs):
    """Return a sorted merge of the items from each of several already-sorted
    ``iterables``.

        >>> list(collate('ACDZ', 'AZ', 'JKL'))
        ['A', 'A', 'C', 'D', 'J', 'K', 'L', 'Z', 'Z']

    Works lazily, keeping only the next value from each iterable in memory. Use
    ``collate()`` to, for example, perform a n-way mergesort of items that
    don't fit in memory.

    :arg key: A function that returns a comparison value for an item. Defaults
        to the identity function.
    :arg reverse: If ``reverse=True``, yield results in descending order
        rather than ascending. ``iterables`` must also yield their elements in
        descending order.

    If the elements of the passed-in iterables are out of order, you might get
    unexpected results.

    If neither of the keyword arguments are specified, this function delegates
    to ``heapq.merge()``.

    """
    if not kwargs:
        return merge(*iterables)

    return _collate(*iterables, **kwargs)
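
When key or reverse is given, the work is delegated to _collate (not shown here). On Python 3.5+, heapq.merge itself accepts both keywords, e.g. a descending merge of already-descending inputs:

import heapq
print(list(heapq.merge([9, 5, 1], [8, 4], reverse=True)))  # [9, 8, 5, 4, 1]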
Example #14
    def _index_merge_warcs(self, new_warcs, index_file, rel_root=None):
        cdx_file = os.path.join(self.indexes_dir, index_file)

        temp_file = cdx_file + '.tmp.' + timestamp20_now()
        self._cdx_index(temp_file, new_warcs, rel_root)

        # no existing file, so just make it the new file
        if not os.path.isfile(cdx_file):
            shutil.move(temp_file, cdx_file)
            return

        merged_file = temp_file + '.merged'

        last_line = None

        with open(cdx_file, 'rb') as orig_index:
            with open(temp_file, 'rb') as new_index:
                with open(merged_file, 'w+b') as merged:
                    for line in heapq.merge(orig_index, new_index):
                        if last_line != line:
                            merged.write(line)
                            last_line = line

        shutil.move(merged_file, cdx_file)
        #os.rename(merged_file, cdx_file)
        os.remove(temp_file)
def merge_sorted_seq(seq1, seq2):
    ''' Merge two sorted sequences with little overhead. The result
        will be sorted, which is different from simply concatenating with +'''
    result = []
    for c in heapq.merge(seq1, seq2):
        result.append(c)
    return result
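
A quick check:

print(merge_sorted_seq([1, 3, 5], [2, 4]))  # [1, 2, 3, 4, 5]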
Example #16
def logmerge(options, args):
    """Perform merge on multiple input logfiles
    and emit in sorted order using a priority queue"""
    
    delimiter = options.delimiter
    field = options.field

    key_func = None
    if options.get('parser', None):
        # Use a parser to extract field to merge/sort by
        parser = eval(options.parser, vars(logtools.parsers), {})()
        if field.isdigit():            
            extract_func = lambda x: parser(x.strip()).by_index(int(field)-1)
        else:
            extract_func = lambda x: parser(x.strip())[field]
    else:
        # No parser given, use indexed field based extraction
        extract_func = lambda x: x.strip().split(delimiter)[int(field)-1]
        
    if options.get('numeric', None):
        key_func = lambda x: (int(extract_func(x)), x)
    elif options.get('datetime', None):
        key_func = lambda x: (datetime.strptime(extract_func(x), \
                                    options.dateformat), x)            
    else:
        key_func = lambda x: (extract_func(x), x)
        
    iters = (imap(key_func, open(filename, "r")) for filename in args)
    
    for k, line in merge(*iters):
        yield k, line.strip()
Example #17
    def _list(self, project=None, marker=None,
              limit=storage.DEFAULT_QUEUES_PER_PAGE, detailed=False):

        def all_pages():
            pool = self._pool_catalog.get_default_pool()
            if pool is None:
                raise errors.NoPoolFound()
            yield next(pool.queue_controller.list(
                project=project,
                marker=marker,
                limit=limit,
                detailed=detailed))

        # make a heap compared with 'name'
        ls = heapq.merge(*[
            utils.keyify('name', page)
            for page in all_pages()
        ])

        marker_name = {}

        # limit the iterator and strip out the comparison wrapper
        def it():
            for queue_cmp in itertools.islice(ls, limit):
                marker_name['next'] = queue_cmp.obj['name']
                yield queue_cmp.obj

        yield it()
        yield marker_name and marker_name['next']
    def _merge_sorts(self):
        iterators = []
        for i in range(1, self.batch_number):
            extra = "_batch" + str(i)
            new_path = brenninc_utils.create_new_file(self.fastq_file, extra,
                                                      outputdir=self.outputdir,
                                                      gzipped=False)
            iterable = wrap_sequence(new_path)
            iterators.append(iterable)
        big = heapq.merge(*iterators)
        extra = "_sorted"
        new_path = brenninc_utils.create_new_file(self.fastq_file, extra,
                                                  outputdir=self.outputdir,
                                                  gzipped=False)
        print "writing to", new_path
        with open(new_path, 'w') as sorted_file:
            for wrapper in big:
                wrapper.sequence.write_to_fastq_file(sorted_file)
        print "done"

        for i in range(1, self.batch_number):
            extra = "_batch" + str(i)
            new_path = brenninc_utils.create_new_file(self.fastq_file, extra,
                                                      outputdir=self.outputdir,
                                                      gzipped=False)
            os.remove(new_path)
Example #19
 def testSharding(self):
     "Sharding of indices across servers."
     keys = range(len(self.hosts))
     shards = client.Shards(zip(self.hosts * 2, heapq.merge(keys, keys)), limit=1)
     shards.resources.broadcast('PUT', '/fields/zone', {'store': 'yes'})
     for zone in range(len(self.ports)):
         shards.broadcast(zone, 'POST', '/docs', [{'zone': str(zone)}])
     shards.resources.broadcast('POST', '/update')
     result = shards.unicast(0, 'GET', '/search?q=zone:0')()
     assert result['count'] == len(result['docs']) == 1
     assert all(response() == result for response in shards.broadcast(0, 'GET', '/search?q=zone:0'))
     response, = shards.multicast([0], 'GET', '/search')
     assert set(doc['zone'] for doc in response()['docs']) > set('0')
     response, = shards.multicast([0, 1], 'GET', '/search')
     assert set(doc['zone'] for doc in response()['docs']) == set('01')
     zones = set()
     responses = shards.multicast([0, 1, 2], 'GET', '/search')
     assert len(responses) == 2
     for response in responses:
         docs = response()['docs']
         assert len(docs) == 2
         zones.update(doc['zone'] for doc in docs)
     assert zones == set('012')
     self.stop(self.servers.pop(0))
     self.assertRaises(socket.error, shards.broadcast, 0, 'GET', '/')
     responses = shards.multicast([0, 1, 2], 'GET', '/')
     assert len(responses) == 2 and all(response() for response in responses)
     shards.resources.priority = lambda hosts: None
     self.assertRaises(ValueError, shards.choice, [[0]])
Example #20
    def add(self, m, transaction, link, p1, p2, added, removed):
        if (p1 in self._mancache and not self._treeinmem
            and not self._usemanifestv2):
            # If our first parent is in the manifest cache, we can
            # compute a delta here using properties we know about the
            # manifest up-front, which may save time later for the
            # revlog layer.

            _checkforbidden(added)
            # combine the changed lists into one sorted iterator
            work = heapq.merge([(x, False) for x in added],
                               [(x, True) for x in removed])

            arraytext, deltatext = m.fastdelta(self._mancache[p1][1], work)
            cachedelta = self.rev(p1), deltatext
            text = util.buffer(arraytext)
            n = self.addrevision(text, transaction, link, p1, p2, cachedelta)
        else:
            # The first parent manifest isn't already loaded, so we'll
            # just encode a fulltext of the manifest and pass that
            # through to the revlog layer, and let it handle the delta
            # process.
            if self._treeondisk:
                m1 = self.read(p1)
                m2 = self.read(p2)
                n = self._addtree(m, transaction, link, m1, m2)
                arraytext = None
            else:
                text = m.text(self._usemanifestv2)
                n = self.addrevision(text, transaction, link, p1, p2)
                arraytext = array.array('c', text)

        self._mancache[n] = (m, arraytext)

        return n
Example #21
def main():
    a = [1,2,3,4,5,6,7,8]

    hq.heapify(a)
    print("heapq.heapify({})".format(a))

    hq.heappush(a, 9)
    print("heapq.heappush('heap', 9) = {}".format(a))

    hq.heappop(a)
    print("heapq.heappop('heap') = {}".format(a))

    y = hq.heappushpop(a, 16)
    print("heapq.heappushpop('heap', 16) = ({}), {}".format(y, a))

    y = hq.heapreplace(a, 1)
    print("heapq.heapreplace('heap', 1) = ({}), {}".format(y, a))

    y = hq.nlargest(2, enumerate(a))
    print("heapq.nlargest(2, 'heap') = {}".format(y))

    y = hq.nsmallest(2, a)
    print("heapq.nsmallest(2, 'heap') = {}".format(y))
   
    # heapq.merge expects sorted inputs and returns an iterator
    y = list(hq.merge(sorted(a), sorted([94, 34, 12, 56, 83])))
    print("heapq.merge(sorted('heap'), sorted([94, 34, 12, 56, 83])) = {}".format(y))
Example #22
    def list(self, project=None, marker=None,
             limit=storage.DEFAULT_QUEUES_PER_PAGE, detailed=False):

        def all_pages():
            for shard in self._shard_catalog._shards_ctrl.list(limit=0):
                yield next(self._shard_catalog.get_driver(shard['name'])
                           .queue_controller.list(
                               project=project,
                               marker=marker,
                               limit=limit,
                               detailed=detailed))

        # make a heap compared with 'name'
        ls = heapq.merge(*[
            utils.keyify('name', page)
            for page in all_pages()
        ])

        marker_name = {}

        # limit the iterator and strip out the comparison wrapper
        def it():
            for queue_cmp in itertools.islice(ls, limit):
                marker_name['next'] = queue_cmp.obj['name']
                yield queue_cmp.obj

        yield it()
        yield marker_name['next']
Example #23
    def _dispatch(self, message, queue=None):
        if queue is None:
            queue = list(heapq.merge(self._global_handlers, self._message_handlers[type(message.body)]))
        try:
            for priority, callback in queue:
                try:
                    if self._verbose:
                        LOG.debug("invoking %s (priority=%s): %s", callback, priority, message)
                    self.invoke(callback, message)
                except AbortProcessing:
                    raise
                except Exception, ex:
                    LOG.exception("Callback failed: %s. Failed to send message: %s", callback, message)
                    if self.raise_errors:
                        raise
                    # avoid a circular loop
                    if queue == self._error_handlers:
                        continue
                    try:
                        self._send_error(message, callback, ex)
                    except AbortProcessing:
                        raise
                    except:
                        LOG.exception("Failed to send error. This generally should not happen.")

        except AbortProcessing:
            LOG.info("processing aborted.")
            self.trx._deferred = []
            return

        while self.trx._deferred:
            LOG.info("sending queued_message")
            self._send_message(self.trx._deferred.pop(0))
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    return_value = 0

    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    node_generators = _start_subprocesses(halt_event)
    merge_manager = heapq.merge(*node_generators)

    try:
        _manage_subprocesses(halt_event, merge_manager)
    except Exception as instance:
        log.exception(instance)
        return_value = 1

    return return_value
def calculate_results(host, events):

    perf_tracker = perf.PerformanceTracker(host.sim_params)

    all_events = heapq.merge(
        ((event.dt, event) for event in events),
        ((event.dt, event) for event in host.benchmark_events))

    filtered_events = [(date, filt_event) for (date, filt_event)
                       in all_events if date <= events[-1].dt]
    filtered_events.sort(key=lambda x: x[0])
    grouped_events = itertools.groupby(filtered_events, lambda x: x[0])
    results = []

    bm_updated = False
    for date, group in grouped_events:
        for _, event in group:
            perf_tracker.process_event(event)
            if event.type == DATASOURCE_TYPE.BENCHMARK:
                bm_updated = True
        if bm_updated:
            msg = perf_tracker.handle_market_close()
            results.append(msg)
            bm_updated = False
    return results
Example #26
def dsorted(iterable, buffer_size=1000000, tempdir="."):  # buffer_size must be an int for islice
    from disco.compat import pickle_load, pickle_dump
    from heapq import merge
    from itertools import islice
    from tempfile import TemporaryFile

    def read(handle):
        while True:
            try:
                yield pickle_load(handle)
            except EOFError:
                return

    iterator = iter(iterable)
    subiters = []
    while True:
        buffer = sorted(islice(iterator, buffer_size))
        handle = TemporaryFile(dir=tempdir)
        for item in buffer:
            pickle_dump(item, handle, -1)
        handle.seek(0)
        subiters.append(read(handle))
        if len(buffer) < buffer_size:
            break
    return merge(*subiters)
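
A usage sketch for dsorted, assuming the disco package is importable; the tiny buffer_size just forces spilling to temporary files:

import random

values = [random.randrange(1000) for _ in range(10)]
assert list(dsorted(values, buffer_size=3)) == sorted(values)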
Example #27
def count_variants(args):

    print('counting variants in legend file(s)', ' '.join(args.legend), file=sys.stderr)

    ## file1 and file2 can be identical if only one file is provided
    file1 = args.legend[0]
    file2 = args.legend[-1]

    with gzip.open(file1, 'rt') as fd1, gzip.open(file2, 'rt') as fd2:
        fd1.readline()
        fd2.readline()
        it1 = key_generator(fd1)
        it2 = key_generator(fd2)
        prev_index = None
        d_index2variants = {}
        for key, line in heapq.merge(it1, it2):
            pos = key[0]
            index = 1+(pos-1)//args.int_size
            if index != prev_index:
                set_variants = set([key])
                prev_index = index
                print(index, file=sys.stderr)
            else:
                set_variants.add(key)
            d_index2variants[index] = len(set_variants)

    pos_max = pos

    return d_index2variants, pos_max
Example #28
def sort_diskbased(stream, field, nsize=100000):
    buf = []
    files = []
    count = 0
    t = None

    def iter_on_file(f):
        try:
            while True:
                (key, v) = cPickle.load(f)
                yield (key, t._make(v))
        except EOFError:
            f.close()
    for elt in stream:
        if isinstance(elt, StreamHeader):
            t = elt.t
            yield elt
        elif isinstance(elt, StreamFooter):
            buf.sort()
            iterables = [iter_on_file(f) for f in files] + [itertools.imap(lambda obj: (getattr(obj, field), obj), buf)]
            for (k, row) in heapq.merge(*iterables):
                yield row
            yield elt
        else:
            buf.append(elt)
            count = count + 1
            if count % nsize == 0:
                buf.sort(key=lambda obj: getattr(obj, field))
                f = TemporaryFile()
                for item in buf:
                    cPickle.dump((getattr(item, field), list(item)), f, cPickle.HIGHEST_PROTOCOL)
                f.flush()
                files.append(f)
                del buf[:]
Example #29
    def prepare_input_map(self, task, stage, params):
        # The input map maps a label to a sequence of inputs with that
        # label.
        map = defaultdict(list)

        for l, i in util.chainify(self.labelexpand(task, stage, i, params) for i in self.get_inputs()):
            if stage.combine_labels:
                map[0].append(i)
            else:
                map[l].append(i)

        if stage.sort:
            newmap = {}
            if stage.input_sorted:
                for label, inputs in map.iteritems():
                    input = merge(*(merge_wrapper(inp, sort_range=stage.sort, desc=stage.desc) for inp in inputs))
                    newmap[label] = [input]
            else:
                for label, inputs in map.iteritems():

                    input = chainify(shuffled(inputs))
                    newmap[label] = [disk_sort(input,
                                               task.path('sort.dl'),
                                               sort_keys=stage.sort,
                                               sort_buffer_size='15%',
                                               binaries=stage.binaries,
                                               desc=stage.desc)]
            map = newmap
        #print "OUTSIE: %s" % str(map)
        return map
def get_Nranked_list_kdtree(query_set,N):
	try:
		query_set_ind = [ vocab_index[query] for query in query_set ]
	except KeyError:
		print "query word not present"
		return
	query_vec = [vec[i] for i in query_set_ind]
	#?use distance_upper_bound for set_size queries sequentially
	# query using scipy kdtree
	# d_list,knn_list = kd.query(query_vec,k=N+len(query_set_ind)) #, eps=eps)
	# query using sklearn
	d_list,knn_list = neigh.kneighbors(X=query_vec, n_neighbors=N+len(query_set_ind), return_distance=True)
	#?use heap of size set_size and push top elements from set_size ranked list until N elements are popped
	index_dist_list = []
	for d,index in zip(d_list,knn_list):
		filtered=[(dt,idx) for (dt,idx) in list(zip(d,index)) if idx not in query_set_ind]
		index_dist_list.append(filtered)
	knn=[]
	sel=set()
	count=0
	for (d,idx) in merge(*index_dist_list):
		if idx not in sel:
			sel.add(idx)
			knn.append(vocab[idx])
			count+=1
		if count==N:
			break
	return knn
import heapq

a = [1, 4, 7, 10]
b = [2, 5, 6, 11]
for c in heapq.merge(a, b):
    print(c)

# Merging files
"""
 Notably, heapq.merge does not read all of the data into a heap up front or pre-sort it,
 nor does it do any check that the inputs are sorted. It simply inspects the head of each
 sequence and returns the smallest item, and this continues until every element of every
 input sequence has been consumed.
"""
with open('sorted_file_1', 'rt') as file1, \
        open('sorted_file_2', 'rt') as file2, \
        open('merged_file', 'wt') as outf:
    for line in heapq.merge(file1, file2):
        outf.write(line)
    print '-' * 20

    nums2 = [10, 16, 1, 2, -10, 999, -999]
    heap = []
    for num in nums2:
        heapq.heappush(heap, num)
    print[heapq.heappop(heap) for _ in range(len(nums2))]

    heapq.heapify(nums2)  # turn the list into a heap, in place and in linear time
    print[heapq.heappop(nums2) for _ in range(len(nums2))]

    print '-' * 20

    num1 = [32, 3, 5, 34, 54, 23, 132]
    num2 = [23, 2, 12, 656, 324, 23, 54]
    num1.sort()
    num2.sort()

    res = heapq.merge(num1, num2)  # merge multiple sorted sequences into one sorted sequence; returns an iterator over the sorted values
    ret = []
    for r in res:
        ret.append(r)
    print ret

    print '-' * 20

    nums3 = [1, 2, 3, 4, 5]
    heapq.heapify(nums3)
    heapq.heapreplace(nums3, 23)  # pop the smallest element from the heap and push a new one
    print[heapq.heappop(nums3) for _ in range(len(nums3))]
Example #33
def merge_logs(files, offsets, fnamedict = {}):
    #print options.tzoffset
    for (epoch_ms, entry) in heapq.merge(*[decorated_log_split(f, offsets[f.name], epochtimemillis_keyfunc, fnamedict) for f in files]):
        yield (epoch_ms, entry)
 def mergeSortedtempFiles(self):
     # mergedNo is a generator of per-file iterators over the sorted numbers,
     # e.g. ((1, 4, 6), (3, 7, 8), ...); being lazy, it keeps nothing in memory.
     mergedNo = (map(int, tempFileHandler) for tempFileHandler in
                 self.sortedTempFileHandlerList)
     # heapq.merge takes the sorted iterators and returns a single sorted
     # iterator, so again no data is materialized in memory.
     sortedCompleteData = heapq.merge(*mergedNo)
     return sortedCompleteData
Example #35
from heapq import merge

arr1 = [1,3,4,5]   
arr2 = [2,4,6,8]

print(list(merge(arr1,arr2)))
Example #36
    def merge_postings_list_dict(self, posting_list_dict, max_k):
        '''
        Merges dictionary of postings list for a query
        '''

        # Doc ids of intersection of postings list
        search_doc_id = []
        postings_dict_size_heap = []
        all_postings_list = []
        # print(posting_list_dict)
        for category in posting_list_dict:
            for token in posting_list_dict[category]:
                postings_dict_size_heap.append(
                    (posting_list_dict[category][token]["size"], category,
                     token))
                all_postings_list.append(
                    posting_list_dict[category][token]["postings_list"])

        postings_dict_size_heap_join = list(postings_dict_size_heap)
        # print(postings_dict_size_heap)
        # If only one token
        if len(postings_dict_size_heap) == 1:
            return self.find_top_k(
                posting_list_dict[category][token]["postings_list"], max_k)

        heapq.heapify(postings_dict_size_heap)
        first_merge = True

        while postings_dict_size_heap:
            # print("merging")
            if len(postings_dict_size_heap) > 1 and first_merge:
                posting_tuple_1 = heapq.heappop(postings_dict_size_heap)
                posting_tuple_2 = heapq.heappop(postings_dict_size_heap)
                search_doc_id = self.merge_postings_list(
                    posting_list_dict[posting_tuple_1[1]][posting_tuple_1[2]]
                    ["postings_list"], posting_list_dict[posting_tuple_2[1]][
                        posting_tuple_2[2]]["postings_list"])
                first_merge = False
            else:
                posting_tuple_1 = heapq.heappop(postings_dict_size_heap)
                search_doc_id = self.merge_postings_list(
                    search_doc_id, posting_list_dict[posting_tuple_1[1]][
                        posting_tuple_1[2]]["postings_list"])

        # print("check:", search_doc_id)
        query_result = []
        if len(search_doc_id) < max_k:
            search_doc_id_old = search_doc_id.copy()
            #if no intersection found join all
            merge_iter = heapq.merge(*all_postings_list)
            prev = (-1, -1)
            dict_doc = {}
            if len(search_doc_id) > 0:
                for tuple in search_doc_id:

                    dict_doc[tuple[0]] = 1

            for idx, doc_tuple in enumerate(merge_iter):

                # if same document found add the scores and continue
                if prev[0] == doc_tuple[0]:
                    prev = (doc_tuple[0], doc_tuple[1] + prev[1])
                    continue
                else:

                    if prev[0] > -1 and dict_doc.get(prev[0]) is None:
                        search_doc_id.append(prev)
                    prev = doc_tuple
            # Writing the last doc

            if prev[0] > -1 and dict_doc.get(prev[0]) is None:
                search_doc_id.append(prev)

            query_result_old = self.find_top_k(search_doc_id_old, max_k)
            query_result = self.find_top_k(search_doc_id, max_k)

            doc = []
            for tuple in query_result_old:
                doc.append(tuple[0])

            for tuple in query_result:
                if tuple[0] not in doc:
                    query_result_old.append(tuple)
            query_result = query_result_old

        if len(query_result) == 0:
            query_result = self.find_top_k(search_doc_id, max_k)

        return query_result
Example #37
from collections.abc import Iterable

def flatten(items, ignore_types=(str, bytes)):
    for x in items:
        if isinstance(x,Iterable) and not isinstance(x,ignore_types):
            yield from flatten(x)
        else:
            yield x
items = [1,2,[3,4,[5,6],7],8]

for x in flatten(items):
    print(x)

# Iterating over merged sorted iterables in order
# Given a number of sorted sequences, merge them into one sorted sequence and iterate over it
import heapq
a = [1,4,7,10]
b = [2,5,6,11]
for c in heapq.merge(a,b):
    print(c)

# heapq.merge() requires all of its input sequences to already be sorted. Notably, it does not read
# all the data into a heap up front or pre-sort it, nor does it check that the inputs are sorted.
# It simply inspects the head of each sequence and returns the smallest item, continuing until every
# element of every input sequence has been consumed.

# Using an iterator in place of an infinite while loop
CHUNKSIZE = 8192
def readers(s):
    while True:
        data = s.recv(CHUNKSIZE)
        if data == b'':
            break
        #process_data(data)

def reader2(s):
Example #38
 def _get_others_current_hand(self, player):
     player_up = self.players[get_upstream_player_id(player, self.players)]
     player_down = self.players[get_downstream_player_id(
         player, self.players)]
     others_hand = merge(player_up.current_hand, player_down.current_hand, key=functools.cmp_to_key(doudizhu_sort_card))
     return cards2str(list(others_hand))
Example #39
def read_events(database, all_forks, has_thread_flag):
    # In here, a file is any file on the filesystem. A binary is a file, that
    # gets executed. A process is a system-level task, identified by its pid
    # (pids don't get reused in the database).
    # What I call program is the couple (process, binary), so forking creates a
    # new program (with the same binary) and exec'ing creates a new program as
    # well (with the same process)
    # Because of this, fork+exec will create an intermediate program that
    # doesn't do anything (new process but still old binary). If that program
    # doesn't do anything worth showing on the graph, it will be erased, unless
    # all_forks is True (--all-forks).

    if PY3:
        # On PY3, connect() only accepts unicode
        conn = sqlite3.connect(str(database))
    else:
        conn = sqlite3.connect(database.path)
    conn.row_factory = sqlite3.Row

    # This is a bit weird. We need to iterate on all types of events at the
    # same time, ordering by timestamp, so we decorate-sort-undecorate
    # Decoration adds timestamp (for sorting) and tags by event type, one of
    # 'process', 'open' or 'exec'

    # Reads processes from the database
    process_cursor = conn.cursor()
    if has_thread_flag:
        sql = '''
        SELECT id, parent, timestamp, is_thread
        FROM processes
        ORDER BY id
        '''
    else:
        sql = '''
        SELECT id, parent, timestamp, 0 as is_thread
        FROM processes
        ORDER BY id
        '''
    process_rows = process_cursor.execute(sql)
    processes = {}
    all_programs = []

    # ... and opened files...
    file_cursor = conn.cursor()
    file_rows = file_cursor.execute('''
        SELECT name, timestamp, mode, process, is_directory
        FROM opened_files
        ORDER BY id
        ''')
    binaries = set()
    files = set()
    edges = OrderedSet()

    # ... as well as executed files.
    exec_cursor = conn.cursor()
    exec_rows = exec_cursor.execute('''
        SELECT name, timestamp, process, argv
        FROM executed_files
        ORDER BY id
        ''')

    # Loop on all event lists
    logger.info("Getting all events from database...")
    rows = heapq.merge(((r[2], 'process', r) for r in process_rows),
                       ((r[1], 'open', r) for r in file_rows),
                       ((r[1], 'exec', r) for r in exec_rows))
    runs = []
    run = None
    for ts, event_type, data in rows:
        if event_type == 'process':
            r_id, r_parent, r_timestamp, r_thread = data
            logger.debug("Process %d created (parent %r)", r_id, r_parent)
            if r_parent is not None:
                parent = processes[r_parent]
                binary = parent.binary
            else:
                run = Run(len(runs))
                runs.append(run)
                parent = None
                binary = None
            if r_parent is not None:
                argv = processes[r_parent].argv
            else:
                argv = None
            process = Process(r_id, run, parent, r_timestamp, r_thread, False,
                              binary, argv,
                              C_INITIAL if r_parent is None else C_FORK)
            processes[r_id] = process
            all_programs.append(process)
            run.processes.append(process)

        elif event_type == 'open':
            r_name, r_timestamp, r_mode, r_process, r_directory = data
            r_name = normalize_path(r_name)
            logger.debug("File open: %s, process %d", r_name, r_process)
            if not (r_mode & FILE_WDIR or r_directory):
                process = processes[r_process]
                files.add(r_name)
                edges.add((process, r_name, r_mode, None))

        elif event_type == 'exec':
            r_name, r_timestamp, r_process, r_argv = data
            r_name = normalize_path(r_name)
            argv = tuple(r_argv.split('\0'))
            if not argv[-1]:
                argv = argv[:-1]
            logger.debug("File exec: %s, process %d", r_name, r_process)
            process = processes[r_process]
            binaries.add(r_name)
            # Here we split this process in two "programs", unless the previous
            # one hasn't done anything since it was created via fork()
            if not all_forks and not process.acted:
                process.binary = r_name
                process.created = C_FORKEXEC
                process.acted = True
                process.argv = argv
            else:
                process = Process(
                    process.pid,
                    run,
                    process,
                    r_timestamp,
                    False,
                    True,  # Hides exec only once
                    r_name,
                    argv,
                    C_EXEC)
                all_programs.append(process)
                processes[r_process] = process
                run.processes.append(process)
            files.add(r_name)
            edges.add((process, r_name, None, argv))

    process_cursor.close()
    file_cursor.close()
    exec_cursor.close()
    conn.close()

    return runs, files, edges
Example #40
import heapq
import random
import sys

import numpy as np
from mpi4py import MPI

dataset_size = int(sys.argv[1])
data = []
for _ in range(dataset_size):
    data.append(random.randint(1, dataset_size))

start = MPI.Wtime()
comm = MPI.COMM_WORLD
thread_num = comm.size

new_list = []
for rank in range(thread_num):
    new_list = np.array_split(data, thread_num)

v = comm.scatter(new_list, 0)
for i in range(len(v)):
    for j in range(len(v) - i - 1):
        if v[j] > v[j + 1]:
            v[j], v[j + 1] = v[j + 1], v[j]
g = comm.gather(v, 0)

data_sorted = []
if comm.rank == 0:
    for i in range(len(g)):
        data_sorted = list(heapq.merge(data_sorted, g[i]))
    end = MPI.Wtime()
    print("Size of data: ", dataset_size)
    print("Number of threads: ", thread_num)
    print("Time: ", end - start)
    print("\n")
def merge_sort_file():
    with open('sorted_file1','rt') as file_1, \
        open('sorted_file2','rt') as file_2, \
        open('merged_file','wt') as out_f:
        for line in heapq.merge(file_1, file_2):
            out_f.write(line)
Example #42
import heapq

h1 = [2, 4, 6, 8]
h2 = [1, 3, 5, 7, 9]
l = list(heapq.merge(h1, h2))
print(l)
Example #43
def merge_sorted_arrays_pythonic(sorted_arrays):
    return list(heapq.merge(*sorted_arrays))
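
A quick example:

print(merge_sorted_arrays_pythonic([[1, 4, 7], [2, 5], [3, 6]]))  # [1, 2, 3, 4, 5, 6, 7]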
Example #44
 def _region_iterator(self, ctx):
     return self.ivl.cut_left_iter(
         merge(ctx.syms.left_edges(self.ivl), ctx.cmts.cuts(self.ivl)))
Example #45
    def aggregate_combined_blocks(
            blocks: List[Block[ArrowRow]], key: GroupKeyT,
            aggs: Tuple[AggregateFn]) -> Tuple[Block[ArrowRow], BlockMetadata]:
        """Aggregate sorted, partially combined blocks with the same key range.

        This assumes blocks are already sorted by key in ascending order,
        so we can do merge sort to get all the rows with the same key.

        Args:
            blocks: A list of partially combined and sorted blocks.
            key: The column name of key or None for global aggregation.
            aggs: The aggregations to do.

        Returns:
            A block of [k, v_1, ..., v_n] columns and its metadata where k is
            the groupby key and v_i is the corresponding aggregation result for
            the ith given aggregation.
            If key is None then the k column is omitted.
        """

        key_fn = (lambda r: r[r._row.schema.names[0]]
                  ) if key is not None else (lambda r: 0)

        iter = heapq.merge(
            *[ArrowBlockAccessor(block).iter_rows() for block in blocks],
            key=key_fn)
        next_row = None
        builder = ArrowBlockBuilder()
        while True:
            try:
                if next_row is None:
                    next_row = next(iter)
                next_key = key_fn(next_row)
                next_key_name = next_row._row.schema.names[
                    0] if key is not None else None

                def gen():
                    nonlocal iter
                    nonlocal next_row
                    while key_fn(next_row) == next_key:
                        yield next_row
                        try:
                            next_row = next(iter)
                        except StopIteration:
                            next_row = None
                            break

                # Merge.
                first = True
                accumulators = [None] * len(aggs)
                resolved_agg_names = [None] * len(aggs)
                for r in gen():
                    if first:
                        count = collections.defaultdict(int)
                        for i in range(len(aggs)):
                            name = aggs[i].name
                            # Check for conflicts with existing aggregation
                            # name.
                            if count[name] > 0:
                                name = ArrowBlockAccessor._munge_conflict(
                                    name, count[name])
                            count[name] += 1
                            resolved_agg_names[i] = name
                            accumulators[i] = r[name]
                        first = False
                    else:
                        for i in range(len(aggs)):
                            accumulators[i] = aggs[i].merge(
                                accumulators[i], r[resolved_agg_names[i]])
                # Build the row.
                row = {}
                if key is not None:
                    row[next_key_name] = next_key

                for agg, agg_name, accumulator in zip(aggs, resolved_agg_names,
                                                      accumulators):
                    row[agg_name] = agg.finalize(accumulator)

                builder.add(row)
            except StopIteration:
                break

        ret = builder.build()
        return ret, ArrowBlockAccessor(ret).get_metadata(None)
Example #46
    async def _update_function(
        self,
        instance_name: str,
        from_token: Token,
        current_token: Token,
        target_row_count: int,
    ) -> StreamUpdateResult:

        # the events stream merges together three separate sources:
        #  * new events
        #  * current_state changes
        #  * events which were previously outliers, but have now been de-outliered.
        #
        # The merge operation is complicated by the fact that we only have a single
        # "stream token" which is supposed to indicate how far we have got through
        # all three streams. It's therefore no good to return rows 1-1000 from the
        # "new events" table if the state_deltas are limited to rows 1-100 by the
        # target_row_count.
        #
        # In other words: we must pick a new upper limit, and must return *all* rows
        # up to that point for each of the three sources.
        #
        # Start by trying to split the target_row_count up. We expect to have a
        # negligible number of ex-outliers, and a rough approximation based on recent
        # traffic on sw1v.org shows that there are approximately the same number of
        # event rows between a given pair of stream ids as there are state
        # updates, so let's split our target_row_count among those two types. The target
        # is only an approximation - it doesn't matter if we end up going a bit over it.

        target_row_count //= 2

        # now we fetch up to that many rows from the events table

        event_rows = await self._store.get_all_new_forward_event_rows(
            from_token, current_token, target_row_count)  # type: List[Tuple]

        # we rely on get_all_new_forward_event_rows strictly honouring the limit, so
        # that we know it is safe to just take upper_limit = event_rows[-1][0].
        assert (len(event_rows) <= target_row_count
                ), "get_all_new_forward_event_rows did not honour row limit"

        # if we hit the limit on event_updates, there's no point in going beyond the
        # last stream_id in the batch for the other sources.

        if len(event_rows) == target_row_count:
            limited = True
            upper_limit = event_rows[-1][0]  # type: int
        else:
            limited = False
            upper_limit = current_token

        # next up is the state delta table.
        (
            state_rows,
            upper_limit,
            state_rows_limited,
        ) = await self._store.get_all_updated_current_state_deltas(
            from_token, upper_limit, target_row_count)

        limited = limited or state_rows_limited

        # finally, fetch the ex-outliers rows. We assume there are few enough of these
        # not to bother with the limit.

        ex_outliers_rows = await self._store.get_ex_outlier_stream_rows(
            from_token, upper_limit)  # type: List[Tuple]

        # we now need to turn the raw database rows returned into tuples suitable
        # for the replication protocol (basically, we add an identifier to
        # distinguish the row type). At the same time, we can limit the event_rows
        # to the max stream_id from state_rows.

        event_updates = (
            (stream_id, (EventsStreamEventRow.TypeId, rest))
            for (stream_id, *rest) in event_rows
            if stream_id <= upper_limit)  # type: Iterable[Tuple[int, Tuple]]

        state_updates = (
            (stream_id, (EventsStreamCurrentStateRow.TypeId, rest))
            for (stream_id,
                 *rest) in state_rows)  # type: Iterable[Tuple[int, Tuple]]

        ex_outliers_updates = ((stream_id, (EventsStreamEventRow.TypeId, rest))
                               for (stream_id, *rest) in ex_outliers_rows
                               )  # type: Iterable[Tuple[int, Tuple]]

        # we need to return a sorted list, so merge them together.
        updates = list(
            heapq.merge(event_updates, state_updates, ex_outliers_updates))
        return updates, upper_limit, limited
Example #47
print('\nOutput files:\n%s' %
      ('\n'.join([floci, fpctmet, fcntmet, ftotreads])))
## ----------------------------
## 1. Create union of positions
## ----------------------------

## See http://stackoverflow.com/questions/12460943/merging-pre-sorted-files-without-reading-everything-into-memory
print('\nGenerating union of all positions...')

#with input_fin[0] as f0, input_fin[1] as f1, input_fin[2] as f2, input_fin[3] as f3, input_fin[4] as f4, input_fin[5] as f5:
#with input_fin as sources:
#with fout_loci as dest:
decorated = [((extract_key(line), bdgLine_to_locusLine(line)) for line in f
              if line.strip() != '') for f in input_fin]
merged = heapq.merge(*decorated)
undecorated = imap(itemgetter(-1), merged)
thisline = ''
nlines = 0
print('Outputting')
for x in undecorated:
    if x != thisline:
        fout_loci.write(x + '\n')
        nlines += 1
        if nlines % 500000 == 0:
            print('%s lines to locus file' % (nlines))
#        if nlines > 50000:
#            break
    thisline = x
fout_loci.close()
for x in input_fin:
Example #48
def circle_around(x0, y0, end_x):
    """ Circle the cluster """
    # array with labeled bin clusters, xstart, ystart
    for dist_point in merge(*(distance_column(x0, x, y0)
                              for x in range(end_x))):
        yield dist_point
Example #49
import heapq
import random

random.seed(37)
data = []
for index in range(4):
    new_data = list(random.sample(range(1, 101), 5))
    new_data.sort()
    data.append(new_data)

for index, arr in enumerate(data):
    print('index:[{}], arr: {}'.format(index, arr))

# Merges lazily, without loading everything into memory
for value in heapq.merge(*data):
    print(value)
Example #50
    def _do_one_inner_iteration(self):
        r"""
        Executes one inner iteration
        """
        self._logger.debug("    Inner Iteration")             
        # Fill throat and connecting pore
        # Pop out the largest throat (lowest Pcap) in the list, read the throat number
        tinvade = heapq.heappop(self.tpoints[self.current_cluster-1])[1]
        self._logger.debug( ' ')
        self._logger.debug( '--------------------------------------------------')
        self._logger.debug( 'STEP')
        self._logger.debug(self.tseq)
        self._logger.debug( 'trying to access cluster: ')
        self._logger.debug(self.current_cluster)
        self._logger.debug( 'when these clusters are active: ')
        self._logger.debug(sp.nonzero(self.cluster_data['active'])[0])
        self._logger.debug( 'Haines at throat,time: ')
        self._logger.debug(tinvade)
        self._logger.debug(self.sim_time)

        # Mark throat as invaded
        self.Tsequence[tinvade] = self.tseq
        self.Ttime[tinvade] = self.sim_time
        # Remove throat's contribution to the vol_coef
        self.cluster_data['vol_coef'][self.current_cluster-1] = self.cluster_data['vol_coef'][self.current_cluster-1]-self.Tvol_coef[tinvade]
        # Mark pore as invaded
        Pores = self._net.get_connected_pores(tinvade)
        # If both pores are already invaded:
        if np.in1d(Pores,np.nonzero(self.Pinv)[0]).all():
            self.NewPore = -1
            # Label invaded throat with smaller cluster number
            #find cluster 1
            clusters = self.cluster_data['transform'][self.Pinv[Pores]-1]
            self._logger.debug('clusters = ')
            self._logger.debug(clusters)
            self.current_cluster = min(clusters)[0]
            self.Tinv[tinvade] = self.current_cluster
            # if pores are from 2 different clusters:
            if self.Pinv[Pores[0]]!=self.Pinv[Pores[1]] :
                # find name of larger cluster number                
                maxCluster = max(clusters)[0]
                self._logger.info(' ')
                self._logger.info('CLUSTERS COMBINING:')
                self._logger.info(self.current_cluster)
                self._logger.info(maxCluster)
                self._logger.info('at time')
                self._logger.info(self.sim_time)
                # update the cluster transform
                self.cluster_data['transform'][self.cluster_data['transform']==maxCluster] = [self.current_cluster][0]  
                # relabel all pores and throats from larger number with smaller number
                self.Pinv[np.where(self.Pinv==maxCluster)[0]] = self.current_cluster
                self.Tinv[np.where(self.Tinv==maxCluster)[0]] = self.current_cluster
                # append the list of throats for the other cluster to the current cluster                              
                self.tlists[self.current_cluster-1] = self.tlists[self.current_cluster-1] + self.tlists[maxCluster-1]
                # delete the throat lists on the other cluster     
                self.tlists[maxCluster-1] = []
                # merge the heaps of throat information
                self.tpoints[self.current_cluster-1] = list(heapq.merge(self.tpoints[self.current_cluster-1],self.tpoints[maxCluster-1]))
                # update the clusters' vol_coefs
                self.cluster_data['vol_coef'][self.current_cluster-1] += self.cluster_data['vol_coef'][maxCluster-1]
                self.cluster_data['vol_coef'][maxCluster-1] = 0  
                # update the clusters' pore volume
                self.cluster_data['pore_volume'][self.current_cluster-1] += self.cluster_data['pore_volume'][maxCluster-1]
                self.cluster_data['pore_volume'][maxCluster-1] = 0
                # update the clusters' flowrates
                self.cluster_data['flow_rate'][self.current_cluster-1] += self.cluster_data['flow_rate'][maxCluster-1]
                self.cluster_data['flow_rate'][maxCluster-1] = 0
                self._logger.debug( 'new flowrate for cluster ')
                self._logger.debug(self.current_cluster)
                self._logger.debug('is')
                self._logger.debug(self.cluster_data['flow_rate'][self.current_cluster-1])
                # check if either was inactive (broke through already)
                if self.cluster_data['active'][maxCluster-1] + self.cluster_data['active'][self.current_cluster-1]<2:
                    self._logger.debug('making clusters ')
                    self._logger.debug(self.current_cluster)
                    self._logger.debug('and')
                    self._logger.debug(maxCluster)
                    self._logger.debug('inactive due to one being inactive already')
                    self._logger.debug(self.cluster_data['active'][self.current_cluster-1])
                    self._logger.debug(self.cluster_data['active'][maxCluster-1])
                    self.cluster_data['active'][maxCluster-1] = 0
                    self.cluster_data['active'][self.current_cluster-1] = 0
                    self.cluster_data['haines_time'][self.current_cluster-1] = 100000000000000000000000000000000
                    self._logger.info(' ')
                    self._logger.info('CLUSTER MERGED WITH A BREAKTHROUGH CLUSTER')
                self._logger.info('making cluster ')
                self._logger.info(maxCluster)
                self._logger.info('inactive due to merge')
                # update the old cluster's activity and time
                self.cluster_data['haines_time'][maxCluster-1] = 100000000000000000000000000000000
                self.cluster_data['active'][maxCluster-1] = 0 
                # The commented-out line below would drop consecutive duplicate
                # (Pc_entry, throat) pairs from the merged list via itertools.groupby
                #self.tpoints[self.current_cluster-1] = list(k for k,v in itertools.groupby(self.tpoints[self.current_cluster-1]))
                self.tpoints[maxCluster-1] = []

        else:
            # label invaded throat with current cluster
            self.Tinv[tinvade] = self.current_cluster
            # find the uninvaded pore, NewPore
            self.NewPore = Pores[self.Pinv[Pores][:,0]==0][0]
            self._logger.debug( ' ')            
            self._logger.debug( 'INVADING PORE: ')
            self._logger.debug(self.NewPore)
            self._logger.debug('the other pore is one of: ')
            self._logger.debug(Pores)
            self._logger.debug( 'position: ')
            self._logger.debug(self._net.pore_properties['coords'][self.NewPore])
            # label that pore as invaded
            self.Pinv[self.NewPore] = self.current_cluster
            self.Pinv_original[self.NewPore] = self.current_cluster
            self.Ptime[self.NewPore] = self.sim_time
            self.Psequence[self.NewPore] = self.tseq 
            # update self.cluster_data['pore_volume']
            self.cluster_data['pore_volume'][self.current_cluster-1] += self._net.pore_properties['volume'][self.NewPore]
            # Make a list of all throats neighboring pores in the cluster
            # Update interface list        
            neighbors = self._net.get_neighbor_throats(self.NewPore)
            for j in neighbors:
                # If a throat is not labelled as invaded by the cluster, it must be an interfacial throat
                if (j not in self.tlists[self.current_cluster-1]):
                    self._logger.debug( 'new throat:')
                    self._logger.debug(j)
                    self._logger.debug('connecting pores:')
                    self._logger.debug(self._net.get_connected_pores(j))
                    # Add this throat data (pressure, number) to this cluster's "heap" of throat data.
                    heapq.heappush(self.tpoints[self.current_cluster-1],(self._net.throat_properties['Pc_entry'][j],j))
                    # Add new throat number to throat list for this cluster
                    self.tlists[self.current_cluster-1].append(j)
                    # Update the cluster's vol_coef
                    self.cluster_data['vol_coef'][self.current_cluster-1] = self.cluster_data['vol_coef'][self.current_cluster-1]+self.Tvol_coef[j]
        # Find next Haines Jump info
        # Make sure you are not re-invading a throat; check for an empty heap
        # before peeking at it, otherwise the peek raises IndexError once every
        # remaining throat has been popped
        while self.tpoints[self.current_cluster-1] and self.Tinv[self.tpoints[self.current_cluster-1][0][1]] > 0:
            tremove = heapq.heappop(self.tpoints[self.current_cluster-1])[1]
            self.cluster_data['vol_coef'][self.current_cluster-1] = self.cluster_data['vol_coef'][self.current_cluster-1]-self.Tvol_coef[tremove]
        if self.tpoints[self.current_cluster-1] == []:
            self._logger.debug('making cluster ')
            self._logger.debug(self.current_cluster)
            self._logger.debug('inactive due to tpoints = [] ')
            self.cluster_data['active'][self.current_cluster-1] = 0
            self.cluster_data['haines_time'][self.current_cluster-1] = 100000000000000000000000000000000
        else:
            next_throat = self.tpoints[self.current_cluster-1][0][1]
            self.cluster_data['haines_throat'][self.current_cluster-1] = next_throat
            self.cluster_data['haines_pressure'][self.current_cluster-1] = self.tpoints[self.current_cluster-1][0][0]
            self.cluster_data['cap_volume'][self.current_cluster-1] = self.cluster_data['haines_pressure'][self.current_cluster-1]*self.cluster_data['vol_coef'][self.current_cluster-1]
            
        # Calculate the new Haines jump time
        self._logger.debug( 'haines time before last stage:')
        self._logger.debug( self.cluster_data['haines_time'])
        if self.tpoints[self.current_cluster-1] == []:
            self._logger.debug('making cluster ')
            self._logger.debug(self.current_cluster)
            self._logger.debug('inactive due to self.tpoints being empty for that cluster')
            self.cluster_data['active'][self.current_cluster-1] = 0
            self.cluster_data['haines_time'][self.current_cluster-1] = 100000000000000000000000000000000
        if self.cluster_data['active'][self.current_cluster-1] == 1:
            self.cluster_data['haines_time'][self.current_cluster-1] = (self.cluster_data['pore_volume'][self.current_cluster-1]+self.cluster_data['cap_volume'][self.current_cluster-1])/self.cluster_data['flow_rate'][self.current_cluster-1]
        if self.cluster_data['haines_time'][self.current_cluster-1] < self.sim_time:
            self.cluster_data['haines_time'][self.current_cluster-1] = self.sim_time + 0.01
        self._logger.debug('haines time at the end of the throat stuff')
        self._logger.debug(self.cluster_data['haines_time'])
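A standalone sketch of the cluster-merge step above: Python heaps are only partially ordered, so heapq.merge (which assumes fully sorted inputs) can return a list that violates the heap invariant, whereas concatenating and re-heapifying is always safe. The throat data below is toy data:

import heapq

# Two per-cluster heaps of (Pc_entry, throat_index) pairs, as built by heappush.
heap_a = [(1.0, 10), (5.0, 11), (2.0, 12)]
heap_b = [(3.0, 20)]

# Safe merge: concatenate, then restore the heap invariant.
combined = heap_a + heap_b
heapq.heapify(combined)

# The lowest entry pressure is guaranteed to be at the root.
assert combined[0] == (1.0, 10)
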
Exemple #51
0
    def get_records_sorted(self, ctx, filters, limit, marker, **kwargs):
        """Get a cross-cell list of records matching filters.

        This iterates cells in parallel generating a unified and sorted
        list of records as efficiently as possible. It takes care to
        iterate the list as infrequently as possible. We wrap the results
        in RecordWrapper objects so that they are sortable by
        heapq.merge(), which requires that the '<' operator just works.

        Our sorting requirements are encapsulated into the
        RecordSortContext provided to the constructor for this object.

        This function is a generator that yields records from the database,
        like those returned by instance_get_all_by_filters_sort() in the DB API.

        NOTE: Since we do these in parallel, a nonzero limit will be passed
        to each database query, although the limit will be enforced in the
        output of this function. Meaning, we will still query $limit from each
        database, but only return $limit total results.

        """

        if marker:
            # A marker identifier was provided from the API. Call this
            # the 'global' marker as it determines where we start the
            # process across all cells. Look up the record in
            # whatever cell it is in and record the values for the
            # sort keys so we can find the marker instance in each
            # cell (called the 'local' marker).
            global_marker_record = self.get_marker_record(ctx, marker)
            global_marker_values = [
                global_marker_record[key] for key in self.sort_ctx.sort_keys
            ]

        def do_query(ctx):
            """Generate RecordWrapper(record) objects from a cell.

            We do this inside the thread (created by
            scatter_gather_all_cells()) so that we return wrappers and
            avoid having to iterate the combined result list in the
            caller again. This is run against each cell by the
            scatter_gather routine.
            """

            # The local marker is an identifier of a record in a cell
            # that is found by the special method
            # get_marker_by_values(). It should be the next record
            # in order according to the sort provided, but after the
            # marker instance which may have been in another cell.
            local_marker = None

            # Since the regular DB query routines take a marker and assume that
            # the marked record was the last entry of the previous page, we
            # may need to prefix it to our result query if we're not the cell
            # that had the actual marker record.
            local_marker_prefix = []

            marker_id = self.marker_identifier

            if marker:
                # FIXME(danms): If we knew which cell we were in here, we could
                # avoid looking up the marker again. But, we don't currently.

                local_marker = self.get_marker_by_values(
                    ctx, global_marker_values)
                if local_marker:
                    if local_marker != marker:
                        # We did find a marker in our cell, but it wasn't
                        # the global marker. Thus, we will use it as our
                        # marker in the main query below, but we also need
                        # to prefix that result with this marker instance
                        # since the result below will not return it and it
                        # has not been returned to the user yet. Note that
                        # we do _not_ prefix the marker instance if our
                        # marker was the global one since that has already
                        # been sent to the user.
                        local_marker_filters = copy.copy(filters)
                        if marker_id not in local_marker_filters:
                            # If an $id filter was provided, it will
                            # have included our marker already if this
                            # instance is desired in the output
                            # set. If it wasn't, we specifically query
                            # for it. If the other filters would have
                            # excluded it, then we'll get an empty set
                            # here and not include it in the output as
                            # expected.
                            local_marker_filters[marker_id] = [local_marker]
                        local_marker_prefix = self.get_by_filters(
                            ctx,
                            local_marker_filters,
                            limit=1,
                            marker=None,
                            **kwargs)
                else:
                    # There was a global marker but everything in our
                    # cell is _before_ that marker, so we return
                    # nothing. If we didn't have this clause, we'd
                    # pass marker=None to the query below and return a
                    # full unpaginated set for our cell.
                    return []

            main_query_result = self.get_by_filters(ctx,
                                                    filters,
                                                    limit=limit,
                                                    marker=local_marker,
                                                    **kwargs)

            return (RecordWrapper(self.sort_ctx,
                                  inst) for inst in itertools.chain(
                                      local_marker_prefix, main_query_result))

        # NOTE(tssurya): When the below routine provides sentinels to indicate
        # a timeout on a cell, we ignore that cell to avoid the crash when
        # doing the merge below and continue merging the results from the 'up'
        # cells.
        # TODO(tssurya): Modify this to return the minimal available info from
        # the down cells.
        if self.cells:
            results = context.scatter_gather_cells(ctx, self.cells, 60,
                                                   do_query)
        else:
            results = context.scatter_gather_all_cells(ctx, do_query)
        for cell_uuid in list(results):
            if results[cell_uuid] in (context.did_not_respond_sentinel,
                                      context.raised_exception_sentinel):
                LOG.warning(
                    "Cell %s is not responding and hence skipped "
                    "from the results.", cell_uuid)
                results.pop(cell_uuid)

        # If a limit was provided, it was passed to the per-cell query
        # routines.  That means we have NUM_CELLS * limit items across
        # results. So, we need to consume from that limit below and
        # stop returning results.
        limit = limit or 0

        # Generate results from heapq so we can return the inner
        # instance instead of the wrapper. This is basically free
        # as it works as our caller iterates the results.
        for i in heapq.merge(*results.values()):
            yield i._db_record
            limit -= 1
            if limit == 0:
                # We'll only hit this if limit was nonzero and we just
                # generated our last one
                return
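As the docstring notes, the wrappers only have to make records orderable so heapq.merge() can compare items coming from different cells. A minimal sketch of what such a RecordWrapper might look like (illustrative only: the real class lives elsewhere in this module, and the sort_keys/sort_dirs attributes assumed on the sort context are not shown here):

class RecordWrapper(object):
    def __init__(self, sort_ctx, db_record):
        self._sort_ctx = sort_ctx
        self._db_record = db_record

    def __lt__(self, other):
        # Compare records key by key, honoring each key's sort direction.
        for key, direction in zip(self._sort_ctx.sort_keys,
                                  self._sort_ctx.sort_dirs):
            mine, theirs = self._db_record[key], other._db_record[key]
            if mine == theirs:
                continue
            less = mine < theirs
            return less if direction == 'asc' else not less
        return False
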
Exemple #52
0
# Output: [4]
print(a)

data = [100,203, 1, 3, 5, 7, 9, 4, 6, 8, 0]
hq.heapify(data)
hq.heappop(data)
hq.heappushpop(data, 40000)
data
'''
Merge sort
merge(*iterables)
'''

a = [2,4,6]
b = [1,3,5]
c = hq.merge(a, b)
# Output: [1, 2, 3, 4, 5, 6]
list(c)

'''
nlargest(n, iterable[, key]), nsmallest(n, iterable[, key])
Get the n largest / smallest values from a list.
'''

a = [1, 3, 4, 2]
# Output: [4, 3]
hq.nlargest(2, a)
# Output: [1, 2]
hq.nsmallest(2, a)
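
For completeness: since Python 3.5, heapq.merge() also accepts key and reverse arguments, so inputs that are each sorted under the same key (or each sorted in descending order) can be merged directly:

import heapq as hq

a = ['fig', 'pear', 'apple']   # each list is already sorted by length
b = ['kiwi', 'banana']
# Output: ['fig', 'pear', 'kiwi', 'apple', 'banana']
list(hq.merge(a, b, key=len))

# reverse=True merges inputs that are each sorted in descending order
# Output: [6, 5, 4, 3, 2, 1]
list(hq.merge([6, 4, 2], [5, 3, 1], reverse=True))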

Exemple #53
0
 def getNewsFeed(self, userId):
     tweets = heapq.merge(*(self.tweets[u] for u in self.followees[userId] | {userId}))
     return [t for _, t in itertools.islice(tweets, 10)]
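The ten-most-recent behaviour assumes each per-user list in self.tweets is already sorted so that heapq.merge() sees the newest tweets first. A hypothetical storage sketch consistent with that assumption (the class and method names here are illustrative, not taken from the original example):

import collections
import itertools

class Twitter(object):
    def __init__(self):
        self.timer = itertools.count(step=-1)   # decreasing global counter
        self.tweets = collections.defaultdict(collections.deque)
        self.followees = collections.defaultdict(set)

    def postTweet(self, userId, tweetId):
        # The newest tweet gets the smallest key and goes to the front, so
        # each deque stays sorted and heapq.merge() yields most-recent first.
        self.tweets[userId].appendleft((next(self.timer), tweetId))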
Exemple #54
0
def kth_smallest_v1(mat, k):
    return list(merge(*mat))[k-1]
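Materializing the whole merged list costs memory proportional to the matrix size; if only the k-th smallest value is needed, itertools.islice can stop after k items. A small variant (the name kth_smallest_v2 is made up here), assuming every row of mat is sorted:

from heapq import merge
from itertools import islice

def kth_smallest_v2(mat, k):
    # Consume only the first k merged items instead of building a full list.
    return next(islice(merge(*mat), k - 1, None))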
Exemple #55
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import config_default
from heapq import merge

configs = config_default.configs

try:
    import config_override
    # Note: heapq.merge() merges sorted iterables, so applied to two dicts it
    # yields their keys in sorted order rather than a combined configuration;
    # a dict-merge helper (see the sketch below) is what this step really needs.
    configs = merge(configs, config_override.configs)
except ImportError:
    pass
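
If the goal is to overlay config_override.configs on top of config_default.configs, a recursive dict merge is the closer fit. A minimal sketch under that assumption (merge_configs is a hypothetical helper, not part of either config module):

def merge_configs(defaults, override):
    # Recursively replace values in ``defaults`` with those from ``override``;
    # keys that appear only in ``override`` are ignored.
    merged = {}
    for key, value in defaults.items():
        if key in override:
            if isinstance(value, dict) and isinstance(override[key], dict):
                merged[key] = merge_configs(value, override[key])
            else:
                merged[key] = override[key]
        else:
            merged[key] = value
    return merged

# Usage under the same assumption:
# configs = merge_configs(config_default.configs, config_override.configs)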
Exemple #56
0
from heapq import merge
p, q = map(int, input("Enter the sizes of the two arrays: ").split())
a = list(map(int, input("Enter the elements of array A: ").split()))
b = list(map(int, input("Enter the elements of array B: ").split()))
c = list(merge(a, b))
print(*c)
Exemple #57
0
def merge(length, *sources):
    """Merge lists of lists.

    Each source produces (or contains) lists of ordered items.
    Items of each list must be greater or equal to all items of
    the previous list (that implies that items must be comparable).

    The function merges the sources into lists with the length
    equal to given one, except the last list which can be shorter.

    Example:
        it1 = iter([[1, 3, 5], [5, 7, 9, 14], [17, 21, 36, 41]])
        it2 = iter([[2, 2, 4], [9, 10], [16, 19, 23, 26, 91]])
        it3 = iter([[5], [5, 7, 11, 14, 14, 19, 23]])

        it = merge(10, it1, it2, it3)

        for i in it:
            print i

    prints out:
        [1, 2, 2, 3, 4, 5, 5, 5, 5, 7, 7, 9, 9, 10]
        [11, 14, 14, 14, 16, 17, 19, 19, 21, 23, 23]
        [26, 36, 41, 91]

    :param length: length of generated lists, except the last one.
    :param sources: generators that produce lists of items to merge.
    """

    streams = [{"data": [], "gen": src} for src in sources]

    out_chunk = []
    while True:
        while len(out_chunk) < length:

            # Smallest of the streams' rightmost (last) items
            lri = None

            # Refresh data if needed
            for s in streams:
                if s["gen"] and not s["data"]:
                    try:
                        while not s["data"]:
                            s["data"] = next(s["gen"])
                    except StopIteration:
                        s["gen"] = None

                # ... and keep track of the smallest rightmost item
                if s["data"]:
                    rightmost_item = s["data"][-1]
                    if (lri is None) or (rightmost_item < lri):
                        lri = rightmost_item

            # No more data to merge
            if lri is None:
                break

            to_merge = []
            for s in streams:
                if s["data"]:
                    pos = bisect.bisect_right(s["data"], lri)
                    to_merge.append(s["data"][:pos])
                    s["data"] = s["data"][pos:]

            out_chunk += heapq.merge(*to_merge)

        if out_chunk:
            if len(out_chunk) > length:
                yield out_chunk[:length]
                out_chunk = out_chunk[length:]
            else:
                yield out_chunk
                out_chunk = []
        else:
            return
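For comparison, when the sources are flat streams of individually sorted items (rather than streams of sorted lists), the same fixed-size chunking can be built directly on heapq.merge. A rough sketch (merge_chunked is a made-up name, and it does not implement the list-of-lists protocol above):

import heapq
import itertools

def merge_chunked(length, *sources):
    # Yield lists of at most ``length`` items drawn from sorted flat iterables.
    merged = heapq.merge(*sources)
    while True:
        chunk = list(itertools.islice(merged, length))
        if not chunk:
            return
        yield chunk

# Example: list(merge_chunked(4, [1, 3, 5], [2, 4, 6, 8]))
# -> [[1, 2, 3, 4], [5, 6, 8]]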
Exemple #58
0
 def _merge_time_lists(self, list1, list2):
     return [x for x in heapq.merge(list1, list2, key=lambda r: r[0])]
Exemple #59
0
 def __iter__(self):
     return heapq.merge(*self.iterables, key=self.key)
def merge_pesr_depth(pesr_vcf, depth_vcf, frac=0.8):
    # Memory inefficient but it's easier and shouldn't matter too much
    # now that the variants have been filtered down
    records = dict()
    records['pesr'] = {record.id: record for record in pesr_vcf}
    records['depth'] = {record.id: record for record in depth_vcf}

    # Wipe MEMBERS from prior clustering
    for source in 'pesr depth'.split():
        for ID, record in records[source].items():
            record.info['MEMBERS'] = [ID]

    # Reset for bedtool creation
    pesr_vcf.reset()
    depth_vcf.reset()
    pesr_bed = svu.vcf2bedtool(pesr_vcf, split_bnd=False,
                               include_strands=False)
    depth_bed = svu.vcf2bedtool(depth_vcf, split_bnd=False,
                                include_strands=False)

    # Merge depth records with PE/SR records if they share 80% recip overlap
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, r=True, f=frac)

    filtered_depth_IDs = deque()
    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Add depth record's samples to PE/SR
        filtered_depth_IDs.append(depth_id)
        pesr_record = records['pesr'][pesr_id]
        depth_record = records['depth'][depth_id]

        # Update metadata and samples
        pesr_record.info['MEMBERS'] = (pesr_record.info['MEMBERS'] +
                                       (depth_record.id, ))
        pesr_record.info['SOURCES'] = pesr_record.info['SOURCES'] + ('depth', )
        add_samples(pesr_record, depth_record)

    # Remove overlapping depth records (not performed in for loop to account
    # for double overlaps)
    # TODO: handle double overlap of depth calls
    for ID in set(filtered_depth_IDs):
        records['depth'].pop(ID)

    # In remaining depth-only calls, add samples to PE/SR record if the
    # record covers 90% of the depth-only call.
    sect = pesr_bed.intersect(depth_bed, wa=True, wb=True, F=0.9)

    for pair in sect.intervals:
        # Check SV types match
        if pair.fields[4] != pair.fields[9]:
            continue

        pesr_id, depth_id = pair.fields[3], pair.fields[8]

        # Skip depth records we already added with 80% reciprocal
        if depth_id in filtered_depth_IDs:
            continue

        # If sample is in both depth record and pe/sr record, remove it from
        # depth record
        depth_record = records['depth'][depth_id]
        pesr_record = records['pesr'][pesr_id]

        merge_nested_depth_record(pesr_record, depth_record)

    # Merge records together
    def _sort_key(record):
        return (record.chrom, record.pos, record.info['CHR2'], record.stop)

    pesr_records = sorted(records['pesr'].values(), key=_sort_key)
    depth_records = sorted(records['depth'].values(), key=_sort_key)
    for record in heapq.merge(pesr_records, depth_records, key=_sort_key):
        # Clean out unwanted format keys
        for key in record.format.keys():
            if key != 'GT':
                del record.format[key]

        record.info['SOURCES'] = sorted(set(record.info['SOURCES']))
        record.info['MEMBERS'] = sorted(set(record.info['MEMBERS']))

        # Skip emptied depth records
        if len(svu.get_called_samples(record)) == 0:
            continue

        yield record
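
The final loop above works because both record lists were pre-sorted with the same _sort_key, so heapq.merge() can interleave them without re-sorting the concatenation. A toy illustration of that pattern, with plain dicts standing in for VCF records (names and values here are invented):

import heapq

def sort_key(record):
    return (record['chrom'], record['pos'])

pesr_records = sorted([{'chrom': 'chr1', 'pos': 500, 'src': 'pesr'},
                       {'chrom': 'chr2', 'pos': 100, 'src': 'pesr'}], key=sort_key)
depth_records = sorted([{'chrom': 'chr1', 'pos': 250, 'src': 'depth'},
                        {'chrom': 'chr2', 'pos': 900, 'src': 'depth'}], key=sort_key)

for rec in heapq.merge(pesr_records, depth_records, key=sort_key):
    print(rec['chrom'], rec['pos'], rec['src'])
# chr1 250 depth / chr1 500 pesr / chr2 100 pesr / chr2 900 depth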