def test_union():
    """Union of two disjoint SortedSets is ordered and leaves both operands unchanged."""
    lower = SortedSet(range(0, 50), load=7)
    upper = SortedSet(range(50, 100), load=9)
    merged = lower.union(upper)
    # The union holds 0..99 in order, so position and value coincide.
    for position in range(100):
        assert merged[position] == position
    # Neither operand was modified by the union.
    for position in range(50):
        assert lower[position] == position
    for position in range(50):
        assert upper[position] == position + 50
Example #2
0
def parseKeywords(
        question_text):  #takes question raw text string -> set of keywords
    """Extract the keywords of a question.

    The result is the union of two groups:
      rareWords: all-lowercase words that are not in the pickled common-word
                 list (punctuation and digits are stripped first).
      capWords:  capitalized words that do not directly follow ". " and
                 whose lowercase form is not in the pickled trivial-word list.

    Params:
      question_text....The raw question text.
    Returns:
      A SortedSet of keyword strings.
    """
    # NOTE(review): pickle.load must only be used on trusted files; these
    # paths are relative to the current working directory.
    with open('./common_word_removal/commonWords.p', 'rb') as common_file:
        commonWords = pickle.load(common_file)  #top 7500 words
    with open('./common_word_removal/trivialWords.p', 'rb') as trivial_file:
        trivialWords = pickle.load(
            trivial_file)  #top 48 words plus prompt and accept

    digit_table = str.maketrans('', '', '0123456789')
    punctuation_table = str.maketrans('', '', string.punctuation)

    #regex gets rid of all things in [] or (), like pronunciation
    formattedQuestion = re.sub(r"[\(\[].*?[\)\]]", " ", question_text)
    formattedQuestion = formattedQuestion.translate(digit_table).replace(
        "\n", " ")

    plain_words = formattedQuestion.translate(punctuation_table).strip().split(
        " ")
    rareWords = SortedSet(
        word for word in plain_words
        if (word not in commonWords and word.islower()))
    capWords = SortedSet(
        word
        for word in re.findall(r'(?<!\.\s)\b[A-Z][a-z]*\b', formattedQuestion)
        if word.lower() not in trivialWords)

    return capWords.union(rareWords)
def test_union():
    """Check SortedSet.union on two disjoint ranges built with different loads."""
    first_half = SortedSet(range(0, 50), load=7)
    second_half = SortedSet(range(50, 100), load=9)
    combined = first_half.union(second_half)
    # Indexing the union yields 0..99 in ascending order.
    for index in range(100):
        assert combined[index] == index
    # The inputs themselves are left as they were.
    for index in range(50):
        assert first_half[index] == index
    for index in range(50):
        assert second_half[index] == index + 50
Example #4
0
def get_links(names, html):
    """
    Return a SortedSet of computer scientist names that are linked from this
    html page. The return set is restricted to those people in the provided
    set of names.  The returned list should contain no duplicates.

    Params:
      names....A SortedSet of computer scientist names, one per filename.
      html.....A string representing one html page.
    Returns:
      A SortedSet of names of linked computer scientists on this html page, restricted to
      elements of the set of provided names.

    >>> get_links({'Gerald_Jay_Sussman'},
    ... '''<a href="/wiki/Gerald_Jay_Sussman">xx</a> and <a href="/wiki/Not_Me">xx</a>''')
    SortedSet(['Gerald_Jay_Sussman'], key=None, load=1000)
    """
    # NOTE(review): the last alternative `/d+_` can never match because the
    # plain `/` alternative is tried first at the same position; presumably
    # `\d+_` was intended - confirm before changing the pattern.
    separators = '|'.join(['/', '=', '&', ':', '/d+_'])

    found = SortedSet()
    bs = BeautifulSoup(html, "html.parser")
    for link in bs.find_all('a'):
        href = link.get('href')
        if not href:
            # Anchors without a (non-empty) href cannot link to anybody.
            continue
        # A name counts as linked when it equals one whole href component.
        components = {part for part in re.split(separators, href) if part != ''}
        found.update(name for name in names if name in components)
    return found
Example #5
0
def Apriori(Db,I, min_sup):
    '''
    This is the main Apriori function that calculates all frequent itemsets from Db (set of Transactions).
    :param Db: set of Transactions
    :param I:  set of items used.
    :param min_sup: minimum support, given as a fraction of len(Db)
    :return: The set of Itemset objects
    '''

    # Turn the relative support into an absolute transaction count.
    min_sup = len(Db)*min_sup

    def itemset_order(x):
        # Order itemsets by size first, then by their item list.
        return (len(x.set._list), x.set._list)

    L = first_freq_itemset(Db, I, min_sup)
    final_L = SortedSet(L, key=itemset_order)

    # Each round derives candidates from the previous frequent itemsets and
    # keeps those contained in at least min_sup transactions.
    while len(L) != 0:
        C = apriori_gen(Db, L, I, min_sup)
        L = SortedSet([], key=itemset_order)
        for itemset_c in C:
            coun = sum(
                1 for trans_d in Db
                if trans_d.is_superset_of(itemset_c, is_proper=False))
            if coun >= min_sup:
                L.add(itemset_c)

        final_L = final_L.union(L)

    return final_L
Example #6
0
def read_data(st):
    '''
    Read a transaction database from a text file.

    Every line of the file is one transaction: integer items separated by
    single spaces.

    :param st: path of the file to read
    :return: (Db, I) where Db is a SortedSet of Transaction objects ordered by
             their Tid (the 0-based line number) and I is the item universe
    '''

    # The file is only read, so plain read mode is enough; the context manager
    # guarantees the handle is closed.
    with open(st, 'r') as fileo:
        lis_of_lis = fileo.readlines()

    item_sets = [
        set(int(itemm) for itemm in item.strip().split(' '))
        for item in lis_of_lis
    ]
    Db_lis = [
        Transaction(seth=SortedSet(item), Tid=num)
        for num, item in enumerate(item_sets)
    ]
    Db = SortedSet(Db_lis, key=lambda x: x.Tid)

    # NOTE(review): the item universe is hard-coded to 0..99. The original
    # code also collected the items actually present in the file but then
    # overwrote that result with this range; confirm which one is intended.
    I = SortedSet(range(100))
    return Db, I
def expand_episode_list(str_list):
    """Expand a comma-separated episode specification into a SortedSet.

    Purely numeric entries are added as integers; every other entry is handed
    to resolve_hyphenated_range and its result is merged into the set.
    """
    episodes_to_download = SortedSet()
    for raw_entry in str_list.strip().split(Constants.COMMA):
        entry = raw_entry.strip()
        if not entry.isdigit():
            expanded = resolve_hyphenated_range(entry)
            episodes_to_download = episodes_to_download.union(expanded)
            continue
        episodes_to_download.add(int(entry))
    return episodes_to_download
    def getSigmondInputs(self):
        """Create, write and return the sigmond inputs for this task.

        One input is produced per channel that still has operators left after
        removing the operators belonging to any operator set and the excluded
        operators, followed by one input per operator set.

        Returns:
            The list of sigmond input objects, each already written via
            its ``write()`` method.
        """
        sigmond_inputs = list()

        # Operators covered by an operator set are plotted with that set, not
        # with their individual channel.
        all_operator_set_ops = SortedSet()
        if self.operator_sets:
            all_operator_set_ops = SortedSet.union(
                *[op_set.operators for op_set in self.operator_sets])

        for channel in self.channels:
            operators = [
                op for op in self.data_handler.getChannelOperators(channel)
                if op not in all_operator_set_ops
                and op not in self.excluded_operators
            ]
            if not operators:
                continue

            data_files = self.data_files + self.data_handler.getChannelDataFiles(
                channel)
            project_name = self.project_name(repr(channel))
            logfile = self.logfile(repr(channel))
            inputfile = self.inputfile(repr(channel))
            sigmond_input = self.new_sigmond_input(project_name, inputfile,
                                                   logfile, data_files)

            self.insertSigmondPlotTasks(sigmond_input, repr(channel),
                                        operators)

            sigmond_input.write()
            sigmond_inputs.append(sigmond_input)

        for operator_set in self.operator_sets:
            operators = operator_set.operators
            data_files = self.data_files
            for channel in operator_set.channels:
                # Use `+` rather than `+=`: an in-place add would modify
                # self.data_files itself, so files would pile up across
                # operator sets and across calls.
                data_files = data_files + self.data_handler.getChannelDataFiles(
                    channel)

            project_name = self.project_name(operator_set.name)
            logfile = self.logfile(operator_set.name)
            inputfile = self.inputfile(operator_set.name)
            sigmond_input = self.new_sigmond_input(project_name, inputfile,
                                                   logfile, data_files)

            self.insertSigmondPlotTasks(sigmond_input, operator_set.name,
                                        operators)

            sigmond_input.write()
            sigmond_inputs.append(sigmond_input)

        return sigmond_inputs
Example #9
0
class Kernel:
    """A set of LR items together with its closure under a grammar.

    Attributes set by the constructor:
        index:    identifier of this kernel.
        lr_items: the items the kernel was created from.
        closure:  SortedSet holding lr_items plus every item reachable from
                  them through ``get_populated_items(grammar)``.
        gotos:    initially empty dict.
        keys:     initially empty set.
    """

    def __init__(self, index, lr_items, grammar):
        self.index = index
        self.lr_items = lr_items
        self.closure = SortedSet(lr_items)
        self.gotos = {}
        self.keys = set()
        self.compute_closure(grammar)

    def compute_closure(self, grammar):
        """Extend self.closure in place until no item contributes new items."""
        stack = deque(self.closure)
        while stack:
            actualElement = stack.pop()
            for item in actualElement.get_populated_items(grammar):
                if item not in self.closure:
                    self.closure.add(item)
                    # Newly found items may themselves produce more items.
                    stack.append(item)

    def get_detailed_string(self):
        """Return '<index>. closure { items } = { closure }' using str() of each item."""
        kernel_part = ', '.join(str(item) for item in self.lr_items)
        closure_part = ', '.join(str(item) for item in self.closure)
        return f'{self.index}. closure {{ {kernel_part} }} = {{ {closure_part} }}'

    def __eq__(self, other):
        # Two kernels are equal when they were built from equal lr_items.
        if other is None:
            return False
        return not self.lr_items != other.lr_items

    def __str__(self):
        # Items are rendered with the __str__ of LALRItem's parent class.
        kernel_part = ', '.join(
            super(LALRItem, item).__str__() for item in self.lr_items)
        closure_part = ', '.join(
            super(LALRItem, item).__str__() for item in self.closure)
        return 'closure { ' + kernel_part + ' } = { ' + closure_part + ' }'

    def __hash__(self):
        # NOTE(review): the hash covers the closure text while __eq__ only
        # compares lr_items; presumably equal lr_items always yield equal
        # closures - confirm.
        return hash(str(self))

    def __lt__(self, other):
        if other is None:
            return False
        return str(self) < str(other)
Example #10
0
class Hamming(object):
    """Turns per-object value lists into 0/1 presence vectors and compares them.

    ``self._all`` is the collection of all known values (the "permissions");
    ``self.key`` is the key under which every object of an iterable stores its
    own list of values, i.e. objects are accessed as ``obj[self.key]``.
    """

    def __init__(self, iterable=None, tnp=None, key=''):
        """Inits Hamming class
        Args:
            iterable: currently unused.
            tnp: total permissions in already sorted order
            it is best to have this class generate it for you.
            key: key used to read the values of each object; must not be None.
        """
        # _all holds all permissions in string
        self._all = tnp or set()
        self.key = None
        # key associated to hamming
        self._set_key(key)

    @staticmethod
    def _sum_cols(m):
        """Sum of all the columns in a matrix"""
        return [sum(x) for x in zip(*m)]

    @staticmethod
    def _hamming_distance(s1, s2):
        """Number of positions at which s1 and s2 differ (compared up to the shorter one)."""
        # cardinality of sets must be equal
        return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))

    @staticmethod
    def _nCr(n, r):
        """Number of iterations that will take place for
        a given number of combinations and cardinality

        Args:
            n: cardinality of set
            r: number of combinations

        return:
           number of iterations
        """
        f = math.factorial
        # Integer division keeps the result exact; true division would return
        # a float and overflow or lose precision for large n.
        return f(n) // f(r) // f(n-r)

    def _set_key(self, key):
        """Set the key to be used when iterating though iterable"""
        if key is not None:
            # iterable will be treated as iterable[i]['key']
            # e.g. [{'key':[]}, {'key':[]}, {'key':[]}]
            self.key = key
        elif self.key is None:
            # iterable would be treated as iterable[i]
            # e.g. ["elem", "elem", "elem"]
            raise Exception("Not yet implemented.")
        # Otherwise a key is already set and None was passed: keep the old key.

    def _overlay(self, values_array):
        """overlay is the same as a boolean matrix data structure
        every value is checked and outputted in 1d list
        in which a[i] represents the presence of the i-th element of self._all
        """
        return [1 if x in values_array else 0 for x in self._all]

    def get_list(self):
        """Return the known values as a new list."""
        return [p for p in self._all]

    def map_names(self, arr):
        """Maps HammingSet to its named key"""
        if self._all is None:
            raise Exception("Class does not contian permissions")
        return SortedDict(zip(self._all, arr))

    def accumulate(self, iterable, key=None):
        """Accumulate permissions and, transform

        Args:
            key: needs to be a key of iterable[i] which
            also needs to be iterable to be able to generate
            a set of unique elements for all iterable[i][key].
            For example:
                iterable[0]['key'] = ["val", "other"]
        """
        self._set_key(key)

        # set().union(...) also works for an empty iterable, and wrapping
        # self._all in set() accepts a list passed as tnp.
        tmp = set().union(*(x[self.key] for x in iterable))
        self._all = SortedSet(set(self._all).union(tmp))

    def bin_transform(self, iterable, key=None, out_file=None):
        """Take a collection of objects. Returns a new iterable.

        Args:
            key: needs to be a key of iterable[i] which
            also needs to be iterable to be able to generate
            a set of unique elements for all iterable[i][key].
            For example:
                iterable[0]['key'] = ["val", "other"]
            out_file: currently unused.
        """
        self._set_key(key)
        che = copy.deepcopy(iterable)

        return self._bin_t(che)

    def bin_transform_inplace(self, iterable, key=None, out_file=None):
        """Take a collection of objects. Changes iterable in place.

        Args:
            key: needs to be a key of iterable[i] which
            also needs to be iterable to be able to generate
            a set of unique elements for all iterable[i][key].
            For example:
                iterable[0]['key'] = ["val", "other"]
            out_file: currently unused.
        """
        # Honour the key argument exactly like bin_transform does.
        self._set_key(key)
        # The return value is the very same (now modified) iterable.
        return self._bin_t(iterable)

    def _bin_t(self, iterable):
        """Replace obj[self.key] of every object by its 0/1 presence vector."""
        # NOTE: this is manipulating the argument's VALUE; pass a deep copy
        # if the original iterable has to be preserved.
        for obj in iterable:
            # bool transform
            obj[self.key] = self._overlay(obj[self.key])

        return iterable

    def sums(self, iterable, key=None):
        """Sum of all m's (m by n matrix). aka vertical sum. This iters
        over generated boolean matrices and counts... if you sum your
        objects and this number is different (esp. less) then you have
        duplicate items.

        Returns:
            A 1 by n matrix such that each column is the sum
            of all previous rows in that column. For example
            sum([[1, 2], [2, 3]])
            = [3, 5]
        """
        self._set_key(key)

        # get all the objects and create matrix
        totals = [obj[self.key] for obj in iterable]

        return self._sum_cols(totals)

    def hamming_dist(self, iterable, threshhold, key=None):
        """Get the average pairwise hamming distance of objects

        Args:
            threshhold: is the number of objects to use
            when averaging the hamming distance; None uses all of them.

        Returns:
            The mean distance over all pairs of the selected objects, or 0.0
            when fewer than two objects are selected (there is no pair).

        Raises:
            IndexError: threshhold exceeds len(iterable).
        """
        self._set_key(key)

        if threshhold is None:
            threshhold = len(iterable)
        elif threshhold > len(iterable):
            raise IndexError("Threshhold is too large")

        # take threshhold objects from the front and get their arrays
        vectors = [x[self.key] for x in iterable[:threshhold]]

        dist = 0.0
        # more efficient to just count the pairs than to compute nCr
        pair_count = 0
        for left, right in itertools.combinations(vectors, 2):
            dist += self._hamming_distance(left, right)
            pair_count += 1

        if pair_count == 0:
            return 0.0
        return dist / pair_count
Example #11
0
class Selection(IMutableGSlice):
    def __init__(
            self,
            universe: slice,
            revealed: list = None,
            intervals: Iterator = None,
            _length: Optional[int] = None  # For performance
    ):
        """Create a selection over @universe.

        @intervals, when given, is taken as the ready-made boundary values and wins over @revealed.
        Otherwise the boundaries are derived from @revealed; with neither argument the whole range
        from 0 to universe.stop is revealed. An int @_length is trusted as the revealed count instead
        of computing it from the boundaries.
        """
        self.universe = universe
        if intervals is not None:
            boundaries = SortedSet(intervals)
        elif revealed is not None:
            boundaries = self.revealed2sortedset(revealed)
        else:
            boundaries = self.revealed2sortedset([slice(0, universe.stop)])
        self._intervals = boundaries
        if isinstance(_length, int):
            self._revealed_count = _length
        else:
            self._revealed_count = Selection._compute_len(self._intervals)

    @staticmethod
    def revealed2sortedset(revealed: List[Union[tuple, slice]]) -> SortedSet:
        """ Converts a list of included pairs or slices to a sorted set of their boundary integers.
        Both ends of every entry are added to the sorted set, except 0.
        Entries may be 2-element pairs or slices (also mixed within one list).
        """
        # 10, [] -> 10, []
        # 10, [(0, 10)] -> 10, [10]
        # 10, [(0, 7)] -> 10, [7]
        # 10, [(7, 10)] -> 10, [7, 10]
        # 10, [(3, 7)] -> 10, [3, 7]
        # 10, [(0, 3), (7, 10)] -> 10, [3, 7, 10]
        # 10, [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9)] -> 10, [1, 2, 3, 4, 5, 6, 7, 8, 9]

        intervals = SortedSet()
        for entry in revealed:
            # Decide per entry instead of relying on a TypeError from unpacking a slice,
            # which breaks as soon as pairs and slices are mixed.
            if isinstance(entry, slice):
                start, stop = entry.start, entry.stop
            else:
                start, stop = entry
            intervals.add(start)
            intervals.add(stop)
        # A boundary at 0 is implicit and never stored.
        intervals.discard(0)
        return intervals

    @staticmethod
    def sortedset2slices(sortedset: SortedSet) -> List[slice]:
        """ Converts a sorted set of boundary integers to the list of included slices.
        Even number of elements: consecutive elements are paired up (1st with 2nd, 3rd with 4th, ...).
        Odd number of elements: the first slice runs from 0 to the first element, and the remaining
        elements are paired up starting with the second one (2nd with 3rd, 4th with 5th, ...).
        """
        if len(sortedset) % 2 == 0:
            starts, stops = sortedset[::2], sortedset[1::2]
            return [slice(start, stop) for start, stop in zip(starts, stops)]
        # Odd count: the leading boundary at 0 is implicit.
        leading = [slice(0, sortedset[0])]
        starts, stops = sortedset[1::2], sortedset[2::2]
        return leading + [slice(start, stop) for start, stop in zip(starts, stops)]

    def slices(self) -> List[slice]:
        """Return the stored boundaries converted to a list of slices via sortedset2slices."""
        boundaries = self._intervals
        return self.sortedset2slices(boundaries)

    def pairs(self) -> Iterator[Tuple[int, int]]:
        """Iterate over (start, stop) pairs formed from the stored boundaries.

        With an odd number of boundaries the first pair starts at the implicit boundary 0.
        """
        boundaries = self._intervals
        if len(boundaries) % 2 != 0:
            leading = [(0, boundaries[0])]
            remaining = zip(boundaries[1::2], boundaries[2::2])
            return itertools.chain(leading, remaining)
        return zip(boundaries[::2], boundaries[1::2])

    def gap_pairs(self) -> Iterator[Tuple[int, int]]:
        """Iterate over the (start, stop) pairs of the complement of this selection."""
        inverted = self.complement()
        return inverted.pairs()

    def intervals(self):
        """Return the internal sorted set of boundaries itself (not a copy)."""
        boundaries = self._intervals
        return boundaries

    def exclude(self, from_index: Optional[int], to_index: Optional[int]):
        """Cover the physical range from @from_index up to @to_index and update the stored boundaries.

        Negative indices in [-universe.stop, 0) are wrapped modulo universe.stop. None stands for
        universe.start (@from_index) or universe.stop (@to_index); a @to_index greater than universe.stop
        is handled like None. Nothing is changed (0 is returned) when no boundaries are stored or when
        @from_index >= @to_index.

        :return The decrease of self._revealed_count caused by this call.
        """
        original_length = self._revealed_count
        if isinstance(from_index, int) and -self.universe.stop <= from_index < 0:
            from_index = from_index % self.universe.stop
        if isinstance(to_index, int):
            if to_index > self.universe.stop:
                return self.exclude(from_index, None)
            if -self.universe.stop <= to_index < 0:
                to_index = to_index % self.universe.stop
        assert from_index is None or self.universe.start <= from_index <= self.universe.stop
        assert to_index is None or self.universe.start <= to_index <= self.universe.stop
        if from_index is None:
            from_index = self.universe.start
        if to_index is None:
            to_index = self.universe.stop
        if len(self._intervals) == 0:
            return 0
        if from_index >= to_index:
            return 0

        # Insertion points to the right of any equal boundary, i.e. how many stored boundaries
        # are <= from_index / to_index.
        m = self._intervals.bisect_right(from_index)
        n = self._intervals.bisect_right(to_index)

        # Position of each index among the stored boundaries, or None if it is not itself a boundary.
        try:
            from_index_index = self._intervals.index(from_index)
        except ValueError:
            from_index_index = None
        try:
            to_index_index = self._intervals.index(to_index)
        except ValueError:
            to_index_index = None
        # Parity test: an odd number of boundaries implies an implicit leading boundary at 0
        # (cf. sortedset2slices), which flips the parity of m / n that marks an included index.
        from_index_is_included = (
            len(self._intervals) % 2 == 0 and m % 2 == 1 or len(self._intervals) % 2 == 1 and m % 2 == 0)
        to_index_is_included = (
            len(self._intervals) % 2 == 0 and n % 2 == 1 or len(self._intervals) % 2 == 1 and n % 2 == 0)
        # True when from_index coincides with the start boundary of an included run.
        from_index_is_leftmost_included = from_index == 0 and from_index_is_included or from_index_index is not None and (
                len(self._intervals) % 2 == 0 and from_index_index % 2 == 0
                or len(self._intervals) % 2 == 1 and (from_index == 0 or from_index_index % 2 == 1))
        # True when to_index coincides with a boundary whose position has the opposite parity.
        to_index_right_of_excluded = to_index_index is not None and (
                len(self._intervals) % 2 == 0 and to_index_index % 2 == 1
                or len(self._intervals) % 2 == 1 and (to_index == 0 or to_index_index % 2 == 0))

        # Each branch below removes / adds boundaries for one combination of the flags above and
        # subtracts the number of newly covered elements from self._revealed_count.
        if from_index_is_included:
            if from_index_is_leftmost_included:
                if to_index_is_included:
                    if m == 0:
                        # No stored boundary is <= from_index.
                        to_remove = self._intervals[m:n]
                        endpoint = 0 if n == 0 else self._intervals[n - 1]
                        addendum = 0 if n == 0 else self._intervals[0]
                        self._revealed_count -= (to_index - endpoint) + addendum + sum(
                            b - a for a, b in zip(to_remove[1:-1:2], to_remove[2:-1:2]))
                        del self._intervals[m:n]
                        self._intervals.add(to_index)
                    else:
                        intermediates = self._intervals[m + 1:n - 1]
                        from_start, from_end = self._intervals[m - 1], self._intervals[m]
                        to_start, to_end = self._intervals[n - 1], self._intervals[n]
                        if m == n:
                            # Both indices fall between the same pair of boundaries.
                            self._revealed_count -= to_index - from_start
                            self._intervals.remove(from_start)
                            self._intervals.add(to_index)
                        else:
                            self._revealed_count -= (from_end - from_start) + (to_index - self._intervals[n - 1]) + (
                                from_index - from_start) + sum(
                                b - a for a, b in zip(intermediates[::2], intermediates[1::2]))
                            del self._intervals[m + 1:n - 1]  # intermediates
                            self._intervals.remove(from_start)
                            self._intervals.remove(from_end)
                            self._intervals.remove(to_start)
                            self._intervals.add(to_index)
                else:
                    from_start = 0 if m == 0 else self._intervals[m - 1]
                    from_end = self._intervals[m]
                    self._revealed_count -= from_end - from_start
                    # The boundary at 0 is implicit and never stored, so it cannot be removed.
                    if from_start > 0:
                        self._intervals.remove(from_start)
                    self._intervals.remove(from_end)
            else:
                if to_index_is_included:
                    from_end = self._intervals[m]
                    to_start = self._intervals[n - 1]
                    if m == n:
                        # Both indices fall between the same pair of boundaries.
                        self._revealed_count -= to_index - from_index
                        if from_index > 0:
                            self._intervals.add(from_index)
                        self._intervals.add(to_index)
                    else:
                        intermediates = self._intervals[m + 1:n - 1]
                        self._revealed_count -= (from_end - from_index) + (to_index - to_start) + sum(
                            b - a for a, b in zip(intermediates[::2], intermediates[1::2]))
                        del self._intervals[m + 1:n - 1]  # intermediates
                        if from_index > 0:
                            self._intervals.add(from_index)
                        self._intervals.add(to_index)
                        self._intervals.remove(from_end)
                        self._intervals.remove(to_start)
                else:
                    to_remove = self._intervals[m:n]
                    self._revealed_count -= self._intervals[m] - from_index + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
                    del self._intervals[m:n]
                    if from_index != 0:
                        self._intervals.add(from_index)
        else:
            if to_index_is_included:
                if to_index_right_of_excluded:
                    to_remove = self._intervals[m:n - 1]
                    del self._intervals[m:n - 1]
                    self._revealed_count -= sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
                else:
                    to_remove = self._intervals[m:n]
                    del self._intervals[m:n]
                    self._intervals.add(to_index)
                    self._revealed_count -= (to_index - to_remove[0]) + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
            else:
                to_remove = self._intervals[m:n]
                del self._intervals[m:n]
                self._revealed_count -= sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))

        return original_length - self._revealed_count

    def exclude_virtual(self, from_index: Optional[int], to_index: Optional[int]):
        """Like exclude, but the range is given in virtual indices.

        An index that is None or outside [-len(self), len(self)) is passed on as None; any other index
        is translated with virtual2physical first.
        """
        def to_physical(virtual_index):
            size = len(self)
            if virtual_index is not None and -size <= virtual_index < size:
                return self.virtual2physical(virtual_index)
            return None

        physical_from = to_physical(from_index)
        physical_to = to_physical(to_index)
        return self.exclude(physical_from, physical_to)

    def include(self, from_index: Optional[int], to_index: Optional[int]):
        """Reveal the physical range from @from_index up to @to_index and update the stored boundaries.

        Negative indices in [-universe.stop, 0) are wrapped modulo universe.stop. None stands for
        universe.start (@from_index) or universe.stop (@to_index); a @to_index greater than universe.stop
        is handled like None. If @from_index == @to_index and boundaries exist, nothing is changed.

        :return The increase of len(self) caused by this call (to_index - from_index when no boundaries
        were stored before).
        """
        original_length = len(self)
        if isinstance(from_index, int) and -self.universe.stop <= from_index < 0:
            from_index = from_index % self.universe.stop
        if isinstance(to_index, int):
            if to_index > self.universe.stop:
                return self.include(from_index, None)
            if -self.universe.stop <= to_index < 0:
                to_index = to_index % self.universe.stop
        assert from_index is None or self.universe.start <= from_index <= self.universe.stop
        assert to_index is None or self.universe.start <= to_index <= self.universe.stop
        if from_index is None:
            from_index = self.universe.start
        if to_index is None:
            to_index = self.universe.stop
        if not self._intervals:
            # No boundaries stored yet: the requested range becomes the only stored run.
            # The boundary at 0 is implicit and never stored.
            if from_index > 0:
                self._intervals.add(from_index)
            self._intervals.add(to_index)
            self._revealed_count += to_index - from_index
            return to_index - from_index
        if from_index == to_index:
            return 0

        # Insertion points to the right of any equal boundary, i.e. how many stored boundaries
        # are <= from_index / to_index.
        m = self._intervals.bisect_right(from_index)
        n = self._intervals.bisect_right(to_index)

        # Position of from_index among the stored boundaries, or None if it is not itself a boundary.
        try:
            from_index_index = self._intervals.index(from_index)
        except ValueError:
            from_index_index = None

        # Parity test: an odd number of boundaries implies an implicit leading boundary at 0
        # (cf. sortedset2slices), which flips the parity of m / n that marks an included index.
        from_index_is_included = (
                len(self._intervals) % 2 == 0 and m % 2 == 1 or len(self._intervals) % 2 == 1 and m % 2 == 0)
        to_index_is_included = (
                len(self._intervals) % 2 == 0 and n % 2 == 1 or len(self._intervals) % 2 == 1 and n % 2 == 0)
        # True when from_index coincides with the stop boundary of an included run.
        from_index_right_of_included = from_index_index is not None and (
                len(self._intervals) % 2 == 0 and from_index_index % 2 == 1
                or len(self._intervals) % 2 == 1 and from_index_index % 2 == 0)

        # Each branch below removes / adds boundaries for one combination of the flags above and
        # adjusts self._revealed_count by the number of newly revealed elements.
        if from_index_is_included:
            if to_index_is_included:
                to_remove = self._intervals[m:n]
                del self._intervals[m:n]
                self._revealed_count += sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
            else:
                to_remove = self._intervals[m:n]
                del self._intervals[m:n]
                self._intervals.add(to_index)
                self._revealed_count += (to_index - to_remove[-1]) + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
        else:
            if to_index_is_included:
                if from_index_right_of_included:
                    to_remove = self._intervals[m - 1:n]
                    del self._intervals[m - 1:n]
                    self._revealed_count += sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))
                else:
                    to_remove = self._intervals[m:n]
                    del self._intervals[m:n]
                    self._intervals.add(from_index)
                    self._revealed_count += (to_remove[0] - from_index) + sum(b - a for a, b in zip(to_remove[1::2], to_remove[::2]))
            else:
                if from_index_right_of_included:
                    intermediates = self._intervals[m:n]
                    del self._intervals[m:n]  # intermediates
                    self._intervals.remove(from_index)
                    self._intervals.add(to_index)
                    self._revealed_count += (to_index - from_index) - sum(b - a for a, b in zip(intermediates[::2], intermediates[1::2]))
                else:
                    to_remove = self._intervals[m:n]
                    del self._intervals[m:n]
                    # The boundary at 0 is implicit and never stored.
                    if from_index > 0:
                        self._intervals.add(from_index)
                    self._intervals.add(to_index)
                    self._revealed_count += (to_index - from_index) - sum(b - a for a, b in zip(to_remove[::2], to_remove[1::2]))

        return len(self) - original_length

    def include_partially(self, from_index: Optional[int], to_index: Optional[int], count: Union[int, tuple]):
        """Reveal up to head_count elements from the left and up to tail_count from the right of the range.

        @count is either a (head_count, tail_count) pair or a single int used for both sides.
        :return The sum of the counts reported by the left and the right pass.
        """
        if isinstance(count, int):
            count = (count, count)
        head_count, tail_count = count
        from_left = self._include_partially_from_left(from_index, to_index, head_count)
        from_right = self._include_partially_from_right(from_index, to_index, tail_count)
        return from_left + from_right

    def _include_partially_from_left(self, from_index: int, to_index: int, count: int):
        """Reveal covered elements of the range, walking its gaps from left to right, until @count is reached.

        :return The number of elements accounted for; @count as soon as one gap can supply the rest.
        """
        if count == 0:
            return 0
        from_index, to_index = self._normalized_range(from_index, to_index)
        spanning = self._spanning_subslice(from_index, to_index)
        subsel = spanning.complement().subslice(from_index, to_index)

        remaining = count
        for gap_start, gap_stop in subsel.pairs():
            gap_size = gap_stop - gap_start
            if gap_size >= remaining:
                # This gap can supply everything still missing: reveal only its left part and stop.
                self.include(gap_start, gap_start + remaining)
                remaining = 0
                break
            self.include(gap_start, gap_stop)
            remaining -= gap_size
        return count - remaining

    def _include_partially_from_right(self, from_index: int, to_index: int, count: int):
        """Reveal covered elements of the range, walking its gaps from right to left, until @count is reached.

        :return The number of elements accounted for; @count as soon as one gap can supply the rest.
        """
        if count == 0:
            return 0
        from_index, to_index = self._normalized_range(from_index, to_index)
        spanning = self._spanning_subslice(from_index, to_index)
        subsel = spanning.complement().subslice(from_index, to_index)

        remaining = count
        gaps = list(subsel.pairs())
        for gap_start, gap_stop in reversed(gaps):
            gap_size = gap_stop - gap_start
            if gap_size >= remaining:
                # This gap can supply everything still missing: reveal only its right part and stop.
                self.include(gap_stop - remaining, gap_stop)
                remaining = 0
                break
            self.include(gap_start, gap_stop)
            remaining -= gap_size
        return count - remaining

    def include_expand(self, from_index: Optional[int], to_index: Optional[int], count: Union[int, Tuple[int, int]]):
        """Shrink the gaps inside the range by revealing elements at their edges.

        @count is a (head_count, tail_count) pair or one int used for both. For every gap, head_count
        elements are revealed from its right edge unless the gap ends at universe.stop, and tail_count
        elements from its left edge unless the gap starts at universe.start.
        :return The total of the counts reported by the partial includes.
        """
        if isinstance(count, int):
            count = (count, count)
        if count == (0, 0):
            return 0
        head_count, tail_count = count
        gaps = self.complement().subslice(from_index, to_index)
        total_revealed = 0
        for gap_start, gap_stop in gaps.pairs():
            touches_universe_end = not gap_stop < self.universe.stop
            if not touches_universe_end:
                total_revealed += self._include_partially_from_right(gap_start, gap_stop, head_count)
            touches_universe_start = not gap_start > self.universe.start
            if not touches_universe_start:
                total_revealed += self._include_partially_from_left(gap_start, gap_stop, tail_count)
        return total_revealed

    def _previous_slice(self, sl: slice):
        """ :return The revealed or covered slice immediately to the left of @sl.
        :raise ValueError if there is none. """
        if sl.start == self.universe.start:
            raise ValueError("There is no slice to the left of {}.".format(sl))
        # TODO O(n) -> O(1)
        zero_or_one = [s for s in self._intervals + self.complement()._intervals if s.stop == sl.start]
        if len(zero_or_one) == 1:
            return zero_or_one[0]
        else:
            raise ValueError("Slice not found: {}.".format(sl))

    def _next_slice(self, sl: slice):
        """ :return The revealed or covered slice immediately to the right of @sl.
        :raise ValueError if there is none. """
        if sl.stop == self.universe.stop:
            raise ValueError("There is no slice to the right of {}.".format(sl))
        # TODO O(n)
        zero_or_one = [s for s in self._intervals + self.complement()._intervals if s.start == sl.stop]
        if len(zero_or_one) == 1:
            return zero_or_one[0]
        else:
            raise ValueError("Slice not found: {}.".format(sl))

    def include_virtual(self, from_index, to_index):
        """ Like include(), but @from_index and @to_index are virtual indices (positions among the revealed
        elements). Each bound is translated with virtual2physical(); a bound that is None or lies outside
        [-len(self), len(self)) is passed on as None.

        :return Whatever include() returns for the translated bounds. """

        def to_physical(vindex):
            # Out-of-range or missing bounds become open-ended (None).
            if vindex is None or vindex < -len(self) or vindex >= len(self):
                return None
            return self.virtual2physical(vindex)

        physical_from = to_physical(from_index)
        physical_to = to_physical(to_index)
        return self.include(physical_from, physical_to)

    def include_partially_virtual(self, from_index: Optional[int], to_index: Optional[int], count: Union[int, tuple]):
        """ Like include_partially(), but @from_index and @to_index are virtual indices (positions among the
        revealed elements). Each bound is translated with virtual2physical(); a bound that is None or lies outside
        [-len(self), len(self)) is passed on as None. @count is forwarded unchanged.

        :return Whatever include_partially() returns for the translated bounds. """

        def to_physical(vindex):
            # Out-of-range or missing bounds become open-ended (None).
            if vindex is None or vindex < -len(self) or vindex >= len(self):
                return None
            return self.virtual2physical(vindex)

        physical_from = to_physical(from_index)
        physical_to = to_physical(to_index)
        return self.include_partially(physical_from, physical_to, count)

    # FIXME Inconsistent with reversed(selection). Should probably make this use the default implementation and instead
    # rewrite this one to iter_slices or something.
    def __iter__(self):
        for a, b in self.pairs():
            yield a, b  # FIXME should probably generate slices instead, or every index

    def complement(self):
        """ :return A new Selection over the same universe whose edge set is this selection's edge set with the
        universe's stop toggled: removed if it is the last edge, added otherwise. Its length is given as
        universe.stop - len(self). """
        edges = self._intervals
        universe_stop = self.universe.stop
        complement_length = universe_stop - len(self)
        if len(edges) >= 1 and edges[-1] == universe_stop:
            toggled = edges[:-1]
        else:
            toggled = edges.union([universe_stop])
        return Selection(universe=self.universe, intervals=toggled, _length=complement_length)

    def _normalized_range(self, from_index: Optional[int], to_index: Optional[int]) -> Tuple[int, int]:
        """ For any range [@from_index, @to_index) where the indices are either None or any integer, returns the
        equivalent range [x, y) such that either 0 <= x < y <= upper_bound or x = y = 0. The ranges are equivalent in
        the sense that when using them to slice this selection, they produce the same sub-selection. """
        if from_index is None or from_index <= -self.universe.stop:
            from_index = self.universe.start
        elif from_index > self.universe.stop:
            from_index = self.universe.stop
        elif -self.universe.stop <= from_index < 0:
            from_index = self.universe.stop - from_index

        if to_index is None or to_index >= self.universe.stop:
            to_index = self.universe.stop
        elif -self.universe.stop <= to_index < 0:
            to_index = self.universe.stop - to_index
        elif to_index < -self.universe.stop:
            to_index = self.universe.start

        if from_index >= to_index:
            from_index, to_index = (0, 0)
        return from_index, to_index

    def subslice(self, from_index: Optional[int], to_index: Optional[int]):
        """ :return A new Selection built from the revealed intervals that span [@from_index, @to_index), with the
        outermost intervals trimmed so that they do not extend beyond that range. The bounds are normalized first,
        so None and out-of-range values are accepted.

        The edge set of the spanning selection is edited in place and its `_revealed_count` is reduced by the width
        of every part that is cut away. """
        from_index, to_index = self._normalized_range(from_index, to_index)
        sel = self._spanning_subslice(from_index, to_index)
        if len(sel._intervals) % 2 == 0:
            # Even number of edges: the edges pair up as (start, stop).
            if len(sel) > 0:
                if sel._intervals[0] < from_index < sel._intervals[1]:
                    # @from_index falls strictly inside the first interval: move its start up to @from_index.
                    sel._revealed_count -= from_index - sel._intervals[0]
                    del sel._intervals[0]
                    sel._intervals.add(from_index)
                if sel._intervals[-2] < to_index < sel._intervals[-1]:
                    # @to_index falls strictly inside the last interval: move its stop down to @to_index.
                    sel._revealed_count -= sel._intervals[-1] - to_index
                    del sel._intervals[-1]
                    sel._intervals.add(to_index)
        else:
            # Odd number of edges: presumably the first edge closes an interval that starts at 0 (that is how
            # _compute_len treats an odd edge count) -- TODO confirm.
            if 0 < from_index < sel._intervals[0]:
                # Cut off the leading [0, @from_index) part by adding @from_index as an explicit start edge.
                sel._revealed_count -= from_index
                sel._intervals.add(from_index)
            if (len(sel._intervals) == 1 and to_index < sel._intervals[-1]
                    or len(sel._intervals) >= 2 and sel._intervals[-2] < to_index < sel._intervals[-1]):
                # @to_index falls before the last edge (and after the one before it, if any): replace the last
                # edge by @to_index.
                sel._revealed_count -= sel._intervals[-1] - to_index
                del sel._intervals[-1]
                sel._intervals.add(to_index)
        return sel

    def _spanning_subslice(self, from_index: int, to_index: int):
        """ :return A Selection whose set of revealed slices is a subset of that of this Selection such that every index
        in [from_index, to_index) is either on some slice in the subset, or on a gap. An empty range yields an empty
        Selection. The universe of the result is a deep copy of this selection's universe. """
        if from_index >= to_index:
            return Selection(universe=deepcopy(self.universe), intervals=[])

        edges = self._intervals
        lo = edges.bisect_right(from_index)
        if len(edges) % 2:
            # Odd number of edges: widen both cut points to odd positions (the lower one never below 0).
            hi = edges.bisect_right(to_index)
            first = max(0, lo - ((lo + 1) % 2))
            last = hi + ((hi + 1) % 2)
        else:
            # Even number of edges: widen both cut points to even positions so that whole pairs are kept.
            hi = edges.bisect_left(to_index)
            first = lo - (lo % 2)
            last = hi + (hi % 2)
        return Selection(universe=deepcopy(self.universe), intervals=edges[first:last])

    def _slow_subslice(self, from_index: Optional[int], to_index: Optional[int]):
        sel = self.deepcopy()
        if isinstance(from_index, int):
            sel.exclude(None, from_index)
        if isinstance(to_index, int):
            sel.exclude(to_index, None)
        return sel

    def _interval_index(self, pindex):
        """ :return n if the nth interval edge is the smallest number such that @pindex < n (zero-indexed). """
        lower = 0
        upper = len(self._intervals) - 1
        while lower <= upper:
            middle = (lower + upper) // 2
            midsl = self._intervals[middle]
            if pindex < midsl.start:
                upper = middle - 1
            elif midsl.stop <= pindex:
                lower = middle + 1
            else:  # midsl.start <= pindex < midsl.stop:
                return middle
        raise IndexError("{} is not in any interval.".format(pindex))

    def select(self, listlike):
        """ :return The pieces of @listlike addressed by slices(), joined together using an empty slice of
        @listlike as the separator. """
        # TODO only works for stringlike objects
        pieces = [listlike[part] for part in self.slices()]
        separator = listlike[0:0]
        return separator.join(pieces)

    def physical2virtual(self, pindex: int):
        """ :return The number of positions covered by pairs() that precede @pindex, i.e. the virtual index of
        the physical index @pindex.
        :raise IndexError if @pindex is not inside any of the pairs. """
        preceding = 0
        for start, stop in self.pairs():
            if start <= pindex < stop:
                return preceding + (pindex - start)
            preceding += stop - start
        raise IndexError("Physical index {} out of bounds for selection {}".format(pindex, self))

    # TODO: O(n) -> O(log(n)) (using another sorted set for cumulative lengths?)
    def virtual2physical(self, vindex: int):  # TODO -> virtualint2physical
        """ :return the integer n such that the @vindex'th revealed element is the nth element. If
        @vindex < 0, @vindex is interpreted as (number of revealed elements) + @vindex.
        :raise IndexError if @vindex is outside [-len(self), len(self)).
        """
        size = len(self)
        if vindex < -size:
            raise IndexError(
                "Got index {}, expected it to be within range [{},{})".format(vindex, -size, size))
        if vindex < 0:
            return self.virtual2physical(size + vindex)

        seen = 0  # number of revealed elements up to and including the current pair
        for start, stop in self.pairs():
            seen += stop - start
            if vindex >= seen:
                continue
            # The wanted element lies in this pair, (seen - vindex) positions before its stop.
            pindex = stop - (seen - vindex)
            if start <= pindex < stop:
                return pindex
            break
        raise IndexError("Virtual index {} out of bounds for selection {}".format(vindex, self))

    def virtual2physicalselection(self, vslice: slice) -> 'Selection':  # TODO -> virtualslice2physical
        """ :return the sub-Selection that is the intersection of this selection and @vslice.

        @vslice is a slice of virtual indices. Its start and stop are translated to the physical index of the first
        element (a) and of the last element (b); the result is built from a, the edges of this selection found
        between them, and b + 1. Only vslice.start and vslice.stop are read.

        :raise ValueError if vslice.start or vslice.stop falls in none of the handled ranges (e.g. a start below
        -len(self)).

        NOTE(review): for vslice.stop == -len(self) the lookup virtual2physical(vslice.stop - 1) is asked for an
        index below -len(self), which raises IndexError rather than producing an empty selection -- confirm
        whether that is intended. """
        if not self._intervals or vslice.stop == 0:
            return Selection(self.universe, revealed=[])
        # a: physical index of the first element of the result.
        if vslice.start is None:
            a = self.virtual2physical(0)
        elif -len(self) <= vslice.start < len(self):
            a = self.virtual2physical(vslice.start)
        elif vslice.start >= len(self):
            # Start beyond the last revealed element: use the last edge.
            a = self._intervals[-1]
        else:
            raise ValueError("Unexpected slice start: {}".format(vslice))
        # b: physical index of the last element of the result (inclusive, hence the "- 1" below).
        if vslice.stop is None or vslice.stop >= len(self):
            b = self._intervals[-1] - 1
        elif -len(self) <= vslice.stop < len(self):
            b = self.virtual2physical(vslice.stop - 1)
        else:
            raise ValueError("Unexpected slice stop: {}".format(vslice))
        # INV: a is the physical index of the first element, b is the physical index of the last element
        if b < a:
            return Selection(universe=self.universe, revealed=[])
        # Keep the existing edges located after a and up to b, enclosed by the new outer edges a and b + 1.
        m = self._intervals.bisect_right(a)
        n = self._intervals.bisect_right(b)
        intervals = SortedSet([a] + self._intervals[m:n] + [b + 1])
        return Selection(universe=self.universe, intervals=intervals)

    def virtualselection2physical(self, vselection: 'Selection'):  # TODO -> virtualslice2physical
        """ :return the sub-Selection that is the intersection of this selection and @vselection.

        Every (start, stop) pair obtained by iterating @vselection is treated as a slice of virtual indices and
        translated with virtual2physicalselection(); the resulting physical pairs are collected as revealed
        slices of the returned Selection. """
        revealed = [
            slice(physical_start, physical_stop)
            for virtual_start, virtual_stop in vselection
            for physical_start, physical_stop
            in self.virtual2physicalselection(slice(virtual_start, virtual_stop))
        ]
        return Selection(universe=self.universe, revealed=revealed)

    def stretched(self, from_index: Optional[int], to_index: Optional[int]):  # TODO remove?
        """ :return A new, potentially shrunken selection delimited by the universe [@from_index, @to_index). It
        keeps the edges of this selection located after @from_index and up to @to_index (both cut points are
        found with bisect_right). """
        edges = self._intervals
        first = edges.bisect_right(from_index)
        last = edges.bisect_right(to_index)
        new_universe = slice(from_index, to_index)
        return Selection(universe=new_universe, intervals=edges[first:last])

    def __getitem__(self, item):
        """ :return The result of virtual2physical(@item), so selection[i] is the physical index of the i'th
        revealed element. """
        return self.virtual2physical(item)

    @staticmethod
    def _compute_len(sortedset: SortedSet):
        """ :return The sum of the lengths of every slice in @slicelist. """
        if len(sortedset) == 0:
            return 0
        elif len(sortedset) % 2 == 0:
            return sum(sortedset[i + 1] - sortedset[i] for i in range(0, len(sortedset), 2))
        return sortedset[0] + sum(sortedset[i + 1] - sortedset[i] for i in range(1, len(sortedset), 2))

    def __len__(self):
        """ :return The stored count of revealed elements (`_revealed_count`); nothing is recomputed here. """
        return self._revealed_count

    def __eq__(self, other):
        """ :return True iff repr(self) equals repr(@other), i.e. the comparison is made on the textual
        representations rather than on the attributes directly. """
        return repr(self) == repr(other)

    def __mul__(self, other: int):
        """ :return A new Selection in which the universe bounds and every edge have been multiplied by @other.
        Multiplying by 0 gives an empty Selection over the universe [0, 0). """
        if other == 0:
            return Selection(universe=slice(0, 0), revealed=[])
        universe = self.universe
        return Selection(
            universe=slice(universe.start * other, universe.stop * other),
            intervals=[other * edge for edge in self._intervals],
        )

    def __rmul__(self, other):
        """ Reflected multiplication (`other * selection`); delegates to __mul__ so both operand orders give the
        same result. """
        return self.__mul__(other)

    def __repr__(self):
        """ :return A string of the form ClassName(universe=..., intervals=...), showing the universe and the
        `_intervals` edge collection. """
        return "{}(universe={}, intervals={})".format(self.__class__.__name__, self.universe, self._intervals)

    def __str__(self):
        """ :return The same text as repr(self). """
        return repr(self)

    def deepcopy(self):
        """ :return A deep copy of this object: a new Selection built from deep copies of the universe and of the
        `_intervals` edge collection. """
        return Selection(universe=deepcopy(self.universe), intervals=deepcopy(self._intervals))
Example #12
0
class OracleFunction(Oracle):
    """
    Oracle defined by a set of Conditions.

    A point is a member of the oracle when every stored Condition holds for it
    ('and' policy). The variables appearing in the conditions are kept in a
    SortedSet so that the coordinates of a point can be matched with them
    in a stable order.
    """

    def __init__(self):
        # type: (OracleFunction) -> None
        """
        An OracleFunction is a set of Conditions.
        """
        Oracle.__init__(self)
        self.variables = SortedSet([], key=default_sort_key)
        self.oracle = set()

    def __repr__(self):
        # type: (OracleFunction) -> str
        """
        Printer.
        """
        return self._to_str()

    def __str__(self):
        # type: (OracleFunction) -> str
        """
        Printer.
        """
        return self._to_str()

    def _to_str(self):
        # type: (OracleFunction) -> str
        """
        Printer: the string form of the underlying set of conditions.
        """
        return str(self.oracle)

    def __eq__(self, other):
        # type: (OracleFunction, OracleFunction) -> bool
        """
        self == other

        Two OracleFunctions are equal when they hold the same set of conditions.
        """
        return self.oracle == other.oracle

    def __ne__(self, other):
        # type: (OracleFunction, OracleFunction) -> bool
        """
        self != other
        """
        return not self.__eq__(other)

    def __hash__(self):
        # type: (OracleFunction) -> int
        """
        Identity function (via hashing).

        The hash is computed from a frozenset so that it does not depend on
        the iteration order of the set of conditions: objects that compare
        equal in __eq__ always get the same hash.
        """
        return hash(frozenset(self.oracle))

    def add(self, cond):
        # type: (OracleFunction, Condition) -> None
        """
        Addition of a new condition to the Oracle.

        Args:
            self (OracleFunction): The OracleFunction.
            cond (Condition): The condition to add. Its variables are merged
                              into the variables of the OracleFunction.

        Returns:
            None: A new condition is added to the set of conditions
            of the OracleFunction.

        Example:
        >>> ora = OracleFunction()
        >>> cond = Condition("x + y", ">=", "0")
        >>> ora.add(cond)
        """
        self.variables = self.variables.union(cond.get_variables())
        self.oracle.add(cond)

    def dim(self):
        # type: (OracleFunction) -> int
        """
        See Oracle.dim().

        The dimension is the number of variables of the OracleFunction.
        """
        return len(self.get_variables())

    def get_var_names(self):
        # type: (OracleFunction) -> list
        """
        See Oracle.get_var_names().

        Returns the string form of every variable, in the order of the
        sorted set of variables.
        """
        return [str(i) for i in self.variables]

    def get_variables(self):
        # type: (OracleFunction) -> list
        """
        Returns the list of variables of all the polynomial expressions
        (i.e., Conditions) stored in the OracleFunction.

        Args:
            self (OracleFunction): The OracleFunction.

        Returns:
            list: The list of variables (Symbols).

        Example:
        >>> cond1 = Condition("2x - 4y", ">=", "0")
        >>> cond2 = Condition("2x + z", ">=", "0")
        >>> ora = OracleFunction()
        >>> ora.add(cond1)
        >>> ora.add(cond2)
        >>> ora.get_variables()
        [Symbol('x'), Symbol('y'), Symbol('z')]
        """
        # self.variables is already sorted (SortedSet keyed by default_sort_key).
        return list(self.variables)

    def _eval_var_val(self, var=None, val='0'):
        # type: (OracleFunction, Symbol, int) -> bool
        """
        Evaluates every condition with Condition.eval_var_val(var, val).
        Returns True when all the conditions are true (i.e., 'and' policy).
        """
        _eval_list = [cond.eval_var_val(var, val) for cond in self.oracle]
        return all(_eval_list)

    def _eval_tuple(self, point):
        # type: (OracleFunction, tuple) -> bool
        """
        Evaluates every condition with Condition.eval_tuple(point).
        Returns True when all the conditions are true (i.e., 'and' policy).
        """
        _eval_list = [cond.eval_tuple(point) for cond in self.oracle]
        return all(_eval_list)

    def _eval_zip_tuple(self, var_point):
        # type: (OracleFunction, list) -> bool
        """
        Evaluates every condition with Condition.eval_zip_tuple(var_point).
        Returns True when all the conditions are true (i.e., 'and' policy).
        """
        _eval_list = [cond.eval_zip_tuple(var_point) for cond in self.oracle]
        return all(_eval_list)

    def _eval_dict(self, d=None):
        # type: (OracleFunction, dict) -> bool
        """
        Evaluates every condition with Condition.eval_dict(d).
        Returns True when all the conditions are true (i.e., 'and' policy).
        """
        _eval_list = [cond.eval_dict(d) for cond in self.oracle]
        return all(_eval_list)

    def __contains__(self, point):
        # type: (OracleFunction, tuple) -> bool
        """
        Synonym of self.member(point).
        A point belongs to the Oracle if it satisfies all the conditions.
        """
        return self.member(point) is True

    def _member_zip_tuple(self, point):
        # type: (OracleFunction, tuple) -> bool
        """
        Membership test that pairs each variable with the coordinate of
        'point' in the same position.
        """
        # keys = [x, y, z]
        keys = self.variables
        # point = (2, 4, 0)
        # var_point = [(x, 2), (y, 4), (z, 0)]
        # list(...) makes this work in Python 2.7 and Python 3.x
        var_point = list(zip(keys, point))
        return self._eval_zip_tuple(var_point)

    def _member_dict(self, point):
        # type: (OracleFunction, tuple) -> bool
        """
        Membership test that maps each variable to the coordinate of
        'point' in the same position.
        """
        # keys = [x, y, z]
        keys = self.variables
        # point = (2, 4, 0)
        # di = {x: 2, y: 4, z: 0}
        di = {key: point[i] for i, key in enumerate(keys)}
        return self._eval_dict(di)

    def member(self, point):
        # type: (OracleFunction, tuple) -> bool
        """
        See Oracle.member().
        A point belongs to the Oracle if it satisfies all the conditions.
        """
        # _member_zip_tuple performs better than _member_dict
        return self._member_zip_tuple(point)

    def membership(self):
        # type: (OracleFunction) -> callable
        """
        See Oracle.membership().

        Returns a function of one argument (a point) that calls self.member.
        """
        return lambda point: self.member(point)

    # Read/Write file functions
    def from_file_binary(self, finput=None):
        # type: (OracleFunction, io.BinaryIO) -> None
        """
        See Oracle.from_file_binary().

        Reads the set of conditions and then the set of variables, in the
        order in which to_file_binary() writes them.
        """
        assert (finput is not None), 'File object should not be null'

        # SECURITY NOTE: pickle.load can execute arbitrary code; only load
        # files that come from a trusted source.
        self.oracle = pickle.load(finput)
        self.variables = pickle.load(finput)

    def from_file_text(self, finput=None):
        # type: (OracleFunction, io.BinaryIO) -> None
        """
        See Oracle.from_file_text().

        Every line of the file is parsed as one Condition and added to the
        OracleFunction.
        """
        assert (finput is not None), 'File object should not be null'

        # Each line has a Condition
        for line in finput:
            cond = Condition()
            cond.init_from_string(line)
            self.add(cond)

    def to_file_binary(self, foutput=None):
        # type: (OracleFunction, io.BinaryIO) -> None
        """
        See Oracle.to_file_binary().

        Writes the set of conditions followed by the set of variables.
        """
        assert (foutput is not None), 'File object should not be null'

        pickle.dump(self.oracle, foutput, pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.variables, foutput, pickle.HIGHEST_PROTOCOL)

    def to_file_text(self, foutput=None):
        # type: (OracleFunction, io.BinaryIO) -> None
        """
        See Oracle.to_file_text().

        Every condition writes itself to the file via Condition.to_file_text().
        """
        assert (foutput is not None), 'File object should not be null'

        # Each line has a Condition
        for cond in self.oracle:
            cond.to_file_text(foutput)
Example #13
0
    def finalize(self):
        """Create, fill and compile the PDF document(s) for this task.

        One section of plots is added per channel (for the operators that are
        neither part of an operator set nor excluded) and one per operator
        set. When ``self.split_pdfs`` is set, a separate document is produced
        for every ``channel.irrep_psq_key`` and for every operator set;
        otherwise everything goes into a single document. When
        ``self.write_operators`` is set, the operators are also written out via
        ``operator_info.operator_set``; a pre-existing YAML operator file is
        removed first.
        """
        if self.write_operators and os.path.isfile(self.op_yaml_file):
            os.remove(self.op_yaml_file)

        all_operator_set_ops = SortedSet()
        if self.operator_sets:
            all_operator_set_ops = SortedSet.union(
                *[op_set.operators for op_set in self.operator_sets])

        def channel_operators(channel):
            # Operators of the channel that are not handled by an operator set
            # and have not been excluded.
            return [
                op for op in self.data_handler.getChannelOperators(channel)
                if op not in all_operator_set_ops
                and op not in self.excluded_operators
            ]

        # create docs
        if self.split_pdfs:
            docs = dict()
            for channel in self.channels:
                if not channel_operators(channel):
                    continue

                if channel.irrep_psq_key not in docs:
                    docs[channel.irrep_psq_key] = util.create_doc(
                        f"Correlators and Effective Energies: {self.ensemble_name} - {self.task_name} - {channel.irrep_psq_key}"
                    )

            for operator_set in self.operator_sets:
                docs[operator_set.name] = util.create_doc(
                    f"Correlators and Effective Energies: {self.ensemble_name} - {self.task_name} - {operator_set.name}"
                )

        else:
            doc = util.create_doc(
                f"Correlators and Effective Energies: {self.ensemble_name} - {self.task_name}"
            )

        # create content
        for channel in self.channels:
            data_files = self.data_files + self.data_handler.getChannelDataFiles(
                channel)

            operators = channel_operators(channel)
            if not operators:
                continue

            # Look the document up only after the check above: a document is
            # created for an irrep_psq_key only if some channel with that key
            # has operators to plot.
            if self.split_pdfs:
                doc = docs[channel.irrep_psq_key]

            if self.write_operators:
                operator_info.operator_set.write_operators(
                    self.op_file(repr(channel)), operators, True, False)
                operator_info.operator_set.write_operators_to_yaml(
                    self.op_yaml_file, repr(channel), operators, True)

            with doc.create(pylatex.Section(str(channel))):
                self.addPlotsToPDF(doc, data_files, operators, repr(channel))

        for operator_set in self.operator_sets:
            if self.write_operators:
                operator_info.operator_set.write_operators(
                    self.op_file(operator_set.name), operator_set.operators,
                    True, False)
                operator_info.operator_set.write_operators_to_yaml(
                    self.op_yaml_file, operator_set.name,
                    operator_set.operators, True)

            # Combine with `+` (as done for the channels above) instead of
            # `+=`, which could extend self.data_files itself in place.
            data_files = self.data_files
            for channel in operator_set.channels:
                data_files = data_files + self.data_handler.getChannelDataFiles(
                    channel)

            if self.split_pdfs:
                # Each operator set has its own document, created above.
                self.addPlotsToPDF(docs[operator_set.name], data_files,
                                   operator_set.operators, operator_set.name)

            else:
                with doc.create(pylatex.Section(operator_set.name)):
                    self.addPlotsToPDF(doc, data_files, operator_set.operators,
                                       operator_set.name)

        # compile
        if self.split_pdfs:
            for split_key, doc in docs.items():
                filename = os.path.join(
                    self.results_dir,
                    f"{util.str_to_file(self.task_name)}_{split_key}_rebin{self.rebin}"
                )
                util.compile_pdf(doc, filename, self.latex_compiler)
        else:
            filename = os.path.join(
                self.results_dir,
                f"{util.str_to_file(self.task_name)}_rebin{self.rebin}")
            util.compile_pdf(doc, filename, self.latex_compiler)
Example #14
0
class Genotype(object):
    """Genotype table: for every sample, a mapping from locus to genotype.

    Samples are kept in the order in which they were first added; loci are
    kept in a SortedSet. Each sample appears only once in ``samples`` even if
    it is added or merged more than once (the most recent data wins).
    """

    def __init__(self):
        self._samples = []  # sample names, in order of first insertion
        self._loci = SortedSet([])  # every locus seen so far
        self._data = dict()  # sample -> {locus: genotype}

    def add_loci(self, loci):
        """Register the given loci (already known loci are kept once).

        Returns self so that calls can be chained.
        """
        self._loci = self._loci.union(loci)
        return self

    def add(self, sample, loci, genotype):
        """Store the genotype of ``sample``: ``genotype[i]`` belongs to ``loci[i]``.

        Adding a sample that is already present replaces its data; the sample
        is not listed a second time.

        Returns self so that calls can be chained.

        Raises:
            ValueError: if ``loci`` and ``genotype`` differ in length.
        """
        if len(loci) != len(genotype):
            raise ValueError("Inconsistent loci and genotype sizes")
        self.add_loci(loci)  # append loci if necessary
        if sample not in self._data:
            self._samples.append(sample)
        self._data[sample] = dict(zip(loci, genotype))
        return self

    def merge(self, other):
        """Merge the loci, samples and data of ``other`` into this object.

        For samples present in both, the data of ``other`` replaces the data
        held here. The per-sample dictionaries are copied, so the two objects
        do not share them afterwards.

        Returns self so that calls can be chained.

        Raises:
            ValueError: if ``other`` is not a Genotype.
        """
        if not isinstance(other, Genotype):
            raise ValueError("Must be of type `Genotype` to merge")
        self._loci = self._loci.union(other.loci)
        for sample in other.samples:
            if sample not in self._data:
                self._samples.append(sample)
            self._data[sample] = dict(other._data[sample])
        return self

    @property
    def loci(self):
        """The collection of known loci."""
        return self._loci

    @property
    def n_loci(self):
        """Number of known loci."""
        return len(self._loci)

    @property
    def samples(self):
        """List of sample names, in order of first insertion."""
        return self._samples

    @property
    def n_samples(self):
        """Number of samples."""
        return len(self._samples)

    def get(self, sample, loci):
        """Return the genotype of ``sample`` at the locus ``loci``.

        If the locus is known but the sample has no value for it, the
        missing-data pair ``("-9", "-9")`` is returned.

        Raises:
            KeyError: if the sample or the locus is unknown.
        """
        if (sample not in self._data) or (loci not in self._loci):
            raise KeyError((sample, loci))
        return self._data[sample].get(loci, ("-9", "-9"))

    def write(self, stream):
        """Write the table to ``stream``.

        The first line holds the loci, preceded by an empty field. Every
        sample then takes two lines, each starting with the sample name: the
        first element of every genotype on the first line, the second element
        on the second line. Missing genotypes are written as "-9".
        """
        print("", " ".join(self.loci), file=stream)
        for sample, geno in self:
            line0 = [sample]
            line1 = [sample]
            for locus in self.loci:
                alleles = geno.get(locus, ("-9", "-9"))
                line0.append(alleles[0])
                line1.append(alleles[1])
            print(" ".join(line0), file=stream)
            print(" ".join(line1), file=stream)

    def __iter__(self):
        """Yield ``(sample, {locus: genotype})`` for every sample."""
        for sample in self._samples:
            yield sample, self._data[sample]

    @classmethod
    def combine(cls, one, other):
        """Return a new object holding the merged content of ``one`` and ``other``."""
        return cls().merge(one).merge(other)

    @classmethod
    def parse_str(cls, stream):
        """Build an object from the two-lines-per-sample layout.

        The first line lists the loci. After that, every sample occupies two
        consecutive lines that both start with the sample name; the remaining
        fields of the two lines are paired up per locus. ``stream`` must be an
        iterator over lines. A trailing line without its partner is ignored.
        """
        genotype = cls()
        loci = next(stream).rstrip().split()
        loci = [_sanitize_locus(l) for l in loci]
        while True:
            line0 = next(stream, None)
            line1 = next(stream, None)
            if line0 is None or line1 is None:
                break
            name, *gen0 = line0.rstrip().split()
            gen1 = line1.rstrip().split()[1:]
            name = _sanitize_name(name)
            geno = list(zip(gen0, gen1))
            genotype.add(name, loci, geno)
        return genotype

    @classmethod
    def parse_delim(cls, stream, delimiter=None):
        """Build an object from a delimited table with one line per sample.

        The first line is a header whose first field is skipped and whose
        remaining fields are the loci. On every following line the first field
        is the sample name and the remaining fields are the genotypes.
        ``stream`` must be an iterator over lines.
        """
        genotype = cls()
        loci = next(stream).rstrip("\n").split(delimiter)[1:]
        loci = [_sanitize_locus(l) for l in loci]
        for line in stream:
            name, *geno = line.strip("\n").split(delimiter)
            name = _sanitize_name(name)
            geno = [_sanitize_genotype(g) for g in geno]
            genotype.add(name, loci, geno)
        return genotype

    @classmethod
    def parse_file(cls, stream, format="txt"):
        """Parse ``stream`` according to ``format``.

        "str" selects parse_str(); any key of FORMAT_DELIMITER selects
        parse_delim() with the corresponding delimiter.

        Raises:
            ValueError: if ``format`` is neither "str" nor a key of
                FORMAT_DELIMITER.
        """
        if format == "str":
            return cls.parse_str(stream)
        elif format in FORMAT_DELIMITER:
            return cls.parse_delim(stream, FORMAT_DELIMITER[format])
        else:
            raise ValueError("Invalid file format")
Example #15
0
class SparseTimeSeriesDataSet:
    # A dataset designed for dealing with sparse time series data that needs to be kept in sync in time.
    def __init__(self, unique_timestamps=None, minimum_time_between_timestamps=None, mode='strict'):
        """Create an empty dataset, optionally seeded with known timestamps.

        ``mode`` is stored for ``add``, which checks it against 'strict',
        'remove_difference' and 'union'. The minimum-interval check runs
        straight away on the seeded timestamps.
        """
        self.unique_timestamps = (
            SortedSet() if unique_timestamps is None else SortedSet(unique_timestamps)
        )

        self.mode = mode

        # Raw rows per id, stored exactly as they were handed to ``add``.
        self.all_raw_data = {}
        # Per id: a sorted mapping from timestamp to value(s) (dict of sorteddicts).
        self.timestamp_indexed_data = {}

        self.minimum_time_between_timestamps = minimum_time_between_timestamps
        self.check_minimum_timestamp_interval()


    def __len__(self):
        return len(self.unique_timestamps)

    @classmethod
    def sample_data_at_intervals(cls, start_timestamp, end_timestamp, interval, data):
        # extends previous datapoint if one is missing
        timestamps = SortedList([x[0] for x in data])

        start_timestamp = int(start_timestamp)
        end_timestamp = int(end_timestamp)

        assert(timestamps[0] <= start_timestamp)
        assert(timestamps[-1] >= end_timestamp)
        sampled_data = []

        for timestamp in range(start_timestamp, end_timestamp+1, interval):
            index = timestamps.bisect_right(timestamp)-1
            new_datapoint = data[index].copy()
            new_datapoint[0] = timestamp
            sampled_data.append(new_datapoint)

        return sampled_data

    @property
    def ids(self):
        return list(self.all_raw_data.keys())

    @property
    def first_timestamp(self):
        return self.unique_timestamps[0]

    def first_timestamp_for_id(self, id):
        return self.all_raw_data[id][0][0]

    @property
    def last_timestamp(self):
        return self.unique_timestamps[-1]

    def last_timestamp_for_id(self, id):
        return self.all_raw_data[id][-1][0]

    def first_unpadded_index_for_id(self, id):
        first_timestamp = self.first_timestamp_for_id(id)
        return self.unique_timestamps.index(first_timestamp)

    def last_unpadded_index_for_id(self, id):
        last_timestamp = self.last_timestamp_for_id(id)
        return self.unique_timestamps.index(last_timestamp)


    def check_minimum_timestamp_interval(self):
        if self.minimum_time_between_timestamps is not None:
            prev_timestamp = 0
            for timestamp in self.unique_timestamps:
                if timestamp-prev_timestamp < self.minimum_time_between_timestamps:
                    raise InvalidTimestampsInDataError("Found timestamps that have less than the required {} between them".format(self.minimum_time_between_timestamps))
                prev_timestamp = timestamp

    def add(self, id: str, data):
        if len(data) == 0:
            raise ValueError("Tried to add empty data for id {}".format(id))

        if id in self.all_raw_data and self.all_raw_data[id] == data:
            print("Data for id {} already added.".format(id))
            return

        self.all_raw_data[id] = data

        if len(data[0]) > 2:
            # we have multidimensional data
            timestamp_indexed_data = SortedDict([[int(x[0]), x[1:]] for x in data])
        else:
            timestamp_indexed_data = SortedDict([[int(x[0]), x[1]] for x in data])


        new_timestamps = {x[0] for x in data}
        difference = new_timestamps.difference(self.unique_timestamps)

        if self.mode == 'strict':
            if len(difference) != 0:
                raise InvalidTimestampsInDataError("Tried to add new data with id {} that includes timestamps that are not in the set of allowed timestamps. "
                                                   "Difference = {}".format(id, difference))
            opposite_difference = self.unique_timestamps.difference(new_timestamps)
            # for timestamp_current in opposite_difference:
            #     if timestamp_current > min(new_timestamps) and timestamp_current < max(new_timestamps):
            #         raise Exception("Missing timestamps in the middle of the data")

        elif self.mode == 'remove_difference':
            for timestamp_to_remove in difference:
                del(timestamp_indexed_data[timestamp_to_remove])

        elif self.mode == 'union':
            self.unique_timestamps = self.unique_timestamps.union(new_timestamps)

        self.check_minimum_timestamp_interval()

        if len(timestamp_indexed_data) == 0:
            raise NotEnoughInputData("The data being added has zero length. If the mode is remove_difference, then this means that the new data has no timestamps in common with the required timestamps")

        self.timestamp_indexed_data[id] = timestamp_indexed_data


    def get_left_and_right_padding_required(self, ids):
        padding_required = []
        for id in ids:
            first_timestamp_for_id = self.first_timestamp_for_id(id)
            last_timestamp_for_id = self.last_timestamp_for_id(id)
            left_padding = self.unique_timestamps.index(first_timestamp_for_id)
            right_padding = len(self) - self.unique_timestamps.index(last_timestamp_for_id)-1

            assert(self.all_raw_data[id][0][0] == self.unique_timestamps[left_padding])
            assert(self.all_raw_data[id][-1][0] == self.unique_timestamps[-(right_padding+1)])

            padding_required.append([left_padding, right_padding])
        return padding_required

    def get_data_extend_missing_internal(self, id: str):
        """Return the series for ``id`` with interior gaps filled by the previous value.

        Timestamps that are in ``unique_timestamps`` but absent from the series
        are filled only when they fall strictly inside the series; nothing is
        padded before its first or after its last entry. The fill is written
        into the stored series itself, so it persists after the call.

        Returns:
            A list of ``[timestamp, *values]`` lists when the first stored
            value is a list or tuple, otherwise a list of
            ``(timestamp, value)`` tuples.
        """
        series = self.timestamp_indexed_data[id]

        absent = self.unique_timestamps - set(series.keys())
        for timestamp in absent:
            position = series.bisect_right(timestamp)
            # position == 0 is before the data, position == len is after it.
            if 0 < position < len(series):
                series[timestamp] = series.peekitem(position - 1)[1]

        first_value = series.peekitem(0)[1]
        if isinstance(first_value, (list, tuple)):
            return [[entry[0], *entry[1]] for entry in series.items()]
        return list(series.items())


    def get_padded_data_in_sync(self, padding_val = "extend"):
        # It will always pad missing values in the middle or end of the data by extending the previous value.
        # The padding_val variable determined how to pad the beginning when there is no value before it.
        padded_timestamp_indexed_data = {}

        for ric, timestamp_indexed_data in self.timestamp_indexed_data.items():
            padded_timestamp_indexed_data[ric] = timestamp_indexed_data

            timestamps_in_this_data = set(timestamp_indexed_data.keys())
            missing_timestamps = self.unique_timestamps - timestamps_in_this_data

            if len(missing_timestamps) > 0:
                for timestamp in missing_timestamps:
                    entry_index = padded_timestamp_indexed_data[ric].bisect_right(timestamp)
                    if entry_index == 0:
                        if padding_val == 'extend':
                            current_padded_value = padded_timestamp_indexed_data[ric].peekitem(entry_index)[1]
                        else:
                            current_padded_value = padding_val
                    else:
                        current_padded_value = padded_timestamp_indexed_data[ric].peekitem(entry_index-1)[1]

                    padded_timestamp_indexed_data[ric][timestamp] = current_padded_value

        return padded_timestamp_indexed_data


    def get_start_and_end_index_for_concat_data(self, keys):
        start_stop = []
        current_position = 0
        for id in keys:
            if id in self.timestamp_indexed_data:
                length_of_data = len(self.timestamp_indexed_data[id])
                start_stop.append([current_position,current_position+length_of_data])
                current_position = length_of_data
            else:
                print("warning: tried to concat data for keys {} but key {} is missing".format(keys, id))

        return start_stop


    def concat_data_unpadded(self, keys, as_numpy = True, with_timestamps = True):
        data_to_concat = []
        for id in keys:
            if id in self.timestamp_indexed_data:
                if with_timestamps:
                    data_to_concat.append(np.squeeze(self.timestamp_indexed_data[id].items()[:]))
                else:
                    data_to_concat.append(np.squeeze(self.timestamp_indexed_data[id].values()[:]))
            else:
                print("warning: tried to concat data for keys {} but key {} is missing".format(keys, id))


        if as_numpy:
            return np.concatenate(data_to_concat)
        else:
            return np.concatenate(data_to_concat).tolist()