Example No. 1
def count_retrieved_messages(mailBox):
    import re
    from collections import Counter as counter

    addresses = []
    for mail in mailBox:
        (date, address, subject) = get_message_info(mail)
        matches = re.findall(r'<(.+?)>', address)
        addresses.append("".join(matches))

    address_counts = counter(addresses)
    return address_counts, sorted(address_counts.items(), key=lambda kv: kv[1])
Example No. 2
def life(cells):
    """Takes a list of (x,y) points, and returns the next generation of the Game of Life."""
    neighbours = counter(
        [(x + xo, y + yo) for x, y in cells for xo in [-1, 0, 1] for yo in [-1, 0, 1]]
    )
    return counter(
        [
            cell
            for cell in neighbours
            if neighbours[cell] == 3 or (neighbours[cell] == 4 and cell in cells)
        ]
    )
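As a quick sanity check (assuming `life` is defined as above and the `Counter as counter` alias used elsewhere in these examples is in scope), the blinker oscillator flips between a vertical and a horizontal bar of three cells:

from collections import Counter as counter

blinker = [(1, 0), (1, 1), (1, 2)]       # vertical bar centred on (1, 1)
next_gen = life(blinker)                 # returns a Counter whose keys are the live cells
assert sorted(next_gen) == [(0, 1), (1, 1), (2, 1)]   # horizontal bar through (1, 1)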
Example No. 3
def ransom_note(magazine, ransom):
    if len(ransom) > len(magazine):
        return False

    magazine_dict = counter(magazine)
    ransom_dict = counter(ransom)

    for key, value in ransom_dict.items():
        if key not in magazine_dict:
            return False
        elif value > magazine_dict[key]:
            return False

    return True
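The same check can be written with Counter subtraction, which keeps only positive counts; a hypothetical `ransom_note_short` (assuming the same `Counter as counter` alias) is equivalent:

def ransom_note_short(magazine, ransom):
    # the difference is empty exactly when the magazine covers every character of the ransom note
    return not (counter(ransom) - counter(magazine))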
Example No. 4
def minWindowSub(source, target):
	if not source or not target:
		return ""
	s = counter()
	t = counter()
	for i in source:
		s[i] += 1
	for j in target:
		t[j] += 1

	start = 0
	end = 0
	while start < len(source):
		for i in range(len(target)):
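The snippet above is cut off before the window-scanning logic. Purely as a sketch (hypothetical name `min_window`, assuming the `Counter as counter` alias used elsewhere in these examples), the usual Counter-based sliding window for the minimum window substring looks like this:

def min_window(source, target):
    if not source or not target:
        return ""
    need = counter(target)      # characters still missing from the current window
    missing = len(target)       # how many characters (with multiplicity) are still missing
    best = ""
    left = 0
    for right, ch in enumerate(source):
        if need[ch] > 0:
            missing -= 1
        need[ch] -= 1
        if missing == 0:        # the window [left, right] covers target; shrink it from the left
            while need[source[left]] < 0:
                need[source[left]] += 1
                left += 1
            if not best or right - left + 1 < len(best):
                best = source[left:right + 1]
            need[source[left]] += 1   # drop the leftmost needed char and keep scanning
            missing += 1
            left += 1
    return best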
Example No. 5
def scopes_size(scopes: Scopes) -> Counter:
    """ Calculate the different scope lengths.

    Parameters
    ----------
    scopes : DefaultDict
        Dictionary of cells (keys) and their scopes
 
    Returns
    -------
    Counter :
        Counter of scope lengths (keys) and their frequencies (values).
            
    See Also
    --------
    get_scopes
 
    Examples
    --------
    >>> import numpy as np
    >>> scopes = structure(2, 3)[2] 
    >>> scopes_size(scopes) == Counter({2: 4, 3: 4, 4: 1})
    True
    """

    return counter([len(scope) for scope in scopes.values()])
Example No. 6
def calc_mode(list1):
    # alternative: max(set(list1), key=list1.count)
    data = collections.Counter(list1)
    datadic = dict(data)
    max_value = max(datadic.values())
    mode = [num for num, freq in datadic.items() if freq == max_value]
    return mode
Example No. 7
def download_many(cc_list, base_url, verbose, concur_req):
    counter = collections.Counter()
    with futures.ThreadPoolExecutor(max_workers=concur_req) as executor:
        # create the ThreadPoolExecutor with max_workers set to concur_req
        to_do_map = {}  # maps each Future (one download) to its country code, used when reporting errors
        for cc in sorted(cc_list):
            future = executor.submit(download_one, cc, base_url, verbose)
            # each call to executor.submit schedules a callable to run and returns a Future
            to_do_map[future] = cc  # store the returned future and its country code in the dict
        done_iter = futures.as_completed(to_do_map)
        # futures.as_completed returns an iterator that yields futures as they complete
        if not verbose:
            done_iter = tqdm.tqdm(done_iter, total=len(cc_list))
            # if not in verbose mode, wrap the as_completed result with tqdm to show a progress bar
        for future in done_iter:  # iterate over the futures as they finish
            try:
                res = future.result()
                # result() either returns the callable's return value or re-raises the exception
                # caught while the callable ran; it may block while waiting for the outcome
            except requests.exceptions.HTTPError as exc:
                error_msg = 'HTTP {res.status_code} - {res.reason}'
                error_msg = error_msg.format(res=exc.response)
            except requests.exceptions.ConnectionError as exc:
                error_msg = 'Connection error'
            else:
                error_msg = ''
                status = res.status
            if error_msg:
                status = HTTPStatus.error
            counter[status] += 1
            if verbose and error_msg:
                cc = to_do_map[future]
                # for context in the error message, look up this future's country code in to_do_map
                print('*** Error for {}: {}'.format(cc, error_msg))
    return counter
Example No. 8
from collections import Counter as counter


def glove(n, arr):
    counts = counter(arr)
    return sum(i // 2 for i in counts.values())


n = int(input())
arr = list(map(int, input().split()))
print(glove(n, arr))
Example No. 9
    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
            gives the distance between the ith test point and the jth training point.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
            test data, where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)

        print('start predicting labels')

        for i in range(num_test):
            print(i)
            # A list of length k storing the labels of the k nearest neighbors to
            # the ith test point.
            closest_y = self.y_train[np.argsort(dists[i])[:k]]

            # Naive way (just assign the 0-index of closest_y)
            y_pred[i] = closest_y[0]
            # Better way, count which class appears most
            y_pred[i] = counter(closest_y).most_common(1)[0][0]

        return y_pred
Example No. 10
    def _filter_atomic_property(self, zeta_data, selected_atoms):
        zeta_data['atomic_number'] = zeta_data.index.labels[0] + 1
        zeta_data['ion_number'] = zeta_data.index.labels[1] + 1
        zeta_data = zeta_data[zeta_data.atomic_number.isin(selected_atoms)]
        zeta_data_check = counter(zeta_data.atomic_number.values)
        keys = np.array(list(zeta_data_check.keys()))
        values = np.array(list(zeta_data_check.values()))
        if np.all(keys + 1 == values):
            return zeta_data
        else:
#            raise IncompleteAtomicData('zeta data')
# This currently replaces missing zeta data with 1, which is necessary with
# the present atomic data. Will replace with the error above when I have
# complete atomic data.
            logger.warning('Zeta_data missing - replaced with 1s')
            updated_index = []
            for atom in selected_atoms:
                for ion in range(1, atom + 2):
                    updated_index.append([atom, ion])
            updated_index = np.array(updated_index)
            updated_dataframe = pd.DataFrame(index=pd.MultiIndex.from_arrays(
                updated_index.transpose().astype(int)),
                columns=zeta_data.columns)
            for value in range(len(zeta_data)):
                updated_dataframe.ix[zeta_data.atomic_number.values[value]].ix[
                    zeta_data.ion_number.values[value]] = \
                    zeta_data.ix[zeta_data.atomic_number.values[value]].ix[
                        zeta_data.ion_number.values[value]]
            updated_dataframe = updated_dataframe.astype(float)
            updated_index = pd.DataFrame(updated_index)
            updated_dataframe['atomic_number'] = np.array(updated_index[0])
            updated_dataframe['ion_number'] = np.array(updated_index[1])
            updated_dataframe.fillna(1.0, inplace=True)
            return updated_dataframe
Example No. 12
 def query(self, point):
     cnt = collections.Counter()
     for rep in self.reprs(point, self.tq):
         cnt[rep] += 1
         for point2 in self.lists[rep]:
             yield point2
     print(f'queried {sum(cnt.values())} buckets. {len(cnt)} unique.')
Example No. 13
def _get_intents_and_slots(frame: Node, tree_based: bool) -> IntentsAndSlots:
    intents: Counter[Node] = counter()
    slots: Counter[Node] = counter()

    def process_node(node: Node, is_intent: bool) -> None:
        for child in node.children:
            process_node(child, not is_intent)
        if not tree_based:
            node = type(node)(node.label, deepcopy(node.span), text=node.text)
        if is_intent:
            intents[node] += 1
        else:
            slots[node] += 1

    process_node(frame, True)
    return IntentsAndSlots(intents=intents, slots=slots)
Example No. 14
def get_expected_messages(stream):
    """Parses a file and get expected messages.

    :param stream: File-like input stream.
    :returns: A dict mapping line,msg-symbol tuples to the count on this line.
    """
    messages = collections.Counter()
    for i, line in enumerate(stream):
        match = _EXPECTED_RE.search(line)
        if match is None:
            continue
        line = match.group('line')
        if line is None:
            line = i + 1
        elif line.startswith('+') or line.startswith('-'):
            line = i + 1 + int(line)
        else:
            line = int(line)

        version = match.group('version')
        op = match.group('op')
        if version:
            required = parse_python_version(version)
            if not _OPERATORS[op](sys.version_info, required):
                continue

        for msg_id in match.group('msgs').split(','):
            messages[line, msg_id.strip()] += 1
    return messages
    def get_sequence_weights(self):
        """
        Return the calculated sequence weights for all sequences in the MSA.
        The order of weights in the array must be equal to the order of the
        sequences in the MSA.

        :return: Numpy array (dtype=numpy.float64) containing the weights for
                 all sequences in the MSA.
        """
        # weights = np.zeros(self.get_size()[0], dtype=np.float64)
        self.r_vals = np.zeros(self.get_size()[1], dtype=np.int64)
        cell_weight = np.zeros_like(self.sequences, dtype=np.float64)
        for i in range(self.get_size()[1]):
            column = self.sequences[:, i]  # equivalent to np.take(self.sequences, i, axis=1)
            counts = counter(column)
            # inefficient way
            """ s_vals = np.array([counts[val] for val in column])
            self.r_vals[i] = len(counts)
            for j in range(len(column)):
                cell_weight[j][i] = 1/(self.r_vals[i]*counts[self.sequences[j][i]]) """
            # more efficient way
            s_vals = np.array(list(map(lambda key: counts[key], column)),
                              dtype=np.int64)
            self.r_vals[i] = len(counts)
            if self.r_vals[i] > 1:
                # corresponds to S_i,k
                cell_weight[:, i] = np.divide(
                    1, np.multiply(self.r_vals[i], s_vals)
                )  # Or cell_weight[:, i] = 1/(self.r_vals[i]*s_vals[:])
        weights = cell_weight.sum(1)
        # weights = [sum(x) for x in cell_weight]
        return weights.astype(np.float64)
def arrayToCdf(array):
    c = counter(array)
    k = sorted(c.keys())
    total = sum(c.values())
    v = [float(c[key]) for key in k]
    pdf = [x/total for x in v]
    cdf = np.cumsum(pdf)
    return [k, cdf]
Example No. 17
 def _get_received(self):
     messages = self._linter.reporter.messages
     messages.sort(key=lambda m: (m.line, m.symbol, m.msg))
     received_msgs = collections.Counter()
     received_output_lines = []
     for msg in messages:
         received_msgs[msg.line, msg.symbol] += 1
         received_output_lines.append(OutputLine.from_msg(msg))
     return received_msgs, received_output_lines
Example No. 18
 def _check_same_line_imports(self, node):
     # Detect duplicate imports on the same line.
     names = (name for name, _ in node.names)
     counter = collections.Counter(names)
     for name, count in counter.items():
         if count > 1:
             self.add_message('reimported',
                              node=node,
                              args=(name, node.fromlineno))
Example No. 19
def stats_text_en(text):
    import collections
    if not isinstance(text, str):  # Day 8: added argument type checking
        raise ValueError('Input is not text, please try again:')
    text = text.replace('.', '').replace('!', '').replace('--', '').replace('*', '').replace(',', '').replace('(', '').replace(')', '').replace(';', '').replace(':', '').replace('\'', '').replace('?', '').replace('_', '').replace('-', '').replace('/', '').replace('[', '').replace(']', '').replace('\\', '').replace('\"', '').replace('{', '').replace('}', '').replace('\t', '').replace('\n', '').replace('\r\n', '')    # strip punctuation marks and whitespace characters
    list_text = text.split()  # split the string into a list of words
    count = int(input("Enter the number of items to output: "))
    dic = collections.Counter(list_text).most_common(count)
    return dic
Example No. 20
from collections import Counter as counter


def solve(arr):
    c = counter(arr)
    dicc_sorted = dict()
    print(c)
    for key in sorted(c.keys()):
        dicc_sorted[key] = c[key]   # rebuild the counts in sorted key order
    return dicc_sorted


print(solve([1,2,3,0,5,0,1,6,8,8,6,9,1]))
Example No. 21
def stats_text_en(text):  # define the function
    import collections
    if not isinstance(text, str):
        raise ValueError('The argument must be of type str, got %s' % type(text))
    text = text.replace(',', '').replace('.', '').replace('!', '').replace(
        '--', '').replace('*', '').replace('(', '').replace(')', '')
    list_text = text.split()
    count = int(input("Enter the number of items to output: "))
    dic = collections.Counter(list_text).most_common(count)
    return dic
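A more compact variant, shown here only as a sketch (hypothetical `stats_text_en_re`, taking the count as a parameter instead of prompting), extracts the words with a regular expression rather than chained `replace` calls; note that it splits contractions such as "don't" into two tokens, unlike the version above:

import collections
import re

def stats_text_en_re(text, count):
    if not isinstance(text, str):
        raise ValueError('The argument must be of type str, got %s' % type(text))
    words = re.findall(r'[A-Za-z]+', text)  # runs of ASCII letters
    return collections.Counter(words).most_common(count)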
Example No. 22
    def minor_earned(self) -> Counter[Player]:
        """ Determine how much money all players indirectly earned in this exercise.

        Returns:
            The amount of money that every player is indirectly responsible for.
        """
        earned = counter()
        for player, earning in it.product(self.alive, self.earned):
            earned[player] += earning.minor_earned(player)
        return earned
Example No. 23
    def update_filter(
        self,
        criteria: List[Criterion],
        operation: Operation,
        quantifier: str,
    ) -> None:
        """Update the selected programs and/or impart the associated taxa and/or mark them as hidden.

        Description:
            - If the operation is `"impart"`:
                - calculate the appropriate sets of programs and taxa;
                - remove these programs from `self.selected_programs`;
                - add these taxa to `self.imparted_knowledge`.
            - If the operation is `"hide"`:
                - calculate the appropriate sets of programs and taxa;
                - add these taxa to `self.hidden_taxa`.
            - Otherwise (the operation is either `"include"` or `"exclude"`):
                - calculate the appropriate bag of programs: this bag counts, for each program,
                    the number of criteria they meet (maximum: size of `criteria`);
                - if `quantifier` is `"all"`, remove from this bag all programs which do not
                    meet every criterion;
                - include or exclude the resulting programs. Note that the `"exclude"` operation
                    extends to the programs which import the resulting ones: if the user wants to
                    exclude a program, she obviously expects that the programs which require it
                    are excluded too.

        Args:
            criteria (List[Criterion]): A list of criteria, i.e., a mix of regular expression
                patterns (strings) and/or predicates (triples).
            operation (Operation): Either `"impart"`, `"hide"`, `"include"` or `"exclude"`.
            quantifier (str): Either `"any"` or `"all"`.
        """
        if operation in ("impart", "hide"):
            patterns = [str(criterion) for criterion in criteria]
            if operation == "impart":
                (program_set,
                 taxon_set) = self.programs_and_taxa_of_patterns(patterns)
                self.exclude_programs(program_set, follow=False)
                self.impart_taxa(taxon_set)
            else:
                (program_set,
                 taxon_set) = self.programs_or_taxa_of_patterns(patterns)
                self.hidden_programs.update(program_set)
                self.hidden_taxa.update(taxon_set)
        else:
            program_bag = self.programs_of_criteria(
                criteria, follow=(operation == "exclude"))
            if quantifier == "all":
                program_bag -= counter(
                    {program: len(criteria) - 1
                     for program in program_bag})
            if operation == "include":
                self.include_programs(set(program_bag))
            else:  # necessarily "exclude"
                self.exclude_programs(set(program_bag), follow=True)
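The `quantifier == "all"` branch relies on `Counter` subtraction dropping non-positive counts: after `len(criteria) - 1` is subtracted from every program in the bag, only the programs that met every criterion keep a positive count. A small standalone illustration with made-up program names:

from collections import Counter

criteria_count = 3
program_bag = Counter({"a.py": 3, "b.py": 2, "c.py": 1})
program_bag -= Counter({program: criteria_count - 1 for program in program_bag})
print(program_bag)  # Counter({'a.py': 1}) -- only a.py met all three criteria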
Example No. 24
def satisfiesF(L):
    """
    Assumes L is a list of strings
    Assume function f is already defined for you and it maps a string to a Boolean
    Mutates L such that it contains all of the strings, s, originally in L such
            that f(s) returns True, and no other elements. Remaining elements in L
            should be in the same order.
    Returns the length of L after mutation
    """
    L[:] = [s for s in L if f(s)]   # keep only the strings for which f is True, in place
    return len(L)
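With that fix, a quick check of the documented contract (using a stand-in predicate, since the real `f` is assumed to be defined elsewhere):

def f(s):               # stand-in predicate for illustration only
    return len(s) > 3

L = ['cat', 'horse', 'ox', 'sheep']
print(satisfiesF(L))    # 2
print(L)                # ['horse', 'sheep']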
Example No. 25
def main():
    n = int(input().rstrip())

    myset = list()

    for _ in range(0, n):
        myset.append(input().rstrip())

    print(len(set(myset)))
    qty = list(counter(myset).values())

    print(*qty, sep=" ")
Example No. 26
 def _filter_atomic_property(self, ionization_data, selected_atoms):
     ionization_data["atomic_number"] = ionization_data.index.labels[0] + 1
     ionization_data["ion_number"] = ionization_data.index.labels[1] + 1
     ionization_data = ionization_data[ionization_data.atomic_number.isin(selected_atoms)]
     ion_data_check = counter(ionization_data.atomic_number.values)
     keys = np.array(list(ion_data_check.keys()))
     values = np.array(list(ion_data_check.values()))
     if np.all(keys == values):
         return ionization_data
     else:
         raise IncompleteAtomicData(
             "ionization data for the ion (" + str(keys[keys != values]) + str(values[keys != values]) + ")"
         )
Example No. 27
 def _filter_atomic_property(self, ionization_data, selected_atoms):
     ionization_data['atomic_number'] = ionization_data.index.labels[0] + 1
     ionization_data['ion_number'] = ionization_data.index.labels[1] + 1
     ionization_data = ionization_data[ionization_data.atomic_number.isin(
         selected_atoms)]
     ion_data_check = counter(ionization_data.atomic_number.values)
     keys = np.array(list(ion_data_check.keys()))
     values = np.array(list(ion_data_check.values()))
     if np.all(keys == values):
         return ionization_data
     else:
         raise IncompleteAtomicData('ionization data for the ion (' +
                                    str(keys[keys != values]) +
                                    str(values[keys != values]) + ')')
Example No. 28
    def programs_of_criteria(self, criteria: List[Criterion],
                             follow: bool) -> Counter[ProgramName]:
        """Calculate the set of programs that meet at least one of the criteria.

        Description:
            Each criterion may be either:

            - a string, which will be interpreted either as:
                - a program name pattern (ending with `".py"`). All programs matching it are
                    accumulated in the result;
                - or a taxon name pattern. All programs featuring at least one taxon matching it
                    are accumulated in the result. If the operation is `"exclude"`, this set is
                    extended to the programs which import (either directly or by transitivity) at
                    least one of its members;
            - a triple consisting of a “subject” pattern, a predicate (positive or negative)
                and an “object” pattern. This predicate is normalized and, depending on its
                “sign”, evaluated on the patterns by either `ProgramFilter.programs_of_triple` or
                `ProgramFilter.programs_of_negated_triple`.

        Args:
            criteria (List[Criterion]): A list of criteria, i.e., a mix of regular expression
                patterns (strings) and/or predicates (triples).
            follow (bool): If `True`, extend the result with all the programs which import (either
                directly or by transitivity) at least one program meeting a criterion.

        Returns:
            Counter[ProgramName]: A bag (multiset) counting, for each resulting program, the number
                of criteria it meets.
        """
        resulting_programs: Counter[ProgramName] = counter()
        for criterion in criteria:
            if isinstance(criterion, str):  # the criterion is a pattern
                if criterion.endswith(
                        ".py"):  # the pattern is a program pattern
                    programs = self.programs_of_pattern(criterion)
                else:  # the pattern is a label pattern
                    taxa = self.taxa_of_pattern(criterion)
                    programs = self.programs_of_taxa(taxa, follow=follow)
                resulting_programs.update(programs)
            elif isinstance(criterion, (list, tuple)) and len(criterion) == 3:
                (pattern_1, raw_predicate, pattern_2) = criterion
                (predicate, negated) = normalize_predicate(raw_predicate)
                function = self.programs_of_negated_triple if negated else self.programs_of_triple
                resulting_programs.update(
                    function(pattern_1, predicate, pattern_2))
            else:
                print_warning(
                    f"criterion {repr(criterion)} cannot be included or excluded."
                )
        return resulting_programs
Example No. 29
def spacy_tokenize_content(content, source_lang, setting):
    if source_lang == 'en':
        _spacy = spacy.load('en_core_web_sm')
    elif source_lang == 'de':
        _spacy = spacy.load('de_core_news_sm')
    elif source_lang == 'fr':
        _spacy = spacy.load('fr_core_news_sm')
    elif source_lang == 'es':
        _spacy = spacy.load('es_core_news_sm')
    else:
        raise ValueError('unsupported source_lang: {}'.format(source_lang))

    spacy_content = _spacy(content)

    if setting == "to_words":
        token_set = [
            token.lemma_ for token in spacy_content if len(token) >= 3
        ]

        token_count = counter(token_set)

    elif setting == "to_sentences":
        token_set = [token.text for token in spacy_content.sents]
        token_count = counter(token_set)

    return token_count
Example No. 30
def stats_textt_en(textt_en):
    # Count how many times each English word occurs:
    # step 1: keep only English letters (and spaces), then split the text into a list
    # step 2: strip punctuation such as *-,.?! from each word
    # step 3: tally the word frequencies with collections.Counter
    result = re.sub("[^A-Za-z ]", " ", textt_en.strip())
    newList = [word.strip("*-,.?!") for word in result.split()]
    newList = [word for word in newList if word]  # drop entries left empty after stripping
    print("English word frequency counts:", collections.Counter(newList), "\n")
Example No. 31
def create_dictionary(clean_list):
    word_count = {}

    for word in clean_list:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

    for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
        print("%s : %s" % (key, value))

    c = counter(word_count)

    top = c.most_common(10)
    print(top)
Example No. 32
def majorityElement(nums):
    n = len(nums)
    # print(n)
    # x = nums.count(nums)
    myCounter = counter(nums)
    # print(myCounter.items())
    halfN = n // 2
    # print(halfN)
    majorElement = 0
    for key, value in myCounter.items():
        # print(key ,'->', value)
        if (value > halfN):
            majorElement = key

    # print('majorElement = ',majorElement)
    return majorElement
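Since `Counter.most_common` already orders entries by frequency, the same answer can be obtained in one line whenever a majority element is guaranteed to exist; a hypothetical one-liner using the same `counter` alias:

def majorityElement_short(nums):
    # the most frequent value, which equals the majority element when one exists
    return counter(nums).most_common(1)[0][0]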
Example No. 33
def parse(textfile, countfile):
    if os.path.isfile(countfile):
        print("{} exists".format(countfile))
        return

    print("Loading {}".format(textfile))
    with open(textfile, "r") as f:
        data = f.read()

    data = map(lambda c: (" ", c.lower())[int(c.isalpha())], data)
    data = "".join(list(data)).split()
    data = counter(data)

    print("Writing {}".format(countfile))
    with open(countfile, "w") as f:
        print("word,count", file=f)
        for word, count in data.items():
            print("{},{}".format(word, count), file=f)
Example No. 34
    def __fit_clusters(self, column: np.ndarray) -> List[float]:
        """ Fit the clusters for a given feature.

        Arguments:
            column (np.array): All the values for a single feature.

        Returns:
            The cluster centers for this feature.
        """
        column = np.sort(column)
        distinct_counter = counter(column)
        max_clusters = sum(min(count, self.__min_cluster_size) for count in distinct_counter.values()) // \
                       self.__min_cluster_size
        for num_clusters in range(max_clusters, 0, -1):
            clustering = KMeansConstrained(n_clusters = num_clusters, size_min = self.__min_cluster_size,
                                           random_state = self.__random_generator)
            clusters = clustering.fit_predict(column[:, np.newaxis])
            if self.__correct_clustering(column, clusters):
                return self.__cluster_centers(column, clusters)
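The `max_clusters` bound caps each distinct value's contribution at `__min_cluster_size`, so it estimates how many full clusters the column can possibly support. A standalone illustration with made-up counts and a minimum cluster size of 3:

from collections import Counter

min_cluster_size = 3
distinct_counter = Counter({1.0: 5, 2.0: 1, 3.0: 2})
max_clusters = sum(min(count, min_cluster_size)
                   for count in distinct_counter.values()) // min_cluster_size
print(max_clusters)  # (3 + 1 + 2) // 3 == 2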
Example No. 35
def train_classifier(classifier, cache, classifier_cache=None):
    """
    Train classifier with word2vec feature
    :param classifier: classifier
    :param cache: new cache path
    :param classifier_cache: path of classifier cache
    """
    if not classifier_cache:
        classifier_cache = get_classifier_cache(classifier)
    start_time = datetime.now()
    logger.info('Start training classifier: {}'.format(start_time))

    area_codes, feat = load_cache(cache,
                                  __CACHE_KEY_AREA_CODES__,
                                  __CACHE_KEY_FEATURE__)
    area_counter = counter(area_codes)
    logger.info('Training with {} samples'.format(len(area_codes)))
    logger.info('Area count: \n{}\n{}'.format(area_counter.keys(),
                                              area_counter.values()))

    encode_path = get_encoder_cache(len(area_counter.keys()))
    if os.path.exists(encode_path):
        label_encoder = pickle.load(open(encode_path, 'rb'))
        logger.info('load label_encoder from {}'.format(encode_path))
    else:
        label_encoder = LabelEncoder()
        label_encoder.fit(area_codes)
        pickle.dump(label_encoder, open(encode_path, 'wb'))
        logger.info('dump label encoder to {}'.format(encode_path))
    labels = label_encoder.transform(area_codes)

    classifier.fit(feat, labels)
    joblib.dump(classifier, classifier_cache)
    logger.info('classifier dump to: {}'.format(classifier_cache))

    end_time = datetime.now()
    logger.info('End training classifier: {}'.format(end_time))
    logger.info('Time elapsed: {}s'.format(
        (end_time - start_time).total_seconds()))
        for line in f:
            timeslot += 1
            line = line.strip()
            line = line.split(' ')
            if len(line) == 1:
                userId = line[0].split('-')[1]
                usersLocations[userId] = []
                runningLocation = None
            elif (line[1], line[2]) == runningLocation:
                pass
            else:
                runningLocation = (line[1], line[2])
                usersLocations[userId].append(runningLocation)

for k in usersLocations.keys():
    usersLocations[k] = counter(usersLocations[k])

############### Simulation Analysis Over ###############

print(list(usersLocations.keys())[0])
print(list(usersLocations.keys())[1])

user1Locations = []
with open('user1.txt', 'r') as f:
    l = f.readline()
    l = l.strip()
    l = l.split(' ')
    totalDays = 0
    print('User 1:')
    print(l[0])
    user1 = l[0]
from collections import Counter as counter

flexOptions = []
urban = []
urbansize = []
urbrur = []

with open('../../Data/NHTS/Unused_PERV2PUB.CSV', 'r') as f:
    header = f.readline()
    header = header.strip()
    header = header.split(',')
    #print header.index('FLEXTIME') # 40
    print(header.index('URBAN'))
    print(header.index('URBRUR'))
    print(header.index('URBANSIZE'))

    for line in f:
        line = line.strip()
        line = line.split(',')
        flexOptions.append(int(line[40]))
        urban.append(int(line[87]))
        urbansize.append(int(line[88]))
        urbrur.append(int(line[89]))

c = counter(flexOptions)
c1 = counter(urban)
c2 = counter(urbansize)
c3 = counter(urbrur)

urbanFlexOptions = [flexOptions[i] for i in range(len(flexOptions)) if urbrur[i]==1]
Example No. 38
def sets(player):
    values = [int(k[:-1]) for k in player]
    values_counter = sorted(counter(values).items(), key=lambda item: item[1])
    return values_counter
Example No. 39
from urllib.request import urlopen
from bs4 import BeautifulSoup
site = urlopen("https://www.google.co.uk/finance").read()
soup = BeautifulSoup(site, "html.parser")

text = soup.get_text()

from collections import Counter as counter
cnt = counter(text)

Example No. 40
    for train_id in train: #check it against the info for every training instance
        train_instance = train[train_id]
        train_label = train_instance[0]
        labels.add(train_label)
        train_counts = train_instance[1]
        train_features = set(train_counts.keys())
        shared = test_features.intersection(train_features)
        if similarity == 1: # Euclidean distance measure
            nearest.append((train_label, euclidian(train_counts, test_counts, train_features, test_features, shared, train_id, test_id)))
        else: # Cosine distance measure
            nearest.append((train_label, cosine(train_counts, test_counts, shared, train_id, test_id)))
    if similarity == 1: # If Euclidean
        nearest = sorted(nearest, key=itemgetter(1))[:int(k_val)]
    else: # If cosine
        nearest = sorted(nearest, key=itemgetter(1), reverse=True)[:int(k_val)]
    projected_label = counter(item[0] for item in nearest).most_common(1)[0][0]
    test_confusion_matrix[test_label][projected_label] += 1
    sys_output.write("".join(["test:", str(instance_number), "\t", test_label, "\t"]))
    instance_number += 1
    # print the labels and votes to the sys_output
    votes = defaultdict(int)
    for tup in nearest:
        votes[tup[0]] += 1
    for c in labels:
        sys_output.write("\t"+c+"\t"+str(votes[c]))
    sys_output.write("\n")

#print the confusion matrix
print_matrix(test_confusion_matrix)

Example No. 41
from decimal import *
import csv
from number import numberOf
from collections import Counter as counter

code=csv.reader(open('zipCodes.csv'),delimiter=',')
bor= csv.reader(open('boroughs.csv'),delimiter=',')
zipIncident= csv.reader(open('Incidents.csv'),delimiter=',')
next(zipIncident)
cities = [row[1] for row in zipIncident] 
freq=[]

for (x,y) in counter(cities).items():
	freq.append((x,y))
zip_pop= [] 
next(code)

for row in code:
	zip_pop.append ((row[1],row[10]))
zip_boroughs=[] 
next(bor)

for row in bor:
	zip_boroughs.append((row[0],row[1]))

#
incidents = dict(freq)
population = dict(zip_pop)
borough = dict(zip_boroughs)
            user2ObsLocationCount.append([])
        loc = (float(l[4]), float(l[5]))
        user2ObsLocationCount[-1].append(loc)
        user2ObsTripDistances.append(loc)
        runningDay = day
        runningTime = time
        totalDays += numdays
        if totalDays>=14:
            break

user2ObsLocationCount = [len(set(x)) for x in user2ObsLocationCount]
user2ObsTripDistances = getTripDistances(user2ObsTripDistances)

# Plots for Number of locations visited

c1_count = counter(user1ObsLocationCount)
total = sum(c1_count.values())
for k in c1_count:
    c1_count[k] = float(c1_count[k])/total

c1_simCount = counter(usersDailyLocationCount[user1])
total = sum(c1_simCount.values())
for k in c1_simCount:
    c1_simCount[k] = float(c1_simCount[k])/total

c2_count = counter(user2ObsLocationCount)
total = sum(c2_count.values())
for k in c2_count:
    c2_count[k] = float(c2_count[k])/total

c2_simCount = counter(usersDailyLocationCount[user2])
plt.xlabel('Time of day (hrs)')
plt.ylabel('Duration of activity (hrs)')
plt.title('"Other" activity characteristics indicated by CDR')
plt.savefig('3_Other')
plt.close()

# Categorize beginning timestamps of work in 10 minute windows and plot

divisor = 1.0/6
bucketedTimestamps = [None for x in workBeginTimestamps]

for i in range(len(workBeginTimestamps)):
    bucketedTimestamps[i] = workBeginTimestamps[i] - \
                             workBeginTimestamps[i]%divisor
    
c = counter(bucketedTimestamps)

x1 = c.keys()
y1 = c.values()

x = [a for (a,b) in sorted(zip(x1,y1))]
y = [b for (a,b) in sorted(zip(x1,y1))]
sumY = float(sum(y))
pOfY = [yy/sumY for yy in y]

plt.plot(x, pOfY)
plt.xlabel('Start Time [h]')
plt.ylabel('Frequency')
plt.xticks(range(0,25,6))
plt.xlim(0,24)
plt.savefig('StartTimeFrequency')
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

texts_final = []

for i in texts:
    for j in range(len(i) - 1):
        #print (i[j])
        texts_final.append(i[j] + '_' + i[j+1])
    #texts_final.append(text_temp)
texts_final = counter(texts_final)
   
dictWords = texts_final.most_common()
texts_final = pd.DataFrame(dictWords)
texts_final.to_csv('WordCount_gent.csv',encoding='utf-8')


# In[23]:

def char_ldamodel(reviewList):
    
    corpus = create_corpus(reviewList)

    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=num_passes)

    return ldamodel
plt.xlabel('Time of day (hrs)')
plt.ylabel('Duration of activity (hrs)')
plt.title('"Work" activity characteristics indicated by MASS')
plt.savefig('../../Figures/TSvsDur_MASS_Cont')
plt.close()


totalCount = 0
for c in counts:
    for x in c:
        totalCount+=x
        
with open('../../ProcessedData/TSvsDur_MASS.txt', 'w') as f:
    for c in counts:
        l = [str(x/totalCount) for x in c]
        l = ' '.join(l)
        f.write(l+'\n')


roundedStartTimes = [round(x) for x in arrivalTimes]
c = counter(roundedStartTimes)
total = len(arrivalTimes)
count24 = c.pop(24)
c[0] += count24

with open('../../ProcessedData/TSvsDurHrPerc_Mass.txt', 'w') as f:
    for k in sorted(c.keys()):
        f.write(str(k)+ ' ' + str(100*float(c[k])/total)+'\n')


    for line in f:
        line = line.strip()
        line = line.split(' ')
        if line[2]=='1':
            numWorkers+=1
        if line[9]=='1':
            numWorking+=1
            startTime = float(line[10])
            endTime = startTime+float(line[11])
            startTimes.append(startTime)
            endTimes.append(endTime)

startTimes = [int(x/6) for x in startTimes]
endTimes = [int(x/6) for x in endTimes]

c1 = counter(startTimes)
c2 = counter(endTimes)

x = range(24)
y = [0]

for t in x:
    working = y[-1]+c1[t]-c2[t]
    y.append(float(working))

y = y[1:]
y = [yy/numWorking for yy in y]
plt.plot(x,y)
plt.xlabel('Time of day')
plt.ylabel('Fraction of active workers')
plt.xlim(0,24)
Example No. 47
def isDrift(line):
    minStrains = 2
    return sorted(counter(line).items(), key=lambda x: x[1], reverse=True)[1][1] <= minStrains #this last value is the number of required strains + 1
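Concretely (with the `Counter as counter` alias in scope), the expression ranks strain counts in descending order and inspects the second one; note that it raises IndexError when a line contains only one distinct strain:

from collections import Counter as counter

line = ['A', 'A', 'A', 'B', 'B']
ranked = sorted(counter(line).items(), key=lambda x: x[1], reverse=True)
print(ranked)        # [('A', 3), ('B', 2)]
print(ranked[1][1])  # 2 -- the count that isDrift compares against minStrains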
Example No. 48
                    allWorkersWorkDays.extend(workDays)
                    workDays = []
                else:
                    continue

if workerBeingProcessed == 1:
    workerBeingProcessed = 0
    workersProcessed += 1
    worker = workers[workersProcessed]
    workDays = list(set(workDays))
    allWorkersWorkDays.extend(workDays)
    workDays = []

print(workersProcessed)

c = counter(allWorkersWorkDays)

k = [x for (x, y) in sorted(zip(c.keys(), c.values()))]
v = [y for (x, y) in sorted(zip(c.keys(), c.values()))]
d = [x.weekday() for x in k]
for i in range(len(d)):
    if d[i] == 0:
        d[i] = "M"
    elif d[i] == 1:
        d[i] = "T"
    elif d[i] == 2:
        d[i] = "W"
    elif d[i] == 3:
        d[i] = "T"
    elif d[i] == 4:
        d[i] = "F"
Example No. 49
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:    #opened as read only
        for line in f:
            if not line[0] == '>':
                genome += line.rstrip()
    return genome
genome = readGenome('lambda_virus.fa')

counts = {'A':0, 'C':0, 'G':0, 'T':0, 'N':0}
for base in genome:
    counts[base] +=1
print (counts)

import collections
collections.Counter(genome)

!wget --no-check https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()
            seq = fh.readline().rstrip()
            fh.readline()
            qual = fh.readline().rstrip()
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
Example No. 50
# initialize and load in data
grammar = defaultdict(list)
train = open(train).read().strip().split("\n")
sentences = open(sentences, 'r').read().strip().split("\n")

for s in train:
    for rule in Tree(s).productions():
        grammar[rule.lhs()].append(rule.rhs())

# create pcfg the key is the lhs of the rule,
# the value is a dictionary where the key is a tuple of the RHS
# and the value is the prob for that RHS
pcfg = defaultdict(dict)
for left in grammar:
    for k, v in counter(grammar[left]).most_common():
        pcfg[left][k] = v/float(len(grammar[left]))

inversePCFG = defaultdict(list)

for key in pcfg:
    for value in pcfg[key]:
        inversePCFG[value].append((key, pcfg[key][value]))

#output trained grammar
grammar_out = open(grammar_out, 'w')
for left in pcfg:
    for right in pcfg[left]:
        grammar_out.write(" ".join([str(left), "->", " ".join([str(item) for item in right]), "["+str(pcfg[left][right])+"]", "\n"]))
grammar_out.close()
Example No. 51
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

data = np.array(X)
db = DBSCAN(eps=0.25, min_samples=80).fit(data)
core_samples = db.core_sample_indices_
labels = db.labels_
numClusters = len(set(labels)) - (1 if -1 in labels else 0)
previousClusterCount = numClusters

clusters = [labels == k for k in range(numClusters)]

from collections import Counter as counter
print(counter(labels))

def getDbscanColor(label):
    if label=='2':
        return 'g'
    elif label=='0':
        return 'b'
    elif label=='1':
        return 'r'
    elif label=='-1':
        return 'k'
    elif label=='3':
        return 'orange'
    elif label=='4':
        return 'magenta'
        m = startTime%100
        h = startTime // 100  # integer division: hours part of an HHMM timestamp
        startTime = h + float(m)/60

        dwellTime = float(line[89])/60
        if dwellTime>=0 and startTime>0:    # Considering work longer than 2 hour
            dwellTimesHbo.append(dwellTime)
            startTimesHbo.append(startTime)

f.close()

startTimesNhb = [int(x) for x in startTimesNhb]
startTimesHbo = [int(x) for x in startTimesHbo]
startTimesHbw = [int(x) for x in startTimesHbw]

hourlyCountNhb = counter(startTimesNhb)
hourlyCountHbo = counter(startTimesHbo)
hourlyCountHbw = counter(startTimesHbw)

plt.xlim(0,24)
x1 = sorted(hourlyCountNhb.keys())
y1 = [hourlyCountNhb[x] for x in sorted(hourlyCountNhb.keys())]

x2 = sorted(hourlyCountHbo.keys())
y2 = [hourlyCountHbo[x] for x in sorted(hourlyCountHbo.keys())]

x3 = sorted(hourlyCountHbw.keys())
y3 = [hourlyCountHbw[x] for x in sorted(hourlyCountHbw.keys())]

plt.plot(x1, y1, color='b',  marker='o', label='NHB')
plt.plot(x2, y2, color='r',  marker='o', label='HBO')
Example No. 53
for t in tractInfo:
    for i in range(len(timeIntervals)):
        if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
            occupancy[i] += 1
        if t[0] == "h":
            if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
                occupancyHome[i] += 1
        elif t[0] == "w":
            if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
                occupancyWork[i] += 1
        elif t[0] == "o":
            if t[1] <= timeIntervals[i][0] and t[2] >= timeIntervals[i][1]:
                occupancyOther[i] += 1

occupantType = [t[0] for t in tractInfo]
c = counter(occupantType)

xCoords = [t[0] for t in timeIntervals]
plt.plot(xCoords, occupancy, color="b", label="Total")
plt.plot(xCoords, occupancyHome, color="r", label="Home")
plt.plot(xCoords, occupancyWork, color="g", label="Work")
plt.plot(xCoords, occupancyOther, color="orange", label="Other")

plt.legend(loc="lower left", prop={"size": 11})
plt.xlabel("Time of day (hours)")
plt.ylabel("Tract Occupancy")
plt.title("Occupancy profile for tract " + TRACT)

plt.savefig(TRACT)
plt.close()
        [c, l] = tractColorAndLabel(tractInfo[tract])#Info[tract]/maxExpFactor
        x = tractShape[tract][0]
        y = tractShape[tract][1]
        plt.fill(x,y,color=str(c), label=l if l not in addedLabels else '')
        found+=1
        plottedExpFactors.append(tractInfo[tract])
        if l not in addedLabels:
            addedLabels.append(l)
    except:
        notFound+=1

plt.legend(loc='lower left', prop={'size':11})
plt.savefig('censusTractExpansionFactors', dpi=500)
plt.close()

expansionFactors = tractInfo.values()
#int5ExpansionFactors = [x-x%5 for x in expansionFactors if x<250]
int5ExpansionFactors = [x-x%5 for x in plottedExpFactors if x<250 and x>0]
c = counter(int5ExpansionFactors)
k = c.keys()
v = c.values()
x = [X for (X,Y) in sorted(zip(k,v))]
y = [Y for (X,Y) in sorted(zip(k,v))]
total = sum(y)
y = [float(a)/total for a in y]
plt.plot(x, y, marker='o')
plt.xlabel('Expansion factor, f')
plt.ylabel('P(f)')
plt.savefig('pdfExpansionFactors')
plt.close()