def build_feature(df):
    """Build the feature frame for ``df``, adding a ``word_shares`` column.

    Every word returned by ``get_words(df)`` is counted and each count is
    turned into a weight via ``get_weight``.  The weights are then passed,
    row by row, to ``word_shares`` and the results are stored in a new
    ``word_shares`` column.  Note that ``df`` itself is modified in place by
    that assignment.

    Returns:
        The result of ``get_feature(df)`` with missing values replaced by 0.
    """
    word_counts = Counter(get_words(df))

    weights = {}
    for word, count in word_counts.items():
        weights[word] = get_weight(count)

    def _row_shares(row):
        # Bind the precomputed weights so ``apply`` only has to pass the row.
        return word_shares(row, weights)

    df['word_shares'] = df.apply(_row_shares, axis=1)
    return get_feature(df).fillna(0)
def getIndividualSupport(dataDictionary, totalTransactions, threshold=2):
    """Return the support of every item that occurs at least ``threshold`` times.

    Args:
        dataDictionary: Iterable of sortable, hashable items (one entry per
            occurrence).
        totalTransactions: Denominator used to turn an occurrence count into
            a support value.
        threshold: Minimum number of occurrences an item needs to be kept.

    Returns:
        A list of ``(item, count / totalTransactions)`` tuples, ordered by
        item (the input is sorted before counting, and ``Counter`` keeps
        first-seen order).

    Raises:
        ZeroDivisionError: If ``totalTransactions`` is 0 and at least one
            item reaches the threshold.
    """
    counts = Counter(sorted(dataDictionary))
    # A single filter pass is enough; the original re-checked the threshold
    # on an already filtered list.
    return [
        (item, count / totalTransactions)
        for item, count in counts.items()
        if count >= threshold
    ]
def numJewelsInStones(self, jewels: str, stones: str) -> int:
    """Return how many characters of ``stones`` appear in ``jewels``.

    Each character of ``jewels`` contributes the number of times it occurs
    in ``stones``; a character repeated in ``jewels`` is counted once per
    repetition.
    """
    stone_counts = Counter(stones)
    # Looking up a missing key on a Counter yields 0, so no membership test
    # is needed.
    return sum(stone_counts[jewel] for jewel in jewels)
def count_distances(coords: List[Coord], treshold: int) -> int:
    """Count grid locations whose summed distance to ``coords`` is below ``treshold``.

    A grid is built from ``coords``; for every location of that grid the
    distances to all coordinates are added up.  Locations only enter the
    tally when at least one coordinate exists, so an empty ``coords`` yields 0.

    Returns:
        The number of tallied locations whose total is strictly less than
        ``treshold``.
    """
    grid = build_grid(coords)

    distance_sums = Counter()
    for location in grid.all_locations():
        for target in coords:
            distance_sums[location] += location.distance(target)

    return sum(1 for total in distance_sums.values() if total < treshold)
def __post_init__(self):
    """Reject duplicate item types that are not allowed to repeat.

    The ``type.value`` of every entry in ``self.items`` is counted.  Unless
    ``self.top_level`` is truthy, any value that occurs more than once and
    is not listed in ``PLURAL_KEYS`` is treated as a conflict.

    Raises:
        KeyError: For the first conflicting key encountered.
    """
    occurrences = Counter(item.type.value for item in self.items)
    for key, count in occurrences.items():
        # Skip everything that is explicitly permitted to repeat.
        if self.top_level or count <= 1 or key in PLURAL_KEYS:
            continue
        raise KeyError(
            f'Key "{key}" already exists in tree and would overwrite the '
            "existing value."
        )
def _used_properties_with_type(
        self, property_type: PropertyType) -> Counter[PropertyIdentifier]:
    """Return the usage counts of filter properties of one property type.

    ``self.properties_used_in_filter`` maps ``(name, type, group_type_index)``
    triples to counts; only the entries whose middle element equals
    ``property_type`` are copied into the returned ``Counter``.
    """
    selected = Counter()
    for (name, kind, group_type_index), count in self.properties_used_in_filter.items():
        if kind == property_type:
            # Rebuild the key as a plain tuple, exactly as the mapping is keyed.
            selected[(name, kind, group_type_index)] = count
    return selected
def __init__(self, parallelism: int = PARALLELISM):
    """Set up the empty bookkeeping containers of the instance.

    Args:
        parallelism: Stored unchanged on ``self.parallelism``; defaults to
            the module-level ``PARALLELISM`` constant.
    """
    super().__init__()
    self.parallelism: int = parallelism

    # All containers start out empty and are keyed by TaskInstanceKey.
    self.queued_tasks: OrderedDict[TaskInstanceKey, QueuedTaskInstanceType] = (
        OrderedDict()
    )
    self.running: Set[TaskInstanceKey] = set()
    self.event_buffer: Dict[TaskInstanceKey, EventBufferValueType] = {}
    self.attempts: Counter[TaskInstanceKey] = Counter()
def customSortString_counter(self, order: str, s: str) -> str:
    """Reorder the characters of ``s`` to follow the ranking given by ``order``.

    Characters listed in ``order`` come first, in that order and with their
    full multiplicity from ``s``.  All remaining characters of ``s`` follow,
    grouped by character in first-occurrence order.
    """
    remaining = Counter(s)
    pieces = []
    for ch in order:
        if ch in remaining:
            # pop() both reads the multiplicity and removes the character so
            # it is not emitted again with the leftovers.
            pieces.append(ch * remaining.pop(ch))
    pieces.extend(ch * count for ch, count in remaining.items())
    return "".join(pieces)
def modify_ner_data(data: str):
    """Rewrite a NER corpus, blanking entity spans that do not match the parse tree.

    Reads ``./data/onto/parsing_char/<data>.corpus`` (trees) and
    ``./data/onto/ner_char/<data>.corpus`` (sentences and gold BIO tags),
    then overwrites the NER file in place.  Every entity span is checked
    with ``tree.ner_match``:

    * ``0`` - no match: the tags of the span are replaced by ``'O'``;
    * ``1`` - match;
    * ``2`` - the third accepted outcome, tallied as ``conti``.

    Finally the three totals, the no-match ratio and the per-label counters
    are printed.

    Args:
        data: Corpus name without the ``.corpus`` suffix.

    Raises:
        AssertionError: If trees and sentences, or tokens and tags, differ
            in length.
        SystemExit: If ``tree.ner_match`` returns an unexpected value.
    """
    root_dir = './data/onto'
    dataset = data + '.corpus'
    ner_path = os.path.join(root_dir, 'ner_char', dataset)

    match_counter, conti_counter, not_match_counter = Counter(), Counter(), Counter()
    match, conti_match, not_match = 0, 0, 0

    trees = load_trees(os.path.join(root_dir, 'parsing_char', dataset))
    ner_snts, ner_golds = load_data_from_file(ner_path)
    assert len(trees) == len(ner_snts)

    # NOTE: the input file is truncated here; everything needed from it has
    # already been loaded above.
    with open(ner_path, 'w', encoding='utf-8') as ner_writer:
        for snt, ner_gold, tree in zip(ner_snts, ner_golds, trees):
            snt, ner_gold = snt.split(), ner_gold.split()
            assert len(list(tree.leaves())) == len(snt)

            for span in _bio_tag_to_spans(ner_gold):
                label = span[0]
                start, end = span[1][0], span[1][1]
                match_type = tree.ner_match(start, end, False, False)
                if match_type == 0:
                    not_match += 1
                    # Drop the entity: overwrite its tags with 'O'.
                    for i in range(start, end):
                        ner_gold[i] = 'O'
                    not_match_counter.update([label])
                elif match_type == 1:
                    match_counter.update([label])
                    match += 1
                elif match_type == 2:
                    conti_counter.update([label])
                    conti_match += 1
                else:
                    print('error')
                    # Same effect as exit(-1) without depending on the
                    # site-provided ``exit`` helper.
                    raise SystemExit(-1)

            # Emit the (possibly modified) sentence, one "char<TAB>tag" per
            # line, followed by a blank separator line.
            assert len(snt) == len(ner_gold)
            for char, ner in zip(snt, ner_gold):
                ner_writer.write(char+'\t'+ner+'\n')
            ner_writer.write('\n')

    total_spans = not_match + match + conti_match
    # Guard against a corpus without any entity span, which used to raise
    # ZeroDivisionError after the file had already been rewritten.
    not_match_ratio = not_match / total_spans if total_spans else 0.0
    print(match, conti_match, not_match, not_match_ratio)
    print(match_counter)
    print(conti_counter)
    print(not_match_counter)
def minSetSize_counter_and_sort(self, arr: List[int]) -> int:
    """Return how many distinct values must be removed to halve ``arr``.

    Values are removed greedily, most frequent first, until at least
    ``len(arr) // 2`` elements are gone.  An empty ``arr`` yields 0.
    """
    still_needed = len(arr) // 2
    removed_values = 0
    for frequency in sorted(Counter(arr).values(), reverse=True):
        still_needed -= frequency
        removed_values += 1
        if still_needed <= 0:
            return removed_values
    return removed_values
def most_prolific_automaker(year: str) -> str:
    """Return the automaker with the most entries in ``DATA`` for ``year``.

    Ties are resolved in favour of the automaker encountered first.

    Raises:
        ValueError: If ``DATA`` holds no entry for ``year``.
    """
    frequencies = Counter(
        item.get("automaker") for item in DATA if item.get("year") == year
    )
    # Unpacking a one-element list keeps the ValueError on an empty result.
    [(automaker, _count)] = frequencies.most_common(1)
    return automaker
def customSortString1(self, S: str, T: str) -> str:
    """Return the characters of ``T`` rearranged to follow the order in ``S``.

    Characters of ``T`` that occur in ``S`` are emitted first, following the
    order of ``S``; the rest are appended afterwards, grouped by character
    in first-occurrence order.
    """
    remaining = Counter(T)
    chunks = []
    for ch in S:
        # Take the whole multiplicity (0 when absent from T) so the
        # character is not emitted a second time below.
        chunks.append(ch * remaining.pop(ch, 0))
    chunks.extend(ch * count for ch, count in remaining.items())
    return ''.join(chunks)
def main():
    """Print the number of distinct permutations of the input, then list them.

    The string comes from ``str_r()``.  The count is ``len(st)!`` divided by
    the factorial of every character multiplicity; it is written on its own
    line, followed by all distinct permutations in sorted order, one per
    line (no trailing newline).
    """
    st = str_r()

    total = factorial(len(st))
    # Only the multiplicities matter; dividing by 1! for unique characters
    # is a no-op, so no special case is needed.
    for multiplicity in Counter(st).values():
        total //= factorial(multiplicity)
    outStr("{}\n".format(total))

    # A set comprehension removes duplicates without first materialising a
    # list of all len(st)! strings.
    distinct = {"".join(p) for p in permutations(st)}
    outStr("\n".join(sorted(distinct)))
def minimumHammingDistance(self, source: List[int], target: List[int], allowedSwaps: List[List[int]]) -> int:
    """Return the smallest Hamming distance reachable via the allowed swaps.

    Indices connected through ``allowedSwaps`` are merged with a
    ``UnionFind``.  Within each group the source and target values are
    compared as multisets; every source value left over after the multiset
    difference counts as one mismatch.
    """
    uf = UnionFind(len(source))
    for swap in allowedSwaps:
        uf.union(swap[0], swap[1])

    # Per-group multisets of the values found in source and in target.
    source_values = defaultdict(Counter)
    target_values = defaultdict(Counter)
    for index, (src, tgt) in enumerate(zip(source, target)):
        root = uf.find(index)
        source_values[root][src] += 1
        target_values[root][tgt] += 1

    mismatches = 0
    for root, in_source in source_values.items():
        # Counter subtraction keeps positive counts only.
        mismatches += sum((in_source - target_values[root]).values())
    return mismatches
def haskell_loc(directory: Path) -> Counter[str]:
    """Accumulate the line counts of the Haskell sources below ``directory``.

    Every ``src/**/*.hs`` file that ``should_exclude_file`` does not reject
    is read and passed, as a list of lines, to ``count_haskell_loc``; the
    per-file counters are summed.

    Returns:
        The combined counter.  Because ``+=`` on a ``Counter`` discards
        non-positive entries, only positive totals are kept.
    """
    total_count: Counter[str] = Counter()
    for filepath in directory.glob("src/**/*.hs"):
        if not should_exclude_file(filepath):
            with filepath.open("r") as source:
                lines = source.readlines()
            total_count += count_haskell_loc(lines)
    return total_count
def minWindow(self, s: str, t: str) -> str:
    """Return the shortest substring of ``s`` containing every character of ``t``.

    Multiplicities count: a character occurring twice in ``t`` must occur
    at least twice in the window.  When several windows share the minimum
    length, the leftmost one is returned.

    Returns:
        The minimal window, or ``''`` when no window exists or ``t`` is empty.
    """
    # With an empty ``t`` every window qualifies, so the shrink loop below
    # would run past the end of ``s``; the empty window is the answer.
    if not t:
        return ''

    required = Counter(t)
    window = Counter()
    # Integer sentinel: one longer than any real window, meaning "not found".
    best_start, best_end = 0, len(s) + 1
    left = 0
    for right, char in enumerate(s, 1):
        window[char] += 1
        # ``&`` keeps the minimum of both counts, so the result equals
        # ``required`` exactly when the window covers all of ``t``.
        while window & required == required:
            if right - left < best_end - best_start:
                best_start, best_end = left, right
            window[s[left]] -= 1
            left += 1
    return s[best_start:best_end] if best_end - best_start <= len(s) else ''
def leastInterval(self, tasks: List[str], n: int) -> int:
    """Return the minimum number of time units needed to run all ``tasks``.

    Two executions of the same task must be separated by at least ``n``
    units; idle units are inserted where no other task is available.

    The schedule is built in rounds of ``n + 1`` slots: each round runs the
    most frequent remaining tasks once, and any unused slot of a round that
    is not the last one is counted as idle time.

    Args:
        tasks: Task names; any hashable value works, including names longer
            than one character.
        n: Cooldown between equal tasks.  Negative values are treated as 0.

    Returns:
        The total number of time units, 0 for an empty task list.
    """
    remaining = Counter(tasks)
    round_size = max(n, 0) + 1
    result = 0
    while True:
        scheduled = 0
        for task, _ in remaining.most_common(round_size):
            scheduled += 1
            result += 1
            # Decrement by key: ``subtract(task)`` would iterate over the
            # characters of the task name instead.
            remaining[task] -= 1
        # Adding an empty Counter drops every entry that reached zero.
        remaining += Counter()
        if not remaining:
            break
        # Pad the unfinished round with idle slots.
        result += round_size - scheduled
    return result
def __create_probability_table(symbol_list):
    """Map every distinct symbol to its relative frequency as a ``Decimal``.

    Args:
        symbol_list: Non-empty iterable of hashable symbols.

    Returns:
        A dict ``{symbol: Decimal(count) / total}`` in first-occurrence order.

    Raises:
        AssertionError: If ``symbol_list`` is empty.
    """
    counts = Counter(symbol_list)
    total = sum(counts.values())
    assert total != 0

    table = {}
    for symbol, freq in counts.items():
        table[symbol] = Decimal(freq) / total
    return table
def genome_stat(top: str, genomes: list, fna_file: str, output=False) -> None:
    """Collect contig statistics for several genomes and optionally report them.

    For every genome the file ``<top>/<genome>/<fna_file>`` is scanned:
    lines starting with ``>`` are contig headers, all other lines are
    sequence data.  A genome is reported as ``failed`` (and skipped) when it
    has no header at all or when a header occurs more than once.

    Args:
        top: Directory containing one sub-directory per genome.
        genomes: Names of the genome sub-directories.
        fna_file: File name to read inside each genome directory.
        output: When true, print a summary per genome (sorted by average
            contig length, longest first) and write it, together with the
            result of ``stat_by_len``, to ``genome_stat.txt``.
    """
    genome_stat_lis = []
    for genome in genomes:
        path = os.path.join(top, genome, fna_file)
        headers = []
        header_len_lis = []
        big_str_len = 0.0
        with open(path) as f:
            header_len = 0
            for line in f:
                if line.startswith('>'):
                    # A new header closes the previous contig.
                    header_len_lis.append(header_len)
                    header_len = 0
                    headers.append(line.split(' ')[0])
                else:
                    seq_len = len(line.replace('\n', ''))
                    big_str_len += seq_len
                    header_len += seq_len
        # Close the final contig as well; it used to be left out, so the
        # length list was one entry short.
        header_len_lis.append(header_len)
        # The first entry only covers whatever preceded the first header.
        del header_len_lis[0]

        if not headers:
            # Nothing to average over; previously this crashed.
            print(genome + ': failed!')
            continue

        # Contig names must be unique within a genome.
        duplicated = [name for name, seen in Counter(headers).items() if seen > 1]
        if duplicated:
            print(genome + ': failed!')
            continue

        avg_len = big_str_len / len(headers)
        genome_stat_lis.append((genome, len(headers), avg_len, header_len_lis))

    genome_stat_lis.sort(key=lambda x: x[2], reverse=True)

    if output:
        len_lis = [1000, 2000, 5000, 10000, 20000, 50000]
        with open('genome_stat.txt', 'w') as out:
            for genome, contigs, avg_len, header_len_lis in genome_stat_lis:
                len_dict = stat_by_len(len_lis, header_len_lis)
                summary = (genome + "\tcontigs: " + str(contigs) +
                           "\tavg_len: " + str(avg_len))
                print(summary)
                out.write(summary + str(len_dict) + '\n')
def next_state_(state: State) -> State:
    """Advance a set of active points by one generation.

    For every point in ``state`` each of its neighbours (all offsets in
    {-1, 0, 1} per dimension, except the all-zero offset) receives one vote.
    A point is part of the result when it collected exactly three votes, or
    exactly two votes while already being in ``state``.
    """
    votes: Counter[Tuple[int, ...]] = Counter()
    for point in state:
        for delta in product([-1, 0, 1], repeat=len(point)):
            if any(delta):  # the zero offset is the point itself
                neighbor = tuple(coord + step for coord, step in zip(point, delta))
                votes[neighbor] += 1

    following = set()
    for candidate, tally in votes.items():
        if tally == 3 or (tally == 2 and candidate in state):
            following.add(candidate)
    return following
def gather_stats_good(
        n: int, samples: int = 1000,
        summary: Optional[Counter[int]] = None) -> Counter[int]:
    """Tally the totals of ``samples`` throws of ``n`` six-sided dice.

    Args:
        n: Number of dice summed per throw.
        samples: Number of throws to simulate.
        summary: Existing tally to extend in place; a fresh ``Counter`` is
            created when omitted, so no state is shared between calls.

    Returns:
        The tally that was updated (``summary`` itself when one was given).
    """
    tally = Counter() if summary is None else summary
    for _ in range(samples):
        throw_total = sum(randint(1, 6) for _ in range(n))
        tally[throw_total] += 1
    return tally
def _commandline_discovery_on_host(
    host_name: HostName,
    ipaddress: Optional[HostAddress],
    parsed_sections_broker: ParsedSectionsBroker,
    run_plugin_names: Container[CheckPluginName],
    only_new: bool,
    *,
    load_labels: bool,
    only_host_labels: bool,
    on_error: OnError,
) -> None:
    """Run host-label and service discovery for one host and report the result.

    Host labels are analysed (and saved) first.  Unless ``only_host_labels``
    is set, services are analysed next, the services reported as present are
    written to the autochecks file, the number of new services per check
    plugin is printed verbosely, and finally every parsing error detail is
    emitted as a warning.
    """
    # Wording used in the summaries when nothing was found.
    nothing_found = "no new" if only_new else "no"

    section.section_step("Analyse discovered host labels")
    host_labels = analyse_node_labels(
        host_name=host_name,
        ipaddress=ipaddress,
        parsed_sections_broker=parsed_sections_broker,
        load_labels=load_labels,
        save_labels=True,
        on_error=on_error,
    )
    if host_labels.new:
        label_summary = len(host_labels.new)
    else:
        label_summary = nothing_found
    section.section_success(f"Found {label_summary} host labels")

    if only_host_labels:
        return

    section.section_step("Analyse discovered services")
    service_result = analyse_discovered_services(
        host_name=host_name,
        ipaddress=ipaddress,
        parsed_sections_broker=parsed_sections_broker,
        run_plugin_names=run_plugin_names,
        only_new=only_new,
        on_error=on_error,
    )

    # TODO (mo): for the labels the corresponding code is in _host_labels.
    # We should put the persisting in one place.
    autochecks.save_autochecks_file(host_name, service_result.present)

    new_per_plugin = Counter(s.check_plugin_name for s in service_result.new)
    for plugin_name, plugin_count in sorted(new_per_plugin.items()):
        console.verbose("%s%3d%s %s\n" %
                        (tty.green + tty.bold, plugin_count, tty.normal, plugin_name))

    if service_result.new:
        service_summary = len(service_result.new)
    else:
        service_summary = nothing_found
    section.section_success(f"Found {service_summary} services")

    parsing_result = check_parsing_errors(parsed_sections_broker.parsing_errors())
    for detail in parsing_result.details:
        console.warning(detail)
def topKFrequent(self, nums: List[int], k: int) -> List[int]:
    """Return the ``k`` most frequent values of ``nums``, most frequent first.

    Values with equal frequency keep their first-occurrence order.

    Raises:
        IndexError: If ``k`` exceeds the number of distinct values.
    """
    ranked = Counter(nums).most_common()
    # Index explicitly so that asking for too many values still fails loudly.
    return [ranked[position][0] for position in range(k)]
def frequencySort(self, s: str) -> str:
    """Return ``s`` with its characters grouped and sorted by falling frequency.

    Characters with the same frequency keep their first-occurrence order.
    """
    # Aggregate the characters with a Counter, most frequent first.
    ranked = Counter(s).most_common()
    # Repeat every character by its count and join the pieces.
    return ''.join(ch * cnt for ch, cnt in ranked)
def calculate_template_score(template):
    """Return the score of an element template.

    The score is the count of the most frequent element minus the count of
    the least frequent one.

    Raises:
        ValueError: If ``template`` is empty.
    """
    occurrences = Counter(template).values()
    return max(occurrences) - min(occurrences)
def numberOfArithmeticSlices(self, A):
    """Count the arithmetic subsequences of ``A`` with at least three elements.

    Dynamic programming, O(n^2) time and space: ``ending_at[i][d]`` holds
    the number of subsequences of length >= 2 that end at index ``i`` with
    common difference ``d``.
    """
    size = len(A)
    ending_at = [Counter() for _ in A]
    total = 0
    for i, current in enumerate(A):
        here = ending_at[i]
        for j in range(i):
            diff = current - A[j]
            # Extend everything ending at j, plus the new pair (j, i).
            gained = ending_at[j][diff] + 1
            here[diff] += gained
            total += gained
    # Every index pair was counted once as a length-2 sequence; remove them.
    return total - size * (size - 1) // 2
def __init__(self, bot):
    """Set up configuration defaults, in-memory counters and the background task.

    Args:
        bot: Stored on ``self.bot``; its event loop is used to schedule
            ``self.bg_loop()``.
    """
    self.bot = bot

    self.config = Config.get_conf(self, 1398467138476, force_registration=True)
    # Defaults registered at global scope.
    self.config.register_global(
        globaldata=Counter(),
        guilddata={},
        automated=Counter(),
    )

    self.cache = {"guild": {}, "session": Counter(), "automated": Counter()}
    self.session = Counter()
    self.session_time = datetime.datetime.utcnow()

    # Scheduled last, once all state read by the loop may already exist.
    self.bg_loop_task = self.bot.loop.create_task(self.bg_loop())
def fit(self, values):
    """Build the frequency tables for ``values``.

    Four counters are stored on the instance: raw trigrams, trigrams after
    ``str2regex``, raw values, and values after ``str2regex``.
    ``self.func2counter`` maps each of the four scoring functions to the
    counter it reads from.
    """
    # Flatten the trigrams of all values into a single list.
    ngrams = ["".join(gram) for val in values for gram in xngrams(val, 3)]

    self.trigram_counter = Counter(ngrams)
    self.sym_trigram_counter = Counter([str2regex(gram, False) for gram in ngrams])

    self.val_counter = Counter(values)
    self.sym_val_counter = Counter([str2regex(val, False) for val in values])

    self.func2counter = {
        val_trigrams: self.trigram_counter,
        sym_trigrams: self.sym_trigram_counter,
        value_freq: self.val_counter,
        sym_value_freq: self.sym_val_counter,
    }
def counting(apps: List[str], runningapps: List[str],
             samples: int = 6, interval: float = 1) -> Counter:
    """Sample ``runningapps`` repeatedly and count how often each app was seen.

    ``runningapps`` is re-read on every sample, so changes made to the list
    between samples are picked up.

    Args:
        apps: Unused; kept for backward compatibility.
        runningapps: The list that is sampled.
        samples: Number of samples to take (defaults to the previously
            hard-coded 6).
        interval: Seconds to wait between two samples (defaults to the
            previously hard-coded 1).

    Returns:
        A ``Counter`` mapping each app to the number of times it was seen.
    """
    seen = Counter()
    for taken in range(1, samples + 1):
        seen.update(runningapps)
        # No need to wait after the last sample.
        if taken < samples:
            sleep(interval)
    return seen
def main():
    """Demonstrate basic ``Counter`` operations on two small class rosters.

    Prints, in order: a single count, the roster size before and after
    merging the second roster, the three most common names after the merge
    and after subtracting the second roster again, and the intersection of
    both counters.
    """
    first_roster = ["Bob", "Jams", "Jams"]
    second_roster = ["Bill", "Jams", "Joe"]
    first = Counter(first_roster)
    second = Counter(second_roster)

    print(first["Jams"])
    print(sum(first.values()), "in clas 1")

    # Merge the second roster in, then take it out again.
    first.update(second_roster)
    print(sum(first.values()), "in clas 1")
    print(first.most_common(3))
    first.subtract(second_roster)
    print(first.most_common(3))

    # Intersection: minimum of both counts, positive entries only.
    print(first & second)