def dedup_percentile(Counter):
    """Remove UMIs with counts lower than 1% of the mean count.

    :param Counter: mapping of UMI -> observed count (dict-like; the
        parameter name shadows ``collections.Counter`` in this file).
    :return: number of UMIs whose count is strictly above mean/100.
    """
    # Guard the empty case explicitly: np.mean([]) would emit a RuntimeWarning
    # and yield NaN (the original still returned 0 via the NaN comparison).
    if not Counter:
        return 0
    threshold = np.mean(list(Counter.values())) / 100
    # Count survivors directly instead of materialising a list just for len().
    return sum(1 for count in Counter.values() if count > threshold)
def birthdayCakeCandles(ar):
    """Return how many candles are the tallest.

    Uses collections.Counter: keys are candle heights, values are how many
    candles have that height.

    :param ar: non-empty iterable of candle heights.
    :return: count of candles whose height equals the maximum height.
    """
    d = Counter(ar)
    # max over the keys is the tallest height; the original spelled this as
    # the unbound-method call max(Counter.keys(d)), which is just max(d).
    tallest = max(d)
    print(tallest)  # debug trace preserved from the original
    # The key is guaranteed present, so plain indexing replaces d.get().
    return d[tallest]
def dedup_adj(Counter, mismatches=1):
    """Adjacency-based UMI deduplication (the umi_tools "adjacency" method).

    :param Counter: mapping of UMI string -> observed count (dict-like; the
        parameter name shadows ``collections.Counter`` in this file).
    :param mismatches: maximum edit distance for two UMIs to be connected.
    :return: total number of parent UMIs after collapsing each connected
        component to a minimal covering set of its highest-count nodes.

    Relies on module-level helpers ``edit_dist`` and ``breadth_first_search``
    that are defined elsewhere in this file.
    """

    def get_adj_list_adjacency(umis):
        # Each UMI links to every UMI within `mismatches` edits -- including
        # itself, assuming edit_dist(u, u) == 0 (confirm against edit_dist).
        return {
            umi: [umi2 for umi2 in umis if edit_dist(umi, umi2) <= mismatches]
            for umi in umis
        }

    def get_connected_components_adjacency(graph, Counter):
        # BFS from the highest-count unvisited node; each BFS tree is one
        # connected component.
        found = list()
        components = list()
        for node in sorted(graph, key=lambda x: Counter[x], reverse=True):
            if node not in found:
                component = breadth_first_search(node, graph)
                found.extend(component)
                components.append(component)
        return components

    def remove_umis(adj_list, cluster, nodes):
        '''removes the specified nodes from the cluster and returns
        the remaining nodes '''
        # list incomprehension: for x in nodes: for node in adj_list[x]: yield node
        # NOTE(review): `cluster - nodes_to_remove` implies cluster is a set,
        # i.e. breadth_first_search returns a set -- confirm.
        nodes_to_remove = set([node for x in nodes for node in adj_list[x]] + nodes)
        return cluster - nodes_to_remove

    def get_best_adjacency(cluster, adj_list, counts):
        # Pick the smallest prefix of the count-sorted nodes whose combined
        # neighbourhoods cover the whole cluster.
        if len(cluster) == 1:
            return list(cluster)
        sorted_nodes = sorted(cluster, key=lambda x: counts[x], reverse=True)
        for i in range(len(sorted_nodes) - 1):
            if len(remove_umis(adj_list, cluster, sorted_nodes[:i + 1])) == 0:
                return sorted_nodes[:i + 1]
        # NOTE(review): if no prefix covers the cluster this returns None and
        # reduce_clusters_adjacency would crash on len(None). Self-edges in
        # the adjacency list normally guarantee coverage -- confirm.

    def reduce_clusters_adjacency(adj_list, clusters, counts):
        # TS - the "adjacency" variant of this function requires an adjacency
        # list to identify the best umi, whereas the other variants don't
        # As temporary solution, pass adj_list to all variants
        n = 0
        for cluster in clusters:
            parent_umis = get_best_adjacency(cluster, adj_list, counts)
            n += len(parent_umis)
        return n

    adj_list = get_adj_list_adjacency(list(Counter.keys()))
    clusters = get_connected_components_adjacency(adj_list, Counter)
    count = reduce_clusters_adjacency(adj_list, clusters, Counter)
    return count
def dedup_cluster(Counter, mismatches=1):
    """Cluster-based UMI deduplication.

    :param Counter: mapping of UMI string -> observed count (dict-like; the
        parameter name shadows ``collections.Counter`` in this file).
    :param mismatches: maximum edit distance for two UMIs to be connected.
    :return: number of connected components among the UMIs, i.e. the number
        of clusters after grouping near-identical UMIs.

    Relies on module-level helpers ``edit_dist`` and ``breadth_first_search``.
    """

    def _adjacency(umis):
        # Link every UMI to all UMIs within the allowed edit distance.
        return {u: [v for v in umis if edit_dist(u, v) <= mismatches] for u in umis}

    def _components(graph, counts):
        # BFS from the highest-count unvisited node; each traversal yields
        # one connected component.
        visited = list()
        comps = list()
        for node in sorted(graph, key=lambda u: counts[u], reverse=True):
            if node in visited:
                continue
            comp = breadth_first_search(node, graph)
            visited.extend(comp)
            comps.append(comp)
        return comps

    graph = _adjacency(list(Counter.keys()))
    return len(_components(graph, Counter))
def dedup_dir_adj(Counter, mismatches=1):
    """Directional-adjacency UMI deduplication (umi_tools "directional").

    :param Counter: mapping of UMI string -> observed count (dict-like; the
        parameter name shadows ``collections.Counter`` in this file).
    :param mismatches: maximum edit distance for an edge to exist.
    :return: number of connected components, i.e. inferred true UMIs.

    Relies on module-level helpers ``edit_dist`` and ``breadth_first_search``.
    """

    def get_adj_list_directional_adjacency(umis, counts):
        # Directed edge umi -> umi2 when the UMIs are within `mismatches`
        # edits AND count(umi) >= 2*count(umi2) - 1 (the umi_tools
        # directional rule: a parent must clearly dominate its child).
        return {
            umi: [
                umi2 for umi2 in umis
                if edit_dist(umi, umi2) <= mismatches
                and counts[umi] >= (counts[umi2] * 2) - 1
            ]
            for umi in umis
        }

    def get_connected_components_adjacency(graph, Counter):
        # BFS from the highest-count unvisited node; each BFS tree is one
        # connected component.
        found = list()
        components = list()
        for node in sorted(graph, key=lambda x: Counter[x], reverse=True):
            if node not in found:
                component = breadth_first_search(node, graph)
                found.extend(component)
                components.append(component)
        return components

    adj_list = get_adj_list_directional_adjacency(list(Counter.keys()), Counter)
    clusters = get_connected_components_adjacency(adj_list, Counter)
    # Each component collapses to a single parent UMI, so the answer is just
    # the number of components.  The original looped over the clusters adding
    # 1 per cluster and carried an unused `remove_umis` helper -- both
    # replaced by this direct, behaviour-identical len().
    return len(clusters)
def plot_data(d: collections.Counter, figure_path: pathlib.Path) -> None:
    """
    Plots a line plot for the given Counter object and saves the figure
    to a file.

    :param d: Counter object of timestamps to plot
    :param figure_path: Path of file to save the plot in.
    """
    # Timestamps on the x axis, their tweet counts on the y axis.
    timestamps = list(d.keys())
    tweet_counts = list(d.values())
    plt.plot(timestamps, tweet_counts)
    plt.xlabel("Date")
    plt.ylabel("Tweets received")
    # Render x-axis ticks as human-readable date/time labels.
    x_axis = plt.gca().xaxis
    x_axis.set_major_formatter(md.DateFormatter("%Y-%m-%d %H:%M"))
    plt.savefig(figure_path)
    plt.show()
def dtree2(tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-5, branch=[],
           f=None, val=None, opt=None, encode=True):
    """ Discrete independent variables """
    # Recursively builds a decision tree over a pandas DataFrame `tbl` using
    # discrete feature values as split points.  Returns a `Thing` node whose
    # .kids are the subtrees.  Relies on module-level `Thing`, `fWeight`,
    # `np` and `pd` defined elsewhere in the file.
    #
    # NOTE(review): `branch=[]` is a mutable default argument; it is only
    # ever read and recombined via `branch + [...]` below, so it is not
    # actually mutated -- but it is still a lint trap worth fixing upstream.
    # NOTE(review): the `klass` parameter is shadowed by the local `klass`
    # assignment below; the parameter value is never used -- confirm intent.
    if not opt:
        # Default hyper-parameters when the caller supplies none.
        opt = Thing(min=1, maxLvL=10, infoPrune=1, klass=-1, prune=True,
                    debug=True, verbose=True)
    features = fWeight(tbl)  # features ranked by fWeight (best first)
    # if encode==True:
    #     encode(tbl, features, opt=opt)
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)
    if opt.prune and lvl < 0:
        # At the root, keep only the top infoPrune fraction of features.
        features = fWeight(tbl)[:int(len(features) * opt.infoPrune)]
    name = features.pop(0)  # split on the best remaining feature
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass:]].values
    N = len(klass)
    here.score = np.mean(klass, axis=0)  # node score = mean class value(s)
    # splits = discretize(feature, klass, discrete=True)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        # NOTE(review): defined but never called in this function.
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = [LO, HI]  # children are built only for the extreme values
    # set_trace()
    # Stopping criteria: max depth, node already pure enough, or no features.
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs < 0.1:
        return here
    if len(features) < 1:
        return here

    def rows():
        # Yield (sub-frame, span) for each cutoff value: rows whose feature
        # value equals the span, projected onto the remaining columns.
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if f == span:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    # NOTE(review): `ent` is never used, and as written it indexes the
    # *class* `Counter` (Counter[n]) rather than an instance -- it would
    # fail if called.  Left untouched as dead code.
    ent = lambda x: sum([
        -Counter[n] / len(x) * np.log(Counter[n] / len(x))
        for n in Counter.keys()
    ])
    sdv = lambda x: np.mean(np.var(x, axis=0))  # mean variance of columns
    for child, span in rows():
        # set_trace()
        n = child.shape[0]
        toBe = sdv(child[child.columns[opt.klass]])  # child impurity
        if opt.min <= n < N:
            # Recurse only on children that are smaller than the parent but
            # not below the minimum split size.
            here.kids += [
                dtree2(child, lvl=lvl + 1, asIs=toBe, up=here,
                       branch=branch + [(name, span)], f=name,
                       val=(span, span), opt=opt, encode=False)
            ]
    return here
def dedup_unique(Counter):
    """Count all unique UMIs.

    :param Counter: mapping of UMI -> observed count (dict-like; the
        parameter name shadows ``collections.Counter`` in this file).
    :return: number of distinct UMIs.
    """
    # len(d) is the idiomatic (and equivalent) form of len(d.keys()).
    return len(Counter)
# NOTE(review): Python 2 script fragment (print-statement syntax), with its
# indentation reconstructed from a flattened source.  It populates a nested
# dict Counter[ixp][CC][IPversion] -> list, keyed by IXP, country code and
# IP version, from rows of `data`.  Relies on module-level names `data`,
# `IXP_collector` and `Counter` (a plain dict here, not collections.Counter)
# that are defined outside this chunk.
i = 0
if len(data) > 0:
    while (i < len(data)):
        # Row layout inferred from the indices used below:
        # [prefix, route_collector, IPversion, CC] -- confirm against the
        # producer of `data`.
        row = data[i]
        prefix = row[0]
        IPversion = row[2]
        CC = row[3]
        route_collector_extracted = row[1]
        for ixp in IXP_collector.keys():  #delete de 2 for the complete continent
            if ixp not in Counter.keys():
                Counter[ixp] = {}
            for route_collector in IXP_collector[ixp]:  #delete de 2 for the complete continent
                print ixp, route_collector  # debug trace
                if str(route_collector) == str(route_collector_extracted):
                    if CC not in Counter[ixp].keys():
                        Counter[ixp][CC] = {}
                    if IPversion not in Counter[ixp][CC].keys():
                        Counter[ixp][CC][IPversion] = []
        # NOTE(review): no `i += 1` or prefix-append is visible in this
        # chunk -- the loop body appears truncated here; see the parallel
        # loop later in the file for the likely continuation.
def birthdayCakeCandles(n, ar):
    """Return how many candles are the tallest.

    :param n: unused; retained for interface compatibility with callers
        that pass the list length (HackerRank signature).
    :param ar: non-empty iterable of candle heights.
    :return: count of candles whose height equals the maximum height.
    """
    d = Counter(ar)
    # max over the keys is the tallest height; the original spelled this as
    # the unbound-method call max(Counter.keys(d)), which is just max(d).
    tallest = max(d)
    # The key is guaranteed present, so plain indexing replaces d.get().
    return d[tallest]
# NOTE(review): Python 2 script fragment; a variant of the earlier loop but
# keyed Counter[CC][IPversion] (no per-IXP level), collecting unique
# prefixes per country/IP-version.  Indentation reconstructed from a
# flattened source.  In particular the final `else` must pair with a
# table-existence check that lies outside this chunk; it is rendered as a
# loop `else` here only to keep the fragment parseable -- confirm against
# the full file.  Relies on module-level `data`, `IXP_collector`, `Counter`,
# `window`, `year`, `month` and `i` defined outside this chunk.
while (i < len(data)):
    # Row layout inferred from the indices used below:
    # [prefix, route_collector, IPversion, CC].
    row = data[i]
    prefix = row[0]
    IPversion = row[2]
    CC = row[3]
    route_collector_extracted = row[1]
    for ixp in IXP_collector.keys():  #delete de 2 for the complete continent
        for route_collector in IXP_collector[ixp]:  #delete de 2 for the complete continent
            print ixp, route_collector  # debug trace
            if str(route_collector) == str(route_collector_extracted):
                if CC not in Counter.keys():
                    Counter[CC] = {}
                if IPversion not in Counter[CC].keys():
                    Counter[CC][IPversion] = []
                if prefix not in Counter[CC][IPversion]:
                    Counter[CC][IPversion].append(prefix)
            else:
                print 'RouteCollector ', route_collector, 'does not appear in table Data__'+str(int(window[0]))+"_"+str(int(window[1]))
    i += 1
else:
    print "Table doesn't exist: Data__"+str(year)+"_"+str(month)