Ejemplo n.º 1
0
def dedup_percentile(Counter):
    """
    Count the UMIs whose count is above 1% of the mean count.

    :param Counter: mapping of UMI -> number of occurrences
    :return: number of UMIs whose count is strictly greater than mean / 100
    """
    counts = list(Counter.values())
    cutoff = np.mean(counts) / 100
    # One pass over the values; each surviving UMI contributes 1.
    return sum(1 for count in counts if count > cutoff)
Ejemplo n.º 2
0
def birthdayCakeCandles(ar):
    """
    Return how many elements of ``ar`` are equal to its maximum.

    :param ar: iterable of comparable, hashable values (candle heights)
    :return: number of occurrences of the largest value; 0 when ``ar``
        is empty
    """
    # Counter maps each distinct value to its number of occurrences.
    d = Counter(ar)
    if not d:
        # max() raises ValueError on an empty collection.
        return 0
    # Iterating a Counter yields its keys, so max(d) is the largest value.
    return d[max(d)]
Ejemplo n.º 3
0
def dedup_adj(Counter, mismatches=1):
    """
    Count UMIs with the "adjacency" method.

    Two UMIs are linked when ``edit_dist`` between them is at most
    ``mismatches``. Components are obtained by calling
    ``breadth_first_search`` from each not-yet-assigned UMI, most frequent
    first. For every component, the shortest prefix of its UMIs (ordered
    by decreasing count) that, together with its neighbours, covers the
    whole component is kept.

    :param Counter: mapping of UMI -> number of occurrences
    :param mismatches: largest edit distance at which two UMIs are linked
    :return: total number of UMIs kept over all components
    """

    def get_adj_list_adjacency(umis):
        # Each UMI is compared against every UMI in ``umis``.
        return {
            umi: [umi2 for umi2 in umis if edit_dist(umi, umi2) <= mismatches]
            for umi in umis
        }

    def get_connected_components_adjacency(graph, counts):
        # A set keeps the "already assigned" check O(1) per node.
        found = set()
        components = []
        for node in sorted(graph, key=lambda x: counts[x], reverse=True):
            if node not in found:
                component = breadth_first_search(node, graph)
                found.update(component)
                components.append(component)
        return components

    def remove_umis(adj_list, cluster, nodes):
        '''Return the members of ``cluster`` that are neither in ``nodes``
        nor adjacent to any of them.'''
        nodes_to_remove = set(nodes)
        for x in nodes:
            nodes_to_remove.update(adj_list[x])
        return cluster - nodes_to_remove

    def get_best_adjacency(cluster, adj_list, counts):
        if len(cluster) == 1:
            return list(cluster)
        sorted_nodes = sorted(cluster, key=lambda x: counts[x], reverse=True)
        # Grow the prefix of most frequent UMIs until nothing in the
        # cluster is left uncovered.
        for size in range(1, len(sorted_nodes)):
            if not remove_umis(adj_list, cluster, sorted_nodes[:size]):
                return sorted_nodes[:size]
        # No proper prefix covers the cluster: keep every UMI rather than
        # falling through with None, which the caller cannot take len() of.
        return sorted_nodes

    def reduce_clusters_adjacency(adj_list, clusters, counts):
        # TS - the "adjacency" variant needs the adjacency list to pick the
        # best UMIs, whereas the other variants don't.
        return sum(
            len(get_best_adjacency(cluster, adj_list, counts))
            for cluster in clusters)

    adj_list = get_adj_list_adjacency(list(Counter))
    clusters = get_connected_components_adjacency(adj_list, Counter)
    return reduce_clusters_adjacency(adj_list, clusters, Counter)
Ejemplo n.º 4
0
def dedup_cluster(Counter, mismatches=1):
    """
    Count UMIs with the "cluster" method.

    Two UMIs are linked when ``edit_dist`` between them is at most
    ``mismatches``. Components are obtained by calling
    ``breadth_first_search`` from each not-yet-assigned UMI, most frequent
    first; each component counts as one UMI.

    :param Counter: mapping of UMI -> number of occurrences
    :param mismatches: largest edit distance at which two UMIs are linked
    :return: number of components found
    """

    def get_adj_list_cluster(umis):
        # Each UMI is compared against every UMI in ``umis``.
        return {
            umi: [umi2 for umi2 in umis if edit_dist(umi, umi2) <= mismatches]
            for umi in umis
        }

    def get_connected_components_cluster(graph, counts):
        # A set keeps the "already assigned" check O(1) per node.
        found = set()
        components = []
        for node in sorted(graph, key=lambda x: counts[x], reverse=True):
            if node not in found:
                component = breadth_first_search(node, graph)
                found.update(component)
                components.append(component)
        return components

    adj_list = get_adj_list_cluster(list(Counter))
    clusters = get_connected_components_cluster(adj_list, Counter)
    return len(clusters)
Ejemplo n.º 5
0
def dedup_dir_adj(Counter, mismatches=1):
    """
    Count UMIs with the "directional adjacency" method.

    ``umi`` is linked to ``umi2`` when ``edit_dist`` between them is at
    most ``mismatches`` and ``counts[umi] >= 2 * counts[umi2] - 1``.
    Components are obtained by calling ``breadth_first_search`` from each
    not-yet-assigned UMI, most frequent first; each component counts as
    one UMI.

    :param Counter: mapping of UMI -> number of occurrences
    :param mismatches: largest edit distance at which two UMIs are linked
    :return: number of components found
    """

    def get_adj_list_directional_adjacency(umis, counts):
        # Edges are directed: they only run from a UMI to UMIs whose count
        # satisfies the 2n - 1 rule relative to it.
        return {
            umi: [
                umi2 for umi2 in umis if edit_dist(umi, umi2) <= mismatches
                and counts[umi] >= (counts[umi2] * 2) - 1
            ]
            for umi in umis
        }

    def get_connected_components_adjacency(graph, counts):
        # A set keeps the "already assigned" check O(1) per node.
        found = set()
        components = []
        for node in sorted(graph, key=lambda x: counts[x], reverse=True):
            if node not in found:
                component = breadth_first_search(node, graph)
                found.update(component)
                components.append(component)
        return components

    def reduce_clusters_directional_adjacency(adj_list, clusters, counts):
        # Every component contributes exactly one UMI; adj_list and counts
        # are accepted only to mirror the other variants' signature.
        return len(clusters)

    adj_list = get_adj_list_directional_adjacency(list(Counter), Counter)
    clusters = get_connected_components_adjacency(adj_list, Counter)
    return reduce_clusters_directional_adjacency(adj_list, clusters, Counter)
Ejemplo n.º 6
0
def plot_data(d: collections.Counter, figure_path: pathlib.Path) -> None:
    """
    Draw the contents of a Counter as a line plot, write the figure to
    disk and then display it.

    :param d:
        Counter whose keys are plotted on the x axis and whose counts are
        plotted on the y axis
    :param figure_path:
        Path of the file the figure is saved to.
    """
    timestamps, tweet_counts = list(d), list(d.values())
    plt.plot(timestamps, tweet_counts)

    plt.xlabel("Date")
    plt.ylabel("Tweets received")

    # Format the x-axis tick labels as dates with minute resolution.
    axes = plt.gca()
    date_format = md.DateFormatter("%Y-%m-%d %H:%M")
    axes.xaxis.set_major_formatter(date_format)

    # Save before show().
    plt.savefig(figure_path)
    plt.show()
0
def dtree2(tbl,
           rows=None,
           lvl=-1,
           asIs=10**32,
           up=None,
           klass=-5,
           branch=[],
           f=None,
           val=None,
           opt=None,
           encode=True):
    """
    Recursively build a decision tree over discrete independent variables.

    Each call creates one node (a ``Thing``), takes the first feature
    returned by ``fWeight(tbl)`` as the split column, and recurses into the
    subsets of rows whose value for that feature equals the column's
    minimum or maximum.

    :param tbl: table to split; it is indexed like a pandas DataFrame
        (``tbl[name]``, ``tbl.columns``) -- presumably a DataFrame, confirm
        against callers
    :param rows: stored unchanged on the node as ``rows``
    :param lvl: depth of this node; defaults to -1 and children get
        ``lvl + 1``
    :param asIs: value handed down by the parent (the parent's ``toBe`` for
        this subset); the node is returned without children when it is
        below 0.1
    :param up: parent node, stored on the node as ``up``
    :param klass: overwritten before it is read; the class column is taken
        from ``opt.klass`` instead
    :param branch: list of ``(feature, value)`` pairs leading to this node.
        NOTE(review): mutable default argument; this function never mutates
        it (children receive ``branch + [...]``), but the shared default
        list is stored on the node
    :param f: name of the feature the parent split on
    :param val: ``(span, span)`` pair for the parent's split value
    :param opt: settings object; the fields read here are ``min``,
        ``maxLvL``, ``infoPrune``, ``klass`` and ``prune``. A default is
        built when ``opt`` is falsy
    :param encode: not read by the active code (its only use is commented
        out)
    :return: the node built for ``tbl``; child nodes are in ``kids``
    """
    if not opt:
        opt = Thing(min=1,
                    maxLvL=10,
                    infoPrune=1,
                    klass=-1,
                    prune=True,
                    debug=True,
                    verbose=True)

    # fWeight is defined elsewhere; presumably it returns the feature names
    # ordered by weight (a list, since .pop(0) is used below) -- TODO confirm.
    features = fWeight(tbl)
    # if encode==True:
    #     encode(tbl, features, opt=opt)
    here = Thing(t=tbl,
                 kids=[],
                 f=f,
                 val=val,
                 up=up,
                 lvl=lvl,
                 rows=rows,
                 modes={},
                 branch=branch)

    # Only at the root (lvl < 0): keep the leading infoPrune fraction of
    # the features.
    if opt.prune and lvl < 0:
        features = fWeight(tbl)[:int(len(features) * opt.infoPrune)]

    # The first feature becomes the split column; `features` now holds the
    # ones left over for the children.
    name = features.pop(0)
    # Left-over feature columns plus the single class column.
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    # Note the slice: every column from opt.klass onward, not just one.
    klass = tbl[tbl.columns[opt.klass:]].values
    N = len(klass)
    # Column-wise mean of the class values.
    here.score = np.mean(klass, axis=0)
    # splits = discretize(feature, klass, discrete=True)
    LO, HI = min(feature), max(feature)

    # NOTE(review): `pairs` is not called anywhere in this function.
    def pairs(lst):
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    # Only the smallest and largest feature values are used as split values.
    cutoffs = [LO, HI]

    # set_trace()
    # Stopping rules; they run after here.score is set, so a node returned
    # here still carries a score.
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs < 0.1:
        return here
    if len(features) < 1:
        return here

    # Shadows the `rows` parameter, which has already been stored on `here`.
    # Yields (sub-table, split value) for each cutoff; the loop variable `f`
    # is local to this generator.
    def rows():
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if f == span:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    # NOTE(review): `ent` is never called, and it indexes the name `Counter`
    # directly instead of counting `x` -- looks unfinished.
    ent = lambda x: sum([
        -Counter[n] / len(x) * np.log(Counter[n] / len(x))
        for n in Counter.keys()
    ])
    # Mean of the variance taken along axis 0.
    sdv = lambda x: np.mean(np.var(x, axis=0))

    for child, span in rows():
        # set_trace()
        n = child.shape[0]
        toBe = sdv(child[child.columns[opt.klass]])
        # Recurse only when the subset has at least opt.min rows and is
        # strictly smaller than the current table.
        if opt.min <= n < N:
            here.kids += [
                dtree2(child,
                       lvl=lvl + 1,
                       asIs=toBe,
                       up=here,
                       branch=branch + [(name, span)],
                       f=name,
                       val=(span, span),
                       opt=opt,
                       encode=False)
            ]

    return here
Ejemplo n.º 8
0
def dedup_unique(Counter):
    """
    Return the number of distinct UMIs, regardless of their counts.

    :param Counter: mapping of UMI -> number of occurrences
    """
    distinct_umis = Counter.keys()
    return len(distinct_umis)
            i = 0
            if len(data) > 0:

                while (i < len(data)):

                    row = data[i]
                    prefix = row[0]
                    IPversion = row[2]
                    CC = row[3]
                    route_collector_extracted = row[1]

                    for ixp in IXP_collector.keys(
                    ):  #delete de 2 for the complete continent

                        if ixp not in Counter.keys():
                            Counter[ixp] = {}

                        for route_collector in IXP_collector[
                                ixp]:  #delete de 2 for the complete continent
                            print ixp, route_collector

                            if str(route_collector) == str(
                                    route_collector_extracted):

                                if CC not in Counter[ixp].keys():
                                    Counter[ixp][CC] = {}

                                if IPversion not in Counter[ixp][CC].keys():
                                    Counter[ixp][CC][IPversion] = []
def birthdayCakeCandles(n, ar):
    """
    Return how many elements of ``ar`` are equal to its maximum.

    :param n: not read by this function (presumably ``len(ar)``; kept for
        the caller's signature)
    :param ar: iterable of comparable, hashable values (candle heights)
    :return: number of occurrences of the largest value; 0 when ``ar``
        is empty
    """
    # Counter maps each distinct value to its number of occurrences.
    d = Counter(ar)
    if not d:
        # max() raises ValueError on an empty collection.
        return 0
    # Iterating a Counter yields its keys, so max(d) is the largest value.
    return d[max(d)]
Ejemplo n.º 11
0
                while (i<len(data)):
                    
                    row = data[i]
                    prefix = row[0]
                    IPversion = row[2]
                    CC = row[3]
                    route_collector_extracted = row[1]
                    
                    for ixp in IXP_collector.keys(): #delete de 2 for the complete continent
                        
                        for route_collector in IXP_collector[ixp]:  #delete de 2 for the complete continent
                            print ixp, route_collector
                
                            if str(route_collector) == str(route_collector_extracted):
                                
                                if CC not in Counter.keys():
                                    Counter[CC] = {}
                                
                                if IPversion not in Counter[CC].keys():
                                    Counter[CC][IPversion] = []
                                
                                if prefix not in Counter[CC][IPversion]:
                                    Counter[CC][IPversion].append(prefix)
                                        
                            else:
                                print 'RouteCollector ', route_collector, 'does not appear in  table Data__'+str(int(window[0]))+"_"+str(int(window[1]))

                    i += 1

        else:
            print "Table doesn't exist: Data__"+str(year)+"_"+str(month)