def _get_adj_list_directional(self, umis, counts): ''' identify all umis within the hamming distance threshold and where the counts of the first umi is > (2 * second umi counts)-1''' adj_list = {umi: [] for umi in umis} if self.fuzzy_match: for umi1 in umis: # we need a second regex for some insertions, # e.g UMI1 = "ATCG", UMI2 = "ATTC" comp_regex_err = regex.compile("(%s){e<=1}" % str(umi1)) comp_regex_del = regex.compile("(%s){i<=1}" % str(umi1)[::-1]) for umi2 in umis: if umi1 == umi2: continue if counts[umi1] >= (counts[umi2] * self.dir_threshold): if (max(len(umi1), len(umi2)) - min(len(umi1), len(umi2))) > 1: continue if (comp_regex_err.match(str(umi2)) or comp_regex_del.match(str(umi2))): adj_list[umi1].append(umi2) else: for umi1, umi2 in itertools.combinations(umis, 2): if edit_distance(umi1, umi2) <= 1: if counts[umi1] >= (counts[umi2] * 2) - 1: adj_list[umi1].append(umi2) if counts[umi2] >= (counts[umi1] * 2) - 1: adj_list[umi2].append(umi1) return adj_list
def get_average_umi_distance(umis):
    ''' return the mean pairwise edit_distance over all UMIs in umis.

    With fewer than two UMIs there are no pairs to compare, so -1 is
    returned as a sentinel. An empty input previously fell through to
    the division below and raised ZeroDivisionError.
    '''
    if len(umis) < 2:
        return -1

    dists = [edit_distance(x, y)
             for x, y in itertools.combinations(umis, 2)]

    return float(sum(dists)) / len(dists)
def _get_adj_list_adjacency(self, umis, counts, threshold): ''' identify all umis within hamming distance threshold''' return {umi: [umi2 for umi2 in umis if edit_distance(umi.encode('utf-8'), umi2.encode('utf-8')) <= threshold] for umi in umis}
def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    ''' Find the mappings between true and false cell barcodes based
    on an edit distance threshold.

    Any cell barcode within the threshold to more than one whitelist
    barcode will be excluded. Barcodes that are themselves whitelisted
    are never reported as errors.

    Returns a collections.defaultdict(set) mapping each whitelist
    barcode (as str) to the set of cell barcodes assigned to it.
    '''
    true_to_false = collections.defaultdict(set)

    # compare as bytes, as done for every edit_distance call below
    whitelist = {str(x).encode("utf-8") for x in whitelist}

    for cell_barcode in cell_barcodes:
        barcode_in_bytes = str(cell_barcode).encode("utf-8")

        # don't check if whitelisted; this test does not depend on the
        # whitelist entry, so do it once rather than once per entry
        if barcode_in_bytes in whitelist:
            continue

        match = None
        for white_cell in whitelist:
            if edit_distance(barcode_in_bytes, white_cell) <= threshold:
                if match is not None:
                    # second whitelist hit: ambiguous, so discard
                    match = None
                    break
                match = white_cell.decode("utf-8")

        if match is not None:
            true_to_false[match].add(cell_barcode)

    return true_to_false
def _get_adj_list_directional(self, umis, counts): ''' identify all umis within the hamming distance threshold and where the counts of the first umi is > (2 * second umi counts)-1''' adj_list = {umi: [] for umi in umis} if self.fuzzy_match: for umi1 in umis: # we need a second regex for some insertions, # e.g UMI1 = "ATCG", UMI2 = "ATTC" comp_regex_err = regex.compile("(%s){e<=1}" % str(umi1)) comp_regex_del = regex.compile("(%s){i<=1}" % str(umi1)[::-1]) for umi2 in umis: if umi1 == umi2: continue if counts[umi1] >= (counts[umi2]*self.dir_threshold): if (max(len(umi1), len(umi2)) - min(len(umi1), len(umi2))) > 1: continue if (comp_regex_err.match(str(umi2)) or comp_regex_del.match(str(umi2))): adj_list[umi1].append(umi2) else: for umi1, umi2 in itertools.combinations(umis, 2): if edit_distance(umi1, umi2) <= 1: if counts[umi1] >= (counts[umi2]*2)-1: adj_list[umi1].append(umi2) if counts[umi2] >= (counts[umi1]*2)-1: adj_list[umi2].append(umi1) return adj_list
def _get_adj_list_directional_adjacency(self, umis, counts, threshold): ''' identify all umis within the hamming distance threshold and where the counts of the first umi is > (2 * second umi counts)-1''' return {umi: [umi2 for umi2 in umis if edit_distance(umi.encode('utf-8'), umi2.encode('utf-8')) == 1 and counts[umi] >= (counts[umi2]*2)-1] for umi in umis}
def get_average_umi_distance(umis):
    ''' return the mean pairwise edit_distance over all UMIs in umis,
    computed on the utf-8 encoded strings.

    With fewer than two UMIs there are no pairs to compare, so -1 is
    returned as a sentinel. An empty input previously fell through to
    the division below and raised ZeroDivisionError.
    '''
    if len(umis) < 2:
        return -1

    dists = [edit_distance(x.encode('utf-8'), y.encode('utf-8'))
             for x, y in itertools.combinations(umis, 2)]

    return float(sum(dists)) / len(dists)
def _get_adj_list_directional(self, umis, counts, threshold=1): ''' identify all umis within the hamming distance threshold and where the counts of the first umi is > (2 * second umi counts)-1''' return {umi: [umi2 for umi2 in umis if edit_distance(umi.encode('utf-8'), umi2.encode('utf-8')) == threshold and counts[umi] >= (counts[umi2]*2)-1] for umi in umis}
def _get_adj_list_adjacency(self, umis, counts, threshold): ''' identify all umis within hamming distance threshold''' adj_list = {umi: [] for umi in umis} for umi1, umi2 in itertools.combinations(umis, 2): if edit_distance(umi1, umi2) <= threshold: adj_list[umi1].append(umi2) adj_list[umi2].append(umi1) return adj_list
def _get_adj_list_directional(self, umis, counts, threshold=1): ''' identify all umis within the hamming distance threshold and where the counts of the first umi is > (2 * second umi counts)-1''' adj_list = {umi: [] for umi in umis} for umi1, umi2 in itertools.combinations(umis, 2): if edit_distance(umi1, umi2) <= threshold: if counts[umi1] >= (counts[umi2] * 2) - 1: adj_list[umi1].append(umi2) if counts[umi2] >= (counts[umi1] * 2) - 1: adj_list[umi2].append(umi1) return adj_list
def _get_adj_list_adjacency(self, umis, counts, threshold): ''' identify all umis within hamming distance threshold''' adj_list = {umi: [] for umi in umis} if len(umis) > 25: umi_length = len(umis[0]) substr_idx = build_substr_idx(umis, umi_length, threshold) iter_umi_pairs = iter_nearest_neighbours(umis, substr_idx) else: iter_umi_pairs = itertools.combinations(umis, 2) for umi1, umi2 in iter_umi_pairs: if edit_distance(umi1, umi2) <= threshold: adj_list[umi1].append(umi2) adj_list[umi2].append(umi1) return adj_list
def _get_adj_list_directional(self, umis, counts, threshold=1): ''' identify all umis within the hamming distance threshold and where the counts of the first umi is > (2 * second umi counts)-1''' adj_list = {umi: [] for umi in umis} if len(umis) > 25: umi_length = len(umis[0]) substr_idx = build_substr_idx(umis, umi_length, threshold) iter_umi_pairs = iter_nearest_neighbours(umis, substr_idx) else: iter_umi_pairs = itertools.combinations(umis, 2) for umi1, umi2 in iter_umi_pairs: if edit_distance(umi1, umi2) <= threshold: if counts[umi1] >= (counts[umi2]*2)-1: adj_list[umi1].append(umi2) if counts[umi2] >= (counts[umi1]*2)-1: adj_list[umi2].append(umi1) return adj_list