def all_median(A):
    # Encode each integer sequence as a UTF-16 string (code points 0..2^16)
    # so the C Levenshtein routines can process them (Python 2: unichr).
    all_u = [''.join(unichr(c) for c in seq) for seq in A]
    # Pairwise edit-distance partitioning, then split into forward/reverse sets.
    p = partion_fwd_rev(all_u, 1)
    fwd_u, rev_u = split(all_u, p[1])
    # Greedy approximate medians: combined (reverse strings flipped), fwd, rev.
    all_m = Levenshtein.median(fwd_u + [s[::-1] for s in rev_u])
    fwd_m = Levenshtein.median(fwd_u)
    rev_m = Levenshtein.median(rev_u)
    # Decode the median strings back into integer sequences.
    all_med = [ord(c) for c in all_m]
    fwd_med = [ord(c) for c in fwd_m]
    rev_med = [ord(c) for c in rev_m]
    return all_med, fwd_med, rev_med
def all_median(A, p):
    """Greedy medians of integer sequences: overall, forward partition ('>')
    and reverse partition ('<'), split according to the r matrix in p['r']."""
    def encode(seqs):
        # UTF-16 encoding (code points 0..2^16) for the C median routine.
        return [''.join(unichr(c) for c in seq) for seq in seqs]

    all_u = encode(A)
    fwd_i, rev_i = split(A, p['r'])  # split from the r matrix
    fwd_u = encode(fwd_i)
    rev_u = encode(rev_i)
    print("all median")
    all_m = lv.median(all_u)
    print("fwd median")
    fwd_m = lv.median(fwd_u)
    print("rev median")
    rev_m = lv.median(rev_u)
    print('fwd==rev? %s' % (fwd_m == rev_m))
    # Decode back to integer sequences; '>' partition is always larger.
    return {'all': [ord(c) for c in all_m],
            '>': [ord(c) for c in fwd_m],
            '<': [ord(c) for c in rev_m]}
Beispiel #3
0
def clean_hometown(sheet):
    """Cluster near-identical HOMETOWN spellings and replace each cluster
    with its Levenshtein median, so variants like 'SICHUAN'/'SI CHUAN'
    collapse to one canonical value; writes the result to you_xu.csv.

    Fixes: removed a leftover debug print, replaced copy.deepcopy on a
    plain int list with a direct rebind, and guarded against an IndexError
    when only empty (None) entries remain.
    """
    col = get_col_number(sheet, 'HOMETOWN')
    lst = get_data_list(sheet, col)
    final = [None] * len(lst)
    remain = list(range(len(lst)))
    while remain:
        remain_2 = []
        similar_index = []
        similar = []
        # Seed the cluster with the first unassigned non-empty entry.
        current_index = 0
        while current_index < len(remain) and lst[remain[current_index]] is None:
            current_index += 1
        if current_index == len(remain):
            break  # nothing but empty entries left — avoid IndexError
        current = lst[remain[current_index]]

        similar.append(current)
        similar_index.append(remain[current_index])
        for j in range(current_index + 1, len(remain)):
            if lst[remain[j]] is not None:
                other = lst[remain[j]]
                other_index = remain[j]
                # 0.93 similarity groups near-identical spellings together.
                if Levenshtein.ratio(current, other) > 0.93:
                    similar.append(other)
                    similar_index.append(other_index)
                else:
                    remain_2.append(other_index)
        # Canonical spelling for the whole cluster: greedy median string.
        median = Levenshtein.median(similar)
        for index in similar_index:
            final[index] = median
        remain = remain_2  # ints are immutable; no deepcopy needed

    df = pd.read_csv("you_xu.csv")
    df["HOMETOWN_NEW"] = final
    df.to_csv("you_xu.csv", index=False)
Beispiel #4
0
def multiple_alignment_cost_baseline(sequences, alphabet_size=4):
    """Baseline alignment costs per batch: mean pairwise distance, best
    center string, greedy median and quick median; returns the four
    running averages over the B batches."""
    (B, K, N) = sequences.shape

    avg_distance = AverageMeter()
    min_distance = AverageMeter()
    median_distance = AverageMeter()
    quickmedian_distance = AverageMeter()

    for batch_idx in range(B):
        strings = torch_to_string(sequences[batch_idx], alphabet_size)
        distances = cross_distance_matrix(strings, strings)

        # Mean pairwise edit distance within the batch.
        avg_distance.update(np.mean(distances))

        # Center string: minimal mean distance to all others.
        min_distance.update(np.min(np.mean(distances, axis=1)))

        # Greedy median algorithm (Kruzslicz 1999).
        greedy = Levenshtein.median(strings)
        median_distance.update(np.mean(cross_distance_matrix([greedy], strings)))

        # Greedy median algorithm (Casacuberta & Antonio 1997).
        quick = Levenshtein.quickmedian(strings)
        quickmedian_distance.update(np.mean(cross_distance_matrix([quick], strings)))

    return avg_distance.avg, min_distance.avg, median_distance.avg, quickmedian_distance.avg
Beispiel #5
0
def check_dir_filename_distances(directory, threshold=10):
    '''
	Check a directory to be ingested for wildly divergent filenames. 
	We will currently only want to allow single-level directories of 
	files that represent parts of a whole and thus have fairly 
	similar filenames.

	Returns (outlier_count, outlier_paths) for files whose Levenshtein
	distance from the median path exceeds `threshold` (default 10,
	matching the previously hard-coded limit).
	'''
    # Keep only regular, non-hidden files.
    _list = abspath_list(directory)
    names = []
    for name in _list:
        if os.path.isfile(name):
            if not os.path.basename(name).startswith('.'):
                names.append(name)
    # NOTE(review): median/distances are computed over full absolute paths,
    # not just basenames — confirm that is intended.
    median = Levenshtein.median(names)
    outliers = 0  # counter for files diverging from the median name
    outlierList = []  # and list them
    for name in names:
        distance = Levenshtein.distance(median, name)
        if distance > threshold:
            outliers += 1
            outlierList.append(name)

    return outliers, outlierList
def get_umi_groups(bam_file, edit_distance):
    """Retrieve UMI groups with UMIs within a certain edit distance.

    Based on logic from:
    http://stackoverflow.com/a/35173198/252589
    """
    # Collect the unique UMIs (RX tag) from every read in the BAM file.
    all_umis = set([])
    with pysam.AlignmentFile(bam_file, "rb", check_sq=False) as bam_iter:
        for rec in bam_iter:
            all_umis.add(rec.get_tag("RX"))
    print(len(all_umis))
    grs = []
    for i, cur_umi in enumerate(sorted(all_umis)):
        if i % 1000 == 0:
            print(i, len(grs))
        if edit_distance == 0:
            grs.append([cur_umi])
            continue
        # Join the first existing group that contains a close-enough UMI;
        # otherwise start a new group.
        placed = False
        for g in grs:
            if any(Levenshtein.distance(cur_umi, w) <= edit_distance for w in g):
                g.append(cur_umi)
                placed = True
                break
        if not placed:
            grs.append([cur_umi])
    # Map every UMI to its group's median string.
    out = {}
    for group in grs:
        base = Levenshtein.median(group)
        for umi in group:
            out[umi] = base
    return out
def plot_distances(projects_filename):
    """Plot Levenshtein distances from each HCV genotype's median reference.

    Python 2 code (iteritems/itervalues, 'rU' mode, in-place .sort() on
    dict keys).  Green dots: seeds vs their own genotype's median;
    red dots: seeds vs other genotypes' medians.
    """
    with open(projects_filename, 'rU') as f:
        config = json.load(f)
    populate_key_references(config['regions'])
    # Group HCV seed region names by their seed_group.
    groups = defaultdict(list)
    for name, region in config['regions'].iteritems():
        seed_group = region['seed_group']
        if seed_group and seed_group.startswith('HCV-'):
            groups[seed_group].append(name)
    del groups['HCV-seeds']  # drop the catch-all group
    group_names = groups.keys()
    group_names.sort()
    source_seed_names = []
    all_seeds = {}  # {name: (group_index, reference)}
    median_references = []
    group_labels = []
    for group_index, group_name in enumerate(group_names):
        logger.info('Grouping %s.', group_name)
        seed_names = groups[group_name]
        seed_names.sort()
        source_seed_names.append(seed_names[0])
        references = []
        for seed_name in seed_names:
            reference = ''.join(config['regions'][seed_name]['reference'])
            all_seeds[seed_name] = (group_index, reference)
            references.append(reference)
        # One representative (Levenshtein median) reference per group.
        median_references.append(Levenshtein.median(references))
        group_labels.append(group_name[4:-6])  # trim HCV- and -seeds
    config = None  # release the parsed JSON before the distance pass

    intragroup_source_groups = []
    intragroup_distances = []
    intergroup_source_groups = []
    intergroup_distances = []

    # Distance of every seed to every group median, split by whether the
    # seed belongs to that group.
    for source_index, source_group_name in enumerate(group_names):
        logger.info('Processing %s.', source_group_name)
        source_reference = median_references[source_index]
        for dest_index, dest_reference in all_seeds.itervalues():
            distance = calculate_distance(source_reference, dest_reference)
            if source_index == dest_index:
                intragroup_source_groups.append(source_index)
                intragroup_distances.append(distance)
            else:
                intergroup_source_groups.append(source_index)
                intergroup_distances.append(distance)

    fig = plt.figure()
    ax = fig.add_subplot(
        111,
        title='Distance From Genotype Median Reference in Key Regions',
        xlabel='genotype',
        ylabel='Levenshtein distance',
        xticks=range(len(group_labels)),
        xticklabels=group_labels)
    ax.plot(intragroup_source_groups, intragroup_distances, 'go', alpha=0.4)  # same group
    ax.plot(intergroup_source_groups, intergroup_distances, 'ro', alpha=0.4)  # cross group
    ax.margins(0.1)
    plt.show()
def get_popular_string(all_strings):
    '''Return the most likely OCR output string for an x-ray image.

    Uppercases every candidate reading and takes their Levenshtein
    median as the consensus string.
    '''
    uppercased = [s.upper() for s in all_strings]
    return lv.median(uppercased)
Beispiel #9
0
def median_word(words, word_counts):
    """Greedy median of `words` weighted by `word_counts`, refined with
    median_improve until it reaches a fixed point."""
    current = lev.median(words, word_counts)
    improved = lev.median_improve(current, words, word_counts)
    while improved != current:
        current = improved
        improved = lev.median_improve(current, words, word_counts)
    return improved
Beispiel #10
0
def plot_distances(projects_filename):
    """Scatter-plot each HCV seed's Levenshtein distance from every
    genotype's median reference.

    Python 2 code (iteritems/itervalues, 'rU' mode, .sort() on dict keys).
    Intra-group distances plot green, inter-group distances plot red.
    """
    with open(projects_filename, 'rU') as f:
        config = json.load(f)
    populate_key_references(config['regions'])
    # Bucket HCV seed region names by seed_group.
    groups = defaultdict(list)
    for name, region in config['regions'].iteritems():
        seed_group = region['seed_group']
        if seed_group and seed_group.startswith('HCV-'):
            groups[seed_group].append(name)
    del groups['HCV-seeds']  # remove the umbrella group
    group_names = groups.keys()
    group_names.sort()
    source_seed_names = []
    all_seeds = {}  # {name: (group_index, reference)}
    median_references = []
    group_labels = []
    for group_index, group_name in enumerate(group_names):
        logger.info('Grouping %s.', group_name)
        seed_names = groups[group_name]
        seed_names.sort()
        source_seed_names.append(seed_names[0])
        references = []
        for seed_name in seed_names:
            reference = ''.join(config['regions'][seed_name]['reference'])
            all_seeds[seed_name] = (group_index, reference)
            references.append(reference)
        # Representative reference per genotype: the Levenshtein median.
        median_references.append(Levenshtein.median(references))
        group_labels.append(group_name[4:-6])  # trim HCV- and -seeds
    config = None  # free the parsed JSON; no longer needed

    intragroup_source_groups = []
    intragroup_distances = []
    intergroup_source_groups = []
    intergroup_distances = []

    # Compare every seed reference against every group median.
    for source_index, source_group_name in enumerate(group_names):
        logger.info('Processing %s.', source_group_name)
        source_reference = median_references[source_index]
        for dest_index, dest_reference in all_seeds.itervalues():
            distance = calculate_distance(source_reference, dest_reference)
            if source_index == dest_index:
                intragroup_source_groups.append(source_index)
                intragroup_distances.append(distance)
            else:
                intergroup_source_groups.append(source_index)
                intergroup_distances.append(distance)

    fig = plt.figure()
    ax = fig.add_subplot(111,
                         title='Distance From Genotype Median Reference in Key Regions',
                         xlabel='genotype',
                         ylabel='Levenshtein distance',
                         xticks=range(len(group_labels)),
                         xticklabels=group_labels)
    ax.plot(intragroup_source_groups, intragroup_distances, 'go', alpha=0.4)  # within group
    ax.plot(intergroup_source_groups, intergroup_distances, 'ro', alpha=0.4)  # across groups
    ax.margins(0.1)
    plt.show()
Beispiel #11
0
def get_median(words, counts):
    """Weighted greedy median string, polished by repeated median_improve
    passes until it stops changing."""
    median = lev.median(words, counts)
    while True:
        refined = lev.median_improve(median, words, counts)
        if refined == median:
            return median
        median = refined
Beispiel #12
0
	def find_median(self):
		"""Return the modal member, falling back to the weighted
		Levenshtein median when the two most frequent members are tied.

		`self.members` is a Counter-like mapping of string -> count.
		"""
		most_common = self.members.most_common(2)

		# Tie for the mode: the top two COUNTS are equal.  (The original
		# compared the top count against the top *string* itself, which is
		# never true for string keys, so the median branch was unreachable.)
		if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
			# account for prefix/suffix stuff too
			# list(...) so the C median routine gets real sequences,
			# not Python 3 dict views.
			return Levenshtein.median(list(self.members.keys()),
			                          list(self.members.values()))
		else:
			return most_common[0][0]
Beispiel #13
0
    def find_median(self):
        """Return the modal member; on a tie for the mode, return the
        Levenshtein median of all members weighted by their counts.

        `self.members` is a Counter-like mapping of string -> count.
        """
        most_common = self.members.most_common(2)

        # Tie for the mode means the runner-up COUNT equals the top count.
        # (The original compared the top count to the top *string*, which is
        # always False for string keys, so the median branch never ran.)
        if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
            # account for prefix/suffix stuff too
            # Wrap in list() so Python 3 dict views work with the C routine.
            return Levenshtein.median(list(self.members.keys()),
                                      list(self.members.values()))
        else:
            return most_common[0][0]
Beispiel #14
0
def CalculateMedoid(dico_vjunc, Dicoresult):
    """For every cluster in Dicoresult, collect field 3 of the matching
    dico_vjunc entries and use their Levenshtein median as the centroid.

    Clusters with no matching entries are omitted from the result.
    """
    centroid = {}
    for cluster in Dicoresult.keys():
        junctions = [dico_vjunc[seq.rstrip()][3]
                     for seq in Dicoresult[cluster]
                     if seq.rstrip() in dico_vjunc.keys()]
        if junctions:
            centroid[cluster] = Levenshtein.median(junctions)
    return centroid
 def median_string(self, strings, string_counts):
     """Return a 'median string': the string with minimal summed edit
     distance to `strings`, each weighted by `string_counts`.

     Starts from python-Levenshtein's greedy median and iterates
     median_improve until the candidate stops changing.
     see: https://pypi.python.org/pypi/python-Levenshtein/0.11.2
     """
     candidate = Levenshtein.median(strings, string_counts)
     while True:
         refined = Levenshtein.median_improve(candidate, strings, string_counts)
         if refined == candidate:
             break
         candidate = refined
     return candidate
Beispiel #16
0
def CalculateMedoid(Dicofasta, Dicoresult):
    centroid = {}
    print "Calculating Medoid sequence of each cluster ... \n"
    for key in tqdm.tqdm(Dicoresult.keys()):
        listloc = []
        for seq in Dicoresult[key]:
            if seq.rstrip() in Dicofasta.keys():
                listloc.append(Dicofasta[seq.rstrip()])
            else:
                print "Caution the", seq.rstrip(), "is not in the fasta file"

        centroid[key] = Levenshtein.median(listloc)

    return centroid
Beispiel #17
0
def CalculateMedoid(dico_vjunc, Dicoresult):
    """Medoid per cluster: the Levenshtein median of field 2 of the
    dico_vjunc entries matching each cluster's sequences; warns about
    sequences absent from dico_vjunc."""
    centroid = {}
    for cluster in Dicoresult.keys():
        members = []
        for seq in Dicoresult[cluster]:
            stripped = seq.rstrip()
            if stripped in dico_vjunc.keys():
                members.append(dico_vjunc[stripped][2])
            else:
                print("Caution the", stripped, "is not in the fasta file")
        centroid[cluster] = Levenshtein.median(members)
    return centroid
def choose_normalized(connected_comp, choice='lvst'):
    '''
    Choose one normalized term for each connected component via a
    rule-based filter cascade, then pick by length or Levenshtein median.

    Arguments:
        connected_comp: iterable of components, each a list of name strings.
        choice: 'long' (longest), 'short' (shortest), or 'lvst'
                (Levenshtein median) final selection rule.

    Returns:
        A list with one chosen name per component, or None if `choice`
        is not recognized.
    '''
    def _prefer(names, predicate):
        # Keep only names passing predicate — unless that empties the set,
        # in which case keep the originals (each rule is a soft preference).
        kept = [name for name in names if predicate(name)]
        return kept if kept else names

    res = list()
    for comp in connected_comp:
        # Rule-1: no period at the end
        comp = _prefer(comp, lambda name: name[-1] != '.')
        # Rule-2.1: no hyphen anywhere
        comp = _prefer(comp, lambda name: '-' not in name)
        # Rule-2.2: no '/' anywhere
        comp = _prefer(comp, lambda name: '/' not in name)
        # Rule-3: start with uppercase
        comp = _prefer(comp, lambda name: name[0].isupper())

        # Rule-4: strip non-ASCII characters (e.g. '\u2003') by NFKD
        # normalization, de-duplicating the survivors.
        temp = []
        for name in comp:
            normal = unicodedata.normalize('NFKD',
                                           name).encode('ASCII', 'ignore')
            temp.append(normal.decode("utf-8"))
        comp = list(set(temp)) if temp else comp

        # Rule-5: choose by length or Levenshtein Median
        comp.sort(key=len)
        if choice == 'long':
            res.append(comp[-1])
        elif choice == 'short':
            res.append(comp[0])
        elif choice == 'lvst':
            res.append(lvst.median(comp))
        else:
            print("Not a valid choice method")  # fixed typo: "methond"
            return None

    return res
Beispiel #19
0
def domainMedian(domainObjs, numSamples=200):
    """
    Compute the median Domain object from a list of Domain objects. The median
    is defined as the string that is computed from the per-domain-level
    Levenshtein string medians.

    if <numSamples> is set to a value > 0, this number of samples will be
    picked randomly from <domainObjs>, and the median is then computed from
    this set.

    returns a Domain object
    """
    # Optionally subsample to bound the cost of the median computation.
    if numSamples and len(domainObjs) > numSamples:
        domainObjs = list(random.sample(domainObjs, numSamples))

    weighted = [(d.rSplitView(), 1) for d in domainObjs]
    maxLevels = max(len(view) for view, _ in weighted)
    medianParts = []

    for level in range(maxLevels):
        occurrencesWithWeights = _getLD(weighted, level)
        domainLevels, levelWeights = zip(*occurrencesWithWeights)
        try:
            ldMedian = lev.median(domainLevels, levelWeights)
        except TypeError:
            logging.error('median error: ' + str(domainLevels))
        else:
            # Ignore empty medians; prepend so parts come out in original order.
            if ldMedian:
                medianParts.insert(0, ldMedian)

    # Build the DomainStr directly from the parts so the constructor doesn't
    # re-split them differently and impair alignment for later comparisons.
    return DomainStr(medianParts)
def median(A):
    """Greedy Levenshtein median of integer sequences, computed by
    round-tripping through UTF-16 strings (code points 0..2^16)."""
    encoded = [''.join(unichr(v) for v in row) for row in A]
    return [ord(ch) for ch in lv.median(encoded)]
Beispiel #21
0
    # NOTE(review): this fragment references names defined outside this
    # excerpt (`name_all`, `findings`, `index`, `output_file`) — it appears
    # to be the body of a loop over entries; confirm against the full file.
    # Write all name variants for this entry, tab-separated.
    for name in name_all:
        output_file.write(name + "\t")
    lang_word = []        # native-script name candidates
    lang_roman_word = []  # their lowercased romanizations

    median_word = ""
    median_roman = ""
    if index in findings:

        for pair in findings[index]:
            lang_word.append(pair[0])
            lang_roman_word.append(pair[1].lower())
            try:
                # With more than 3 candidates, take the Levenshtein median of
                # the romanizations and pick the native word at its position;
                # otherwise (or on any error) just join all candidates.
                if len(lang_word) > 3:

                    median_roman = Levenshtein.median(lang_roman_word)
                    pos = lang_roman_word.index(median_roman)
                    median_word = lang_word[pos]
                    print("Got one median")
                else:
                    median_word = ", ".join(lang_word)
                    median_roman = ", ".join(lang_roman_word)
            except Exception as e:
                # Fallback: the median may not be one of the inputs, making
                # .index() raise ValueError — join everything instead.
                median_word = ", ".join(lang_word)
                median_roman = ", ".join(lang_roman_word)

    # One output row per entry: chosen native word and romanization.
    output_file.write(median_word + "\t" + median_roman + "\n")
output_file.close()

## to get the romanized form of all greek names we have
# for strng , greek_forms in zip(names_strong,greek_name_forms):