Example No. 1
 def extract(self, x, y):
     if x is None or y is None:
         return 0
     if self.similarity:
         return 1 - float(hamming_distance(unicode(x), unicode(y))) / max(len(x), len(y))
     else:
         return hamming_distance(unicode(x), unicode(y))
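The unicode() calls mark this snippet as Python 2. A minimal Python 3 sketch of the same normalized similarity, assuming a current jellyfish that accepts str directly:

import jellyfish

def hamming_similarity(x, y):
    # 1.0 for identical strings, approaching 0.0 as more positions differ.
    if x is None or y is None:
        return 0
    return 1 - jellyfish.hamming_distance(x, y) / max(len(x), len(y))

print(hamming_similarity("karolin", "kathrin"))  # 1 - 3/7 ≈ 0.571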
Example No. 2
def build_read_names_given_seq(target,
                               read_names_by_seq_fpath,
                               allowed_read_names_set,
                               is_interesting_seq,
                               max_ham,
                               verbose=True):
    interesting_reads = defaultdict(set)
    for i, line in enumerate(open(read_names_by_seq_fpath)):
        if verbose and i % 10000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

        words = line.strip().split()
        seq = words[0]
        if is_interesting_seq(seq):
            read_names = set(words[1:]) & allowed_read_names_set
            interesting_reads[seq].update(read_names)
            last_start = len(seq) - len(target)
            if last_start < 0:
                continue
            min_ham_idx = min(
                range(0, last_start + 1),
                key=lambda i: hamming_distance(unicode(
                    target), unicode(seq[i:i + len(target)])))
            min_ham = hamming_distance(
                unicode(target),
                unicode(seq[min_ham_idx:min_ham_idx + len(target)]))
            if min_ham <= max_ham:
                min_ham_seq = seq[min_ham_idx:min_ham_idx + len(target)]
                interesting_reads[min_ham_seq].update(read_names)
    return interesting_reads
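The heart of this example is the sliding-window search for the substring closest to target. Here is that core as a self-contained Python 3 sketch (the helper name best_window is mine, not from the source):

from jellyfish import hamming_distance

def best_window(target, seq):
    # Return (start, distance) of the len(target)-wide window of seq that is
    # closest to target in Hamming distance, or None if seq is too short.
    last_start = len(seq) - len(target)
    if last_start < 0:
        return None
    return min(((i, hamming_distance(target, seq[i:i + len(target)]))
                for i in range(last_start + 1)),
               key=lambda pair: pair[1])

print(best_window("ACGT", "TTACGATT"))  # (2, 1): the window "ACGA"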
Example No. 3
def distance_filter(df,
                    c,
                    thresh=3,
                    suffix1='_x',
                    suffix2='_y',
                    col1=None,
                    col2=None,
                    nonull=None):
    if df.shape[0] == 0:
        df = pd.DataFrame()
    else:
        if (col1 is not None) and (col2 is not None):
            c1 = col1 + suffix1
            c2 = col2 + suffix2
        else:
            c1 = c + suffix1
            c2 = c + suffix2

        if nonull is not None:
            df['distance'] = df.apply(
                lambda x: jf.hamming_distance(x[c1], x[c2]), axis=1)
        else:
            df['distance'] = df.apply(
                lambda x: 10
                if (pd.isnull(x[c1])
                    | pd.isnull(x[c2])) else jf.hamming_distance(x[c1], x[c2]),
                axis=1)
        df = df[df.distance <= thresh]

    return df
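A hypothetical invocation, assuming jf is the jellyfish module (as the snippet implies) and pandas is available:

import jellyfish as jf
import pandas as pd

df = pd.DataFrame({"name_x": ["jellyfish", "karolin"],
                   "name_y": ["smellyfish", "kathrin"]})
# Only the karolin/kathrin row survives: its distance is 3, the other row's is 9.
print(distance_filter(df, "name", thresh=3))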
Example No. 4
def output_calculation(id_1, id_2, id_1_column1_name, id_2_column1_name,
                       id_1_column2_name, id_2_column2_name):
    return [
        str(id_1),
        str(id_2),
        str(jellyfish.hamming_distance(
            id_1_column1_name,
            id_2_column1_name)),  ##################Change Algorithm here!!!
        str(jellyfish.hamming_distance(id_1_column2_name, id_2_column2_name))
    ]
Example No. 5
def write_distance2csv(column1_name, column2_name):
    process = 0
    process_sub = 0
    total = 100
    #    distance_info = []
    with open(path_1) as f1, open(column1_name + column2_name +
                                  '_similarity.csv',
                                  'w+',
                                  newline='') as csv_file:
        headers = [
            'id_1', 'id_2', column1_name + '_similarity',
            column2_name + '_similarity'
        ]
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        reader1 = csv.DictReader(f1)
        for id_1 in reader1:
            if process >= 100:
                break


#                logger.info(id_1)
            process += 1
            with open(path_2) as f2:
                reader2 = csv.DictReader(f2)
                process_sub = 0
                for id_2 in reader2:
                    #                    logger.info(id_1)
                    process_sub += 1
                    logger.info('processing: ' + str(process) + '/' +
                                str(total) + '__' + str(process_sub) + '/' +
                                str(total))
                    logger.info(id_1['EnterpriseID'])
                    logger.info(len(id_2['EnterpriseID']))
                    if len(id_2['EnterpriseID']) == 0:
                        break
                    elif id_1['EnterpriseID'] >= id_2['EnterpriseID']:
                        continue
                    else:
                        try:
                            writer.writerow([
                                str(id_1['EnterpriseID']),
                                str(id_2['EnterpriseID']),
                                str(
                                    jellyfish.hamming_distance(
                                        id_1[column1_name],
                                        id_2[column1_name])),
                                str(
                                    jellyfish.hamming_distance(
                                        id_1[column2_name],
                                        id_2[column2_name]))
                            ])
                        except Exception as e:
                            logger.info('%s: %s', type(e).__name__, e)
Example No. 6
def get_closest_hamming(needle, haystack):
    closest = None
    for x in haystack:
        if (closest == None):
            closest = (x, jellyfish.hamming_distance(needle, x))
        else:
            temp = (x, jellyfish.hamming_distance(needle, x))
            if (temp[1] < closest[1]):
                closest = temp
    if (closest == None):
        return None
    return closest[0]
Example No. 7
def get_closest_hamming(needle, haystack):
	closest = None
	for x in haystack:
		if closest is None:
			closest = (x, jellyfish.hamming_distance(needle, x))
		else:
			temp = (x, jellyfish.hamming_distance(needle, x))
			if temp[1] < closest[1]:
				closest = temp
	if closest is None:
		return None
	return closest[0]
Example No. 8
 def compare_two_texts(self, string_a, string_b, normalize_value=True):
     """
     Compare two strings and return the Hamming distance between them;
     with normalize_value=True the result is normalized to the 0-1 range.
     """
     if ((isinstance(string_a, unicode) and isinstance(string_b, unicode)) or
             (isinstance(string_a, str) and isinstance(string_b, str))):
         if normalize_value:
             return self.__normalized_value(jellyfish.hamming_distance(string_a, string_b))
         else:
             return jellyfish.hamming_distance(string_a, string_b)
     else:
         raise TypeError
Example No. 9
def write_distance2csv(column1_name, column2_name):
    process = 0
    process_sub = 0
    total = 100
#    distance_info = []
    with open(path_1) as f1, open(column1_name + column2_name + '_similarity.csv', 'w+', newline='') as csv_file:
        headers = ['id_1', 'id_2', column1_name + '_similarity', column2_name + '_similarity']
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        reader1 = csv.DictReader(f1)
        for id_1 in reader1:
            if process >= 100:
                break
#                logger.info(id_1)
            process += 1
            with open(path_2) as f2:
                reader2 = csv.DictReader(f2)
                process_sub = 0
                for id_2 in reader2:
    #                    logger.info(id_1)
                    process_sub += 1
                    logger.info('processing: ' + str(process) + '/' + str(total) + '__' + str(process_sub) + '/' + str(total))
                    logger.info(id_1['EnterpriseID'])
                    logger.info(len(id_2['EnterpriseID']))
                    if len(id_2['EnterpriseID']) == 0:
                        break
                    elif id_1['EnterpriseID'] >= id_2['EnterpriseID']:
                        continue
                    else:
                        try:
                            writer.writerow([str(id_1['EnterpriseID']), str(id_2['EnterpriseID']),
                            str(jellyfish.hamming_distance(id_1[column1_name], id_2[column1_name])), str(jellyfish.hamming_distance(id_1[column2_name], id_2[column2_name]))])
                        except Exception as e:
                            logger.info('%s: %s', type(e).__name__, e)
Example No. 10
def get_similar_v_genes():
    """Returns a dictionary of V genes 90% similar to a given V gene.

    Parameters
    ----------
    None

    Returns
    -------
    v_to_include : dict
        Dictionary where the keys are V genes and the values are lists of
        V genes at least 90% similar to the key.
    """

    v_ref = make_v_ref_dict()
    v_ref_genes = list(v_ref.keys())
    v_ham_mat = np.zeros(shape=(len(v_ref), len(v_ref)))

    for idx1, v1 in enumerate(v_ref):
        for idx2, v2 in enumerate(v_ref):
            seq1 = v_ref[v1]
            seq2 = v_ref[v2]
            min_len = np.min([len(seq1), len(seq2)])
            #  Go backwards from where the CDR3 begins.
            v_ham_mat[idx1, idx2] = hamming_distance(seq1[-min_len:], seq2[-min_len:]) / min_len

    v_to_include = {}
    for idx1, v1 in enumerate(v_ref_genes):
        v_to_include[v1] = []
        for idx2, v2 in enumerate(v_ref_genes):
            if v_ham_mat[idx1, idx2] <= 0.1:
                v_to_include[v1].append(v2)
    return v_to_include
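The same pairwise normalized-Hamming grouping on a toy reference dict (make_v_ref_dict is not shown in the source, so the sequences below are made up):

from jellyfish import hamming_distance

v_ref = {"V1": "CAGGTGCAGT", "V2": "CAGGTTCAGT", "V3": "GACGTTAAAT"}
v_to_include = {}
for v1, s1 in v_ref.items():
    v_to_include[v1] = []
    for v2, s2 in v_ref.items():
        min_len = min(len(s1), len(s2))
        if hamming_distance(s1[-min_len:], s2[-min_len:]) / min_len <= 0.1:
            v_to_include[v1].append(v2)
print(v_to_include)  # V1 and V2 differ at 1/10 positions, so they group together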
Example No. 11
def simple_example():
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    print("jellyfish.levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.levenshtein_distance(str1, str2)))
    print("jellyfish.damerau_levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.damerau_levenshtein_distance(str1, str2)))
    print("jellyfish.hamming_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.hamming_distance(str1, str2)))
    print("jellyfish.jaro_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_distance(str1, str2)))
    print("jellyfish.jaro_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_similarity(str1, str2)))
    print("jellyfish.jaro_winkler({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler(str1, str2)))
    print("jellyfish.jaro_winkler_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler_similarity(str1, str2)))
    print("jellyfish.match_rating_comparison({}, {}) = {}.".format(
        str1, str2, jellyfish.match_rating_comparison(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    print("jellyfish.metaphone({}) = {}.".format(ss, jellyfish.metaphone(ss)))
    print("jellyfish.soundex({}) = {}.".format(ss, jellyfish.soundex(ss)))
    print("jellyfish.nysiis({}) = {}.".format(ss, jellyfish.nysiis(ss)))
    print("jellyfish.match_rating_codex({}) = {}.".format(
        ss, jellyfish.match_rating_codex(ss)))
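Note that recent jellyfish releases have, as far as I know, dropped the older jaro_distance/jaro_winkler aliases, so on a current install only the *_similarity spellings used above will resolve:

import jellyfish

str1, str2 = u'jellyfish', u'smellyfish'
print(jellyfish.jaro_similarity(str1, str2))
print(jellyfish.jaro_winkler_similarity(str1, str2))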
Example No. 12
def select_fitness(population, target, population_size):
    fitness_size = population_size * 2
    res_list = fitness_size * [None]
    for index, a_str in enumerate(population):
        comp_dist = hamming_distance(a_str, target)
        res_list[index] = (index, a_str, comp_dist)  # index, string, fitness
    return res_list
Example No. 13
def alldist(filex, filey):
    xread = open(filex, 'r').read()
    yread = open(filey, 'r').read()
    lvd = jellyfish.levenshtein_distance(xread,yread)
    dlvd= jellyfish.damerau_levenshtein_distance(xread,yread)
    spsum = spamsum.match(xread,yread)
    spsum = 100 - spsum
    spsum = float(spsum/100.00)
#    print lvd
    res = float( lvd / 100.00 )
    dres= float(dlvd / 100.00 )
#    print res
#    print "Levenshtein Distance=",res
    jaro = jellyfish.jaro_distance(xread,yread)
## Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread,yread)
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
#   print "Jaro Distance = ",jaro
    ham = jellyfish.hamming_distance(xread,yread)
    ham = float ( ham / 100.00)
    print "Hamming Distance = ", ham
#	print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2))
#	print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1))
#    print "Spamsum Match score: ", spsum
    kl = kldiv(tokenize(xread), tokenize(yread))

    return res, dres , jaro, jarowink, ham, kl, spsum
Example No. 14
def correct_barcode_map(config_params, barcodes_in_data, barcode_to_gene):

    barcode_tolerance = int(config_params['barcode_tolerance'])

    orig_barcodes = set(barcode_to_gene.keys())
    full_map = {x:x for x in orig_barcodes}
    correcting_map = {}

    unmatched_barcodes = set(barcodes_in_data).difference(orig_barcodes)
    for orig_barcode in barcode_to_gene.keys():
        for unmatched_barcode in unmatched_barcodes:
            barcode_dist = jf.hamming_distance(orig_barcode, unmatched_barcode)
            if barcode_dist <= barcode_tolerance:
                if get_verbosity(config_params) >= 3:
                    print 'bad : corrected --> {0} : {1}'.format(unmatched_barcode, orig_barcode)
                if correcting_map.has_key(unmatched_barcode):
                    correcting_map[unmatched_barcode].append(orig_barcode)
                else:
                    correcting_map[unmatched_barcode] = [orig_barcode]

    # Now, filter out any unmatched barcodes that map to multiple original barcodes
    for key in correcting_map.keys():
        if len(correcting_map[key]) > 1:
            correcting_map[key].pop()

    # The corrected barcodes are still lists - turn them back into strings!
    corrected_barcodes = correcting_map.keys()
    for barcode in corrected_barcodes:
        correcting_map[barcode] = correcting_map[barcode][0]

    # Update the mapping of original barcodes to themselves with the mapping of
    # unmatched barcodes to original barcodes
    full_map.update(correcting_map)
    
    return full_map
Example No. 15
def ham_dist_vectorform(strings: list) -> np.array:
    """Constructs the Hamming distance vector-form for a VJL grouping used for clustering.

    This function takes in a set of strings, the observed CDR3s in a VJL grouping,
    and computes the upper triangle of the Hamming (normalized by length) square matrix.
    This vector-form is what is used as input for the single-linkage clustering
    algorithm given by scipy.

    Parameters
    ----------
    strings : list of strings

    Returns
    -------
    dists : np.array
        Vector-form of normalized Hamming distances with np.float16 precision.
    """

    normalization = len(strings[0])
    num_seqs = len(strings)
    num_entries = int((num_seqs**2 - num_seqs) / 2)
    dists = np.zeros(num_entries,dtype=np.float16)
    index = 0
    for i,s1 in enumerate(strings):
        for s2 in strings[i + 1:]:
            dists[index] = hamming_distance(s1,s2)
            index += 1
    return dists / normalization
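A hedged sketch of the downstream use the docstring describes: feeding the condensed vector-form to SciPy's single-linkage clustering (hamming_distance is assumed to be imported from jellyfish as in the snippet; the 0.15 threshold is arbitrary):

from scipy.cluster.hierarchy import fcluster, linkage

cdr3s = ["CARDTTVVAW", "CARDTTVVAF", "CARGGGVVAW"]
condensed = ham_dist_vectorform(cdr3s)
labels = fcluster(linkage(condensed, method="single"), t=0.15, criterion="distance")
print(labels)  # the first two CDR3s differ at 1/10 positions and cluster together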
Example No. 16
def inverse_hamming_dist(row):
    return 1.0 / (
        jellyfish.hamming_distance(
        UnicodeDammit(str(row["question1"])).markup.lower(),
        UnicodeDammit(str(row["question2"])).markup.lower())
        or
        1.0
    )
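The trailing `or 1.0` guards the identical-string case (distance 0) against a ZeroDivisionError; a quick check with a stand-in row dict (bs4's UnicodeDammit and jellyfish must be importable, as the snippet assumes):

row = {"question1": "What is AI?", "question2": "What is AI?"}
print(inverse_hamming_dist(row))  # 1.0 rather than a division by zero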
Example No. 17
def get_max_ham_dists(min_len, max_len):
    dists = defaultdict(list)
    for _ in xrange(50000):
        ref_seq = rand_seq(max_len)
        new_seq = rand_seq(max_len)
        for i in range(min_len, max_len+1):
            dists[i].append(hamming_distance(unicode(ref_seq[:i]), unicode(new_seq[:i])))
    max_ham_dists = [min(np.percentile(dists[i], 0.1), int(i/4)) for i in range(min_len, max_len+1)]
    return max_ham_dists
Example No. 18
def barcode_hamming(observed,barcodes):
    """Compute entropy of probabilistic barcode assignment.
    
    observed -- SeqRecord of the barcode
    barcodes -- list of barcode possibilities (python strings)
    """
    obs_seq = observed.seq.tostring()
    distances = [(barcode,hamming_distance(obs_seq,barcode)) for barcode in barcodes]
    closest = min(distances,key=lambda p: p[1])
    return closest  # tuple of (barcode, distance)
Example No. 19
def barcode_hamming(observed,barcodes):
    """Compute entropy of probabilistic barcode assignment.
    
    observed -- SeqRecord of the barcode
    barcodes -- list of barcode possibilities (python strings)
    """
    obs_seq = observed.seq.tostring()
    distances = [(barcode,hamming_distance(obs_seq,barcode)) for barcode in barcodes]
    closest = min(distances,key=lambda p: p[1])
    return closest  # tuple of (barcode, distance)
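observed.seq.tostring() is older Biopython; current releases spell it str(observed.seq). A stand-in run of the same selection logic:

from jellyfish import hamming_distance

obs_seq = "ACGTAC"  # stand-in for str(observed.seq)
barcodes = ["ACGTAA", "TTGTAC", "ACGTAC"]
print(min(((b, hamming_distance(obs_seq, b)) for b in barcodes),
          key=lambda p: p[1]))  # ('ACGTAC', 0)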
Example No. 20
def hamming_pred(string, dictionary):
    from jellyfish import hamming_distance

    distances = []
    for item in dictionary:
        distances.append(hamming_distance(string, item))

    min_distance = min(distances)
    min_index = distances.index(min_distance)
    return dictionary[min_index], min_distance
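Example call; the function returns the dictionary entry nearest in Hamming distance together with that distance (ties go to the earlier entry):

print(hamming_pred("gren", ["green", "grey", "grin"]))  # ('grey', 1)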
Example No. 21
def measure_mrn_similarity(ssn1, ssn2, sign):
    if ssn1 == "" or ssn2 == "" or ssn1 is None or ssn2 is None:
        return 0

    r1 = jellyfish.jaro_winkler(ssn1, ssn2)
    r2 = 1 - jellyfish.hamming_distance(ssn1, ssn2) / len(ssn1)

    if sign == "t":
        print("jw-{} vs hd-{}".format(r1, r2))
    elif sign == "w":
        return max(r1, r2)
Example No. 22
 def hamming(self):
     self.cluster = []
     already = []
     for i in range(0, len(self.group)):
         if self.group[i] in already:
             continue
         for j in range(i+1, len(self.group)):
             if self.radius >= jf.hamming_distance(str(self.group[i]),str(self.group[j])):
                 self.cluster.append([self.group[i],self.group[j]])
                 already = flatten(self.cluster)
     return com_check(self.cluster)  
Example No. 23
    def test_hamming_distance(self):
        cases = [("", "", 0),
                 ("", "abc", 3),
                 ("abc", "abc", 0),
                 ("acc", "abc", 1),
                 ("abcd", "abc", 1),
                 ("abc", "abcd", 1),
                 ("testing", "this is a test", 13),
                 ]

        for (s1, s2, value) in cases:
            self.assertEqual(jellyfish.hamming_distance(s1, s2), value)
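As the unequal-length cases above encode, jellyfish counts every position past the shorter string's end as a mismatch instead of raising an error:

import jellyfish

print(jellyfish.hamming_distance("abc", "abcd"))  # 1: the trailing 'd' counts as a difference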
Example No. 24
def measure_distance(word1, word2, distance_type):
    if distance_type == 'lv':
        distance = Levenshtein.eval(word1, word2)
    if distance_type == 'dlv':
        distance = jellyfish.damerau_levenshtein_distance(word1, word2)
    if distance_type == 'jw':
        # Jaro–Winkler indicates the similiraty, we take the inverse
        distance = -jellyfish.jaro_winkler_similarity(word1, word2)
    if distance_type == 'j':
        distance = -jellyfish.jaro_similarity(word1, word2)
    if distance_type == 'hm':
        distance = jellyfish.hamming_distance(word1, word2)
    return distance
Example No. 25
def getSimilarity(str1, str2):
    distance = {}
    if distance_metric1 == "JaroWinkler":
        distance[distance_metric1] = jellyfish.jaro_winkler(str1, str2)
    if distance_metric2 == "Jaro":
        distance[distance_metric2] = jellyfish.jaro_distance(str1, str2)
    if distance_metric3 == "MatchRating":
        distance[distance_metric3] = jellyfish.match_rating_comparison(
            str1, str2)
    if distance_metric4 == "Levenshtein":
        distance[distance_metric4] = jellyfish.levenshtein_distance(str1, str2)
    if distance_metric5 == "Hamming":
        distance[distance_metric5] = jellyfish.hamming_distance(str1, str2)
    return distance
Example No. 26
    def remove_duplications(self):
        print("\nRemoving duplications...")
        duplications = []

        for ix in range(len(self.data)):
            for yx in range(ix, len(self.data)):
                if ix != yx:
                    if (jf.hamming_distance(
                            str(self.data.summary[ix])[0:200],
                            str(self.data.summary[yx])[0:200]) <= 20):
                        duplications.append(yx)

        duplications = list(set(duplications))
        self.data = self.data[~self.data.index.isin(duplications)].reset_index(
            drop=True)
Example No. 27
def read_fastq(config_params, species_config_params, folder, out_path, lane_id):

    common_primer_start = int(species_config_params['common_primer_start'])
    common_primer_end = common_primer_start + int(species_config_params['common_primer_length'])
    common_primer_seq = species_config_params['common_primer_sequence']
    common_primer_tolerance = int(config_params['common_primer_tolerance'])
    index_tag_start = int(species_config_params['index_tag_start'])
    index_tag_end = index_tag_start + int(species_config_params['index_tag_length'])
    barcode_start = int(species_config_params['genetic_barcode_start'])
    barcode_end = barcode_start + int(species_config_params['genetic_barcode_length'])

    out_filename = get_barseq_filename(config_params, lane_id)
    # print out_filename
    # print common_primer_tolerance

    of = open(out_filename, 'wt')

    fastq_filenames = [os.path.join(folder, x) for x in os.listdir(folder) if is_fastq_filename(x)]

    common_primer_count = 0
    barcodes = set()
    index_tags = set()
    for filename in fastq_filenames:
        f = cfo.get_compressed_file_handle(filename)
        for line_count, line in enumerate(f):
            if get_verbosity(config_params) >= 3:
                print line_count, line
            if line_count % 4 == 1:
                string = line.strip()
                common_primer = string[common_primer_start:common_primer_end]
                common_primer_dist = jf.hamming_distance(common_primer, common_primer_seq)
                if get_verbosity(config_params) >= 3:
                    print common_primer_dist
                if common_primer_dist <= common_primer_tolerance:
                    common_primer_count += 1
                    index_tag = string[index_tag_start:index_tag_end]
                    barcode = string[barcode_start:barcode_end]
                    index_tags.update(set([index_tag]))
                    barcodes.update(set([barcode]))
                    # print "index_tag, barcode : {}, {}".format(index_tag, barcode)
                    of.write('{0}\t{1}\n'.format(index_tag, barcode))
        f.close()

    total_counts = (line_count + 1)/ 4

    of.close()

    return total_counts, common_primer_count, barcodes, index_tags
Example No. 28
    def comparacion_pares(self, texto1, texto2, tipo="levenshtein", norm=None):
        """
        Permite hacer comparaciones entre dos textos de entrada, de acuerdo a \
        un tipo de distancia o similitud determinado.

        :param texto1: Primer texto de interés a comparar.
        :type texto1: str
        :param texto2: Segundo texto de interés a comparar.
        :type texto2: str
        :param tipo: Criterio de comparación a utilizar entre los textos. \
            Valor por defecto `'levenshtein'`.
        :type tipo: {'damerau_levenshtein', 'levenshtein', 'hamming', \
            'jaro_winkler', 'jaro'}, opcional
        :param norm: Permite normalizar los resultados en función de la \
            longitud de los textos. Si `norm = 1` se normaliza en función al \
            texto más corto, si `norm = 2` se normaliza en función al texto \
            de mayor extensión.
        :type norm: {1,2}, opcional
        :return: (float) Valor resultado de la comparación entre `texto1` y \
            `texto2`.
        """
        tipo = tipo.lower()
        if "damerau" in tipo:
            salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
        elif "levenshtein" in tipo:
            salida = jellyfish.levenshtein_distance(texto1, texto2)
        elif "hamming" in tipo:
            salida = jellyfish.hamming_distance(texto1, texto2)
        elif "winkler" in tipo:
            salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
        elif "jaro" in tipo:
            salida = jellyfish.jaro_similarity(texto1, texto2)
        else:
            print(
                "Please select a valid criterion "
                "for comparing the strings."
            )
            return None
        if norm in [1, 2] and "jaro" not in tipo:
            if norm == 1:
                salida /= min(len(texto1), len(texto2))
            else:
                salida /= max(len(texto1), len(texto2))
        return salida
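The norm options reduce to plain arithmetic on the jellyfish result; for instance, tipo='hamming' with norm=2 is equivalent to:

import jellyfish

texto1, texto2 = "cat", "hat"
salida = jellyfish.hamming_distance(texto1, texto2) / max(len(texto1), len(texto2))
print(salida)  # 1/3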
Example No. 29
def rmBadSeqs(fname, canonIndex):
	# Remove the sequences that have only one coding 
	# posibility and are more than edit distance 1 
	# away from all canonical sequences for this target 
	# that meet the entropy cutoff.
	
	# Get all sequences meeting the entropy threshold
	fin = open(fname, 'r')
	goodSeqs = [i.strip().split('\t')[0] for i in fin.readlines()\
	            if eval(i.strip().split('\t')[3]) != 1]
	fin.close()
	
	# Convert to canonical set of sequences
	canonSeqs = []
	for seq in goodSeqs:
		canonSeq = ''
		for i in canonIndex:
			canonSeq += seq[i]
		canonSeqs.append(canonSeq)
	canonSeqs = set(canonSeqs)
	#print canonSeqs

	# Write all the good sequences from fname into a temp 
	# file and then replace fname with the tmp file
	fin = open(fname, 'r')
	fout = open('tmp.txt', 'w')
	for line in fin:
		sp_line = line.strip().split('\t')
		numPoss = eval(sp_line[3])
		if numPoss > 1:
			fout.write(line)
		else:
			seq = sp_line[0]
			seq2 = ''
			for i in canonIndex:
				seq2 += seq[i]
			for canonSeq in canonSeqs:
				if jellyfish.hamming_distance(seq2, canonSeq) <= 1:
					#print "Here"
					fout.write(line)
					break
	fout.close()
	normalizeFreq('tmp.txt', 1)
	os.system('mv tmp.txt ' + fname)
Example No. 30
def calc_distance(string1, string2, method):
    if method == "levenshtein":
        distance = jellyfish.levenshtein_distance(string1, string2)
    elif method == "damerau_levenshtein":
        distance = damerau_levenshtein_distance(string1, string2)
    elif method == "hamming_distance":
        distance = jellyfish.hamming_distance(string1, string2)
    elif method == "jaro_winkler":
        distance = jellyfish.jaro_winkler(string1, string2)
    elif method == "cosine":
        vector1 = text_to_vector(string1)
        vector2 = text_to_vector(string2)
        distance = get_cosine(vector1, vector2)
    elif method == "jaccard":
        x_set = ngrams(string1, 1)
        y_set = ngrams(string2, 1)
        distance = jaccard_similarity(x_set, y_set)

    return distance
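A sample dispatch (the 'hamming_distance' branch needs only jellyfish; the cosine and jaccard branches depend on helpers not shown here):

print(calc_distance("karolin", "kathrin", "hamming_distance"))  # 3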
Example No. 31
def cal_str_similarity(str_1, str_2, option):
    multiset_1 = str_1.split()
    multiset_2 = str_2.split()
    # Jaccard similarity
    if option == 'JACC':
        return 1.0 - dist.jaccard(multiset_1, multiset_2)
    # Cosine similarity
    if option == 'COS':
        comm_len = len([word for word in multiset_1 if word in multiset_2])
        return comm_len * 1.0 / math.sqrt(len(multiset_1) * len(multiset_2))
    # Dice similarity
    elif option == 'DICE':
        comm_len = len([word for word in multiset_1 if word in multiset_2])
        return comm_len * 2.0 / (len(multiset_1) + len(multiset_2))
    # Edit similarity
    elif option == 'ES':
        return 1.0 - jf.levenshtein_distance(str_1, str_2) * 1.0 / max(
            len(str_1), len(str_2))
    # Hamming similarity
    elif option == 'HAMMING':
        return 1.0 - jf.hamming_distance(str_1, str_2) * 1.0 / max(
            len(str_1), len(str_2))
    # Jaro distance
    elif option == 'JARO':
        return jf.jaro_distance(str_1, str_2)
    # Jaro-Winkler distance
    elif option == 'JARO-WINKLER':
        return jf.jaro_winkler(str_1, str_2)
    # Overlap similarity
    elif option == 'OVERLAP':
        comm_len = len([word for word in multiset_1 if word in multiset_2])
        return comm_len * 1.0 / max(len(multiset_1), len(multiset_2))
    # entity string start with mention string
    elif option == 'START-WITH':
        return str_2.startswith(str_1)

    elif option == 'END-WITH':
        return str_2.endswith(str_1)

    elif option == 'SAME':
        return str_1 == str_2
Example No. 32
 def string_comparison(self, text1, text2, choice='levenshtein_distance'):
     '''
     text1: String Input 1
     text2: String Input 2
     choice: 'levenshtein_distance' or 'damerau_levenshtein_distance' or 'hamming_distance' or 'jaro_distance' or 'jaro_winkler' or 'match_rating_comparison'
     '''
     # https://jellyfish.readthedocs.io/en/latest/comparison.html
     if choice == 'levenshtein_distance':
         return jellyfish.levenshtein_distance(text1, text2)
     elif choice == 'damerau_levenshtein_distance':
         return jellyfish.damerau_levenshtein_distance(text1, text2)
     elif choice == 'hamming_distance':
         return jellyfish.hamming_distance(text1, text2)
     elif choice == 'jaro_distance':
         return jellyfish.jaro_distance(text1, text2)
     elif choice == 'jaro_winkler':
         return jellyfish.jaro_winkler(text1, text2)
     elif choice == 'match_rating_comparison':
         return jellyfish.match_rating_comparison(text1, text2)
     else:
         print("Wrong Choice")
Example No. 33
def classify_seq(rec1, rec2, min_len, max_len, max_ham_dists, log_p_struct):
    bases = set('ACGT')
    # Store as strings
    seq1 = str(rec1.seq)
    seq2_rc = str(rec2.seq.reverse_complement())
    loc_max_len = min(max_len, len(seq1), len(seq2_rc))

    # Find aligning sequence, indels are not allowed, starts of reads included
    sig_lens = [i for i, max_ham in zip(range(min_len, loc_max_len + 1), max_ham_dists)
                if hamming_distance(unicode(seq1[:i]), unicode(seq2_rc[-i:])) < max_ham]
    if len(sig_lens) != 1:
        return None

    seq2_len = sig_lens[0]
    seq2_match = seq2_rc[-seq2_len:]
    seq1_match = seq1[:seq2_len]

    # Get corresponding quality scores
    quals1 = rec1.letter_annotations['phred_quality'][:seq2_len]
    quals2 = rec2.letter_annotations['phred_quality'][::-1][-seq2_len:]

    # Build consensus sequence
    ML_bases = []
    for r1, q1, r2, q2 in zip(seq1_match, quals1, seq2_match, quals2):
        if r1 in bases and r1 == r2:
            ML_bases.append(r1)
        elif set([r1, r2]) <= bases and q1 > 2 and q2 > 2:
            r1_score = log_p_struct[r1][r1][q1] + log_p_struct[r1][r2][q2]
            r2_score = log_p_struct[r2][r1][q1] + log_p_struct[r2][r2][q2]
            if r1_score > r2_score:
                ML_bases.append(r1)
            else:
                ML_bases.append(r2)
        elif r1 in bases and q1 > 2:
            ML_bases.append(r1)
        elif r2 in bases and q2 > 2:
            ML_bases.append(r2)
        else:
            return None
    return ''.join(ML_bases)
Example No. 34
def processMatchesHammingDistance():
    inputQueue = deque(inputList)

    row = inputQueue.pop()
    while row is not None:
        bestMatchScore = -1
        bestMatchRow = ''
        for rowToCompare in inputQueue:
            score = jellyfish.hamming_distance(row, rowToCompare)
            if bestMatchScore == -1 or score < bestMatchScore:
                bestMatchScore = score
                bestMatchRow = rowToCompare

        bestMatchPerColumn[row] = {
            'match': bestMatchRow,
            'score': bestMatchScore
        }

        if len(inputQueue) > 1:
            row = inputQueue.pop()
        else:
            return
Example No. 35
def orderByRel(jobs, kw, algo):
    """Order based on algo type"""
    for i in range(len(jobs)):
        if algo == 1:
            jobs[i][5] = levenshtein_distance(jobs[i][0].strip(), kw)

        elif algo == 2:
            jobs[i][5] = damerau_levenshtein_distance(jobs[i][0].strip(), kw)

        elif algo == 3:
            jobs[i][5] = hamming_distance(jobs[i][0].strip(), kw)

        elif algo == 4:
            jobs[i][5] = 1 - jaro_distance(jobs[i][0].strip(), kw)

        elif algo == 5:
            jobs[i][5] = 1 - jaro_winkler(jobs[i][0].strip(), kw)

    # jobs.sort(jobs, key=lambda job: job[5])
    jobs_sorted = sorted(jobs, key=lambda dist: dist[5])

    return jobs_sorted
Example No. 36
def cluster_singlecell_aa(singlecell_annotation, lineages, thresh):
    """Performs single-linkage clustering (using aa CDR3) on a single cell given 90% V gene similarity.

    Parameters
    ----------
    singlecell_annotation : dict
        Dictionary containing information about the annotated single cell sequence.
    lineages : dict
        Unnested dictionary of clustered annotations [(V, J, L, cluster_id)].
    thresh : float
        Distance threshold for the single-linkage clustering algorithm.

    Returns
    -------
    to_insert : np.array
        np.array of keys of lineages into which single cells clustered successfully.
    """

    v_gene = singlecell_annotation['v_gene']['gene']
    cdr3 = singlecell_annotation['junc_nt']
    cdr3_aa = translate(cdr3)
    len_cdr3 = len(cdr3)
    len_cdr3_aa = len(cdr3_aa)

    subset = [key for key in lineages
              if key[2] == str(len_cdr3)
              and key[0] in v_to_include[v_gene]]

    min_distances = np.ones(len(subset))
    for idx, vjlc in enumerate(subset):
        distances = np.zeros(len(lineages[vjlc]), dtype=np.float16)
        for idxa, annotation in enumerate(lineages[vjlc]):
            distances[idxa] = hamming_distance(cdr3_aa, translate(annotation['junc_nt']))
        distances /= len_cdr3_aa
        min_distances[idx] = (np.min(distances))

    to_insert = np.array(subset)[min_distances <= thresh]
    return to_insert
Example No. 37
    def comparacion_pares(self, texto1, texto2, tipo='levenshtein', norm=None):
        """ Permite hacer comparaciones entre dos textos de entrada, de acuerdo a un tipo de \
            distancia o similitud determinado.

        :param texto1: (str) Primer texto de interés a comparar.
        :param texto2: (str) Segundo texto de interés a comparar.
        :param tipo: (str) {'damerau_levenshtein', 'levenshtein', 'hamming', 'jaro_winkler', \
            'jaro'} Valor por defecto: 'levenshtein'. Criterio de comparación a utilizar entre los textos.
        :param norm: (int) {1, 2} Valor por defecto: None. Permite normalizar \ 
            los resultados en función de la longitud de los textos. \ 
            Si norm=1 se normaliza en función al texto más corto, \ 
            si norm=2 se normaliza en función al texto de mayor extensión.
        :return: (float o int) Valor resultado de la comparación.
        """
        tipo = tipo.lower()
        if 'damerau' in tipo:
            salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
        elif 'levenshtein' in tipo:
            salida = jellyfish.levenshtein_distance(texto1, texto2)
        elif 'hamming' in tipo:
            salida = jellyfish.hamming_distance(texto1, texto2)
        elif 'winkler' in tipo:
            salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
        elif 'jaro' in tipo:
            salida = jellyfish.jaro_similarity(texto1, texto2)
        else:
            print(
                'Please select a valid criterion for comparing the strings.'
            )
            return None
        if norm in [1, 2] and 'jaro' not in tipo:
            if norm == 1:
                salida /= min(len(texto1), len(texto2))
            else:
                salida /= max(len(texto1), len(texto2))
        return salida
Example No. 38
def process_dewlap_dorsal_ventral():

    args = get_args()
    filenames = getFilenames(args.input_dir)
    base_dir_name = os.path.split(args.input_dir)[-1]
    
    # SETUP DATASET
    col_headers = []
    data_set = []
    header_list = []
    
    # Sort files by tissue allowing for missspellings
    organized_by_tissue = {'dewlap':[], 'dorsal':[], 'ventral':[]}
    for count, filename in enumerate(filenames):

        fname = os.path.split(filename)[-1]
        
        for tissue in organized_by_tissue.keys():
            distances = sorted([jellyfish.hamming_distance(tissue, item) for item in fname.split("_")])
            if distances[0] <= 2:
                organized_by_tissue[tissue].append(filename)

    
    print organized_by_tissue
Example No. 39
def comparison(filename_txt, json_data, debug):
    #-----txt---------------------------------------
    with open(filename_txt + '.txt', 'r') as myfile:
        data_txt = myfile.read().upper().replace("\n", " ")

    if (debug):
        print(data_txt)
    data_txt = ''.join(re.findall("[A-Z0-9]", data_txt))
    if (debug):
        print(data_txt)
    date_txt = find_date(filename_txt + '.txt')
    if debug:
        print('text date', date_txt)
    allout = []
    if debug:
        print('json_data', json_data)

    #-----json--------------------------------------
    for text in json_data:

        #state # can create a dictionary: state - > state code
        date_json = json_data[3]  #'DateOfRegistry'
        if text == "USA":
            text = "UNITEDSTATESOFAMERICA"

        #makemodel
        if text == json_data[5]:
            list_model = []  # list of words descriping the model
            # temporal str
            textmodel = ""
            for place in range(len(text)):
                if text[place] != " ":
                    textmodel += text[place]
                else:
                    list_model.append(textmodel)
                    textmodel = ""
            list_model.append(textmodel)
            if debug:
                print('list_model1', list_model)
            for word in list_model:
                if word[0] == "(" and word[-1] == ")":
                    list_model.remove(word)
                    temp_word = word[1:-1]
                    list_model.append(temp_word)
            if debug:
                print('list_model2', list_model)

        #Remove all spaces, commas and other non numerical or ascii characters from the data and the search word
        newtext = ''.join(re.findall("[A-Z0-9]", text.upper()))

        if (debug):
            print('newtext(upper,npspace)', newtext)

        arr = []

        #Create a window of the size of the seach word and slide it over the text file
        #Calculate the levenshtein distance between the window and the search text

        for i in range(0, len(data_txt) - len(newtext)):
            window = data_txt[i:i + len(newtext)]
            d = jellyfish.levenshtein_distance(newtext, window)
            d2 = jellyfish.hamming_distance(newtext, window)
            d3 = jellyfish.jaro_winkler(newtext, window)
            arr.append([window, d, d2, d3])

        #Search which window has the highest Jaro-Winkler similarity
        p = pd.DataFrame(arr,
                         columns=["word", "lev", "hamming", "jarowinkler"])
        m = p["jarowinkler"].idxmax()
        #Output that window and the match is the proportion of the matched window
        out = [p.iloc[m, 0], 100 * p.iloc[m, 3]]

        if (debug):
            print(p)
            print(p.iloc[m])

        #date of registry
        if text == json_data[3]:
            if date_json in date_txt:
                out = [date_json, 100]

        #yearofbuilt
        if text == json_data[-1]:
            if out[1] < 100:
                out[1] = 0.0
        out.insert(0, newtext)
        allout.append(out)

    #Calculate an overall score with the average of the provided text
    out = [allout, pd.DataFrame(allout).loc[:, 2].mean()]
    return out
Example No. 40
def word_similarity(
        word_to_compare='Vignir',
        list_of_words=["Heigigr", "Beðurni"],
        return_top_n=20,
        use_cut_off=False,
        cut_off=0.5,
        sim_measure='Levenshtein',  #SequenceMatcher #Jaro-Winkler #Hamming,
        min_characters=2,  #Null for no restriction,
        filter_non_capital_letters=True):
    """Compare similarity between a word and a list of words

    Returns list of similar words/names based on a similarity measure
    
    Args:
        word_to_compare (str) -word to compare with each value in list
        list_of_words (lst) - list of strings to compare against
        return_top_n (int) - return only top n 10 results based on similarity measure
        use_cut_off (bool) - whether to use a cut off value based on similarity
        cut_off (int) - cut off value
        
    Returns:
         Returns two ints; average epoc_loss and epoch_accuracy
         
    """
    word_similarity_list = []
    for word in list_of_words:
        dict_Words = {}
        dict_Words['word_to_compare'] = word_to_compare
        dict_Words['word_to_compare_against'] = word
        if sim_measure == 'Levenshtein':
            ##dict_Words['similarity']=Levenshtein.ratio(word_to_compare, word)
            dict_Words['similarity'] = jellyfish.levenshtein_distance(
                word_to_compare, word) * -1
            dict_Words['similarity_measure'] = 'Levenshtein'
        elif sim_measure == 'SequenceMatcher':
            dict_Words['similarity'] = SequenceMatcher(None, word_to_compare,
                                                       word).ratio()
            dict_Words['similarity_measure'] = 'SequenceMatcher'
            #https://docs.python.org/2.4/lib/sequencematcher-examples.html
        elif sim_measure == 'Jaro-Winkler':
            dict_Words['similarity'] = jellyfish.jaro_winkler(
                word_to_compare, word)
            dict_Words['similarity_measure'] = 'Jaro-Winkler'
        elif sim_measure == 'Hamming':
            dict_Words['similarity'] = jellyfish.hamming_distance(
                word_to_compare, word) * -1
            dict_Words['similarity_measure'] = 'Hamming'
        word_similarity_list.append(dict_Words)

    #Convert to frame
    df_word_similarity = pd.DataFrame(word_similarity_list)

    #Sort
    df_word_similarity = df_word_similarity.sort_values(by='similarity',
                                                        ascending=False)

    #Return top results
    if return_top_n > 0:
        if len(df_word_similarity) > return_top_n:
            df_word_similarity = df_word_similarity[0:return_top_n]
    else:
        return df_word_similarity[0:0]

    #Whether to use cutoff
    if use_cut_off:
        df_word_similarity = df_word_similarity[
            df_word_similarity.similarity > cut_off]

    #Filter min characters
    if min_characters > 0:
        df_word_similarity = df_word_similarity[
            df_word_similarity.word_to_compare_against.str.len() >
            min_characters]

    #Filter out words that does not start with a large character
    if filter_non_capital_letters:
        df_word_similarity = df_word_similarity[
            df_word_similarity.word_to_compare_against.str.istitle()]

    return df_word_similarity
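A hypothetical run comparing one name against a short candidate list with the Hamming measure (scores are negated distances, so higher means more similar):

print(word_similarity("Vignir",
                      ["Vignis", "Vignir", "Bjarni"],
                      sim_measure="Hamming"))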
Example No. 41
def output_calculation(id_1, id_2, id_1_column1_name, id_2_column1_name, id_1_column2_name, id_2_column2_name):
    return [str(id_1), str(id_2),
            str(jellyfish.hamming_distance(id_1_column1_name, id_2_column1_name)),    ##################Change Algorithm here!!!
            str(jellyfish.hamming_distance(id_1_column2_name, id_2_column2_name))]
Example No. 42
def percent_id(seq1,seq2):
    alignment = global_align(seq1,seq2)
    return (1. - hamming_distance(alignment[0],alignment[1]) / float(len(alignment[0]))) * 100.
Example No. 43
def dice(a, b):
	ng=ngram.NGram(pad_len=1,N=2)
	a = set(ng.ngrams(ng.pad(a)))
	b = set(ng.ngrams(ng.pad(b)))
	overlap = len(a & b)
	return overlap * 2.0/(len(a) + len(b))

def jaccard(a, b):
	ng=ngram.NGram(pad_len=1,N=2)
	a = set(ng.ngrams(ng.pad(a)))
	b = set(ng.ngrams(ng.pad(b)))
	return len(a & b) * 1.0 / len(a | b)


similarity = {
	"jaro_winckler": lambda a,b: jellyfish.jaro_winkler(a, b),
	"hamming_distance": lambda a,b:  1.0 - float(jellyfish.hamming_distance(a, b)) / max(len(a), len(b)),
	"damreau_levenshtein":lambda a,b:  1.0 - float(jellyfish.damerau_levenshtein_distance(a, b)) / max(len(a), len(b)),
	"dice":dice,
	"jaccard":jaccard
}[args.similarity]

mean = {
	"arithmetic_mean":arithmeticMean,
	"arithemtic_weighted_mean":arithmeticWeightedMean,
	"geometric_mean":geometricMean,
	"geometric_weighted_mean": geometricWeightedMean
}[args.mean]

def open_tsv(filename):
	return csv.reader(open(filename, "r"), delimiter='\t')
Example No. 44
		match_tl = match_tl.search(word)
		if len(match_tl) != 0:
			m5 = m5 + match_tl[0][1]
		m1_tmp = 0
		m2_tmp = 0
		m3_tmp = 0
		word = unicode(word, 'utf8')
		for txt in product_description:
			txt = unicode(txt, 'utf8')
			a = jf.levenshtein_distance(word, txt)
			if a >= m1_tmp:
				m1_tmp = a
			a = jf.damerau_levenshtein_distance(word, txt)
			if a >= m2_tmp:
				m2_tmp = a
			a = jf.hamming_distance(word, txt)
			if a >= m3_tmp:
				m3_tmp = a
		m1 = m1 + m1_tmp
		m2 = m2 + m2_tmp
		m3 = m3 + m3_tmp
		m6_tmp = 0
		m7_tmp = 0
		m8_tmp = 0
		#word = word.decode('utf-8')
		for txt in product_title:
			txt = safe_unicode(txt)
			a = jf.levenshtein_distance(word, txt)
			if a >= m6_tmp:
				m6_tmp = a
			a = jf.damerau_levenshtein_distance(word, txt)