def Match(self, text):
    # tokenize and normalize the input text
    textArr = tokenize.wordpunct_tokenize(text.lower().strip())
    hits = 0
    results = []
    # stop tlength short of the end so the word window stays in range
    for ti in xrange(0, len(textArr) - self.tlength):
        for termT in self.toMatch:
            # edit distance between the current token and the term's first word
            dist1 = editdist.distance(textArr[ti], termT[hits])
            if dist1 <= self.thresh:
                if len(termT) <= 1:
                    print "got hit with %s" % termT
                    results.append(termT[hits])
                else:
                    # close hit -- check whether the second word matches as well
                    dist2 = editdist.distance(textArr[ti + 1], termT[hits + 1])
                    print "distance between %s and %s is %s" % (textArr[ti + 1], termT[hits + 1], dist2)
                    # WARNING: this only works for 2-grams, where tlength is the n-gram size
                    if dist2 <= self.thresh:
                        results.append("%s %s" % (termT[hits], termT[hits + 1]))
    return results
Example No. 2
def distance(a, b):
    """find best edit distance between two strings of potentially uneven length.
    """
    la, lb = len(a), len(b)
    assert isinstance(a, basestring) and isinstance(b, basestring)
    if la < lb:
        return distance(b, a)
    if la == lb:
        return ed.distance(a, b)
    else:
        dists = []
        for i in xrange(0, la-lb+1):
            dists.append(ed.distance(a[i:i+lb], b))
        return min(dists)
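A quick usage sketch of the windowed helper above (illustrative calls only; `ed` is assumed to be the py-editdist module, e.g. `import editdist as ed`):

import editdist as ed

print distance("sitting kitten", "kitten")   # 0 -- the trailing "kitten" window matches exactly
print distance("kitten", "sitting kitten")   # 0 -- shorter/longer arguments are normalized first
print distance("kitten", "sitten")           # 1 -- equal lengths fall through to plain Levenshtein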
Example No. 3
    def filter_words(self, words):
        num = Settings.get('str_extra')
        what = Settings.get('str_what')
        if what == 'r': # random
            pass
        else:
            control = self.get_list()
            if not control:
                return
            if what == 'e': # encompassing
                stream = [(sum([x.count(c) for c in control]), x) for x in words]
                #print "str:", list(stream)[0:10]
                preres = list(itertools.islice(filter(lambda x: x[0] > 0, stream), 4*num))
                #print "pre:", preres
                preres.sort(key=lambda x: x[0], reverse=True)
                words = [x[1] for x in preres]
            else: # similar
                words = filter(lambda x:
                               0 < min(
                                   editdist.distance(
                                       x.encode('latin1', 'replace'),
                                       y.encode('latin1', 'replace'))
                                   / float(max(len(y), len(x)))
                                   for y in control) < .26, words)

        if Settings.get('str_clear') == 'r': # replace = clear
            GtkUtil.textbuf_clear(self.buf())

        self.add_list(itertools.islice(words, num))
Example No. 4
    def filterWords(self, words):
        n = Settings.get('str_extra')
        w = Settings.get('str_what')
        if w == 'r': # random
            pass
        else:
            control = self.getList()
            if len(control) == 0:
                return
            if w == 'e': # encompassing
                stream = map(lambda x: (sum([x.count(c) for c in control]), x), words)
                print "str:", list(stream)[0:10]
                preres = list(islice(ifilter(lambda x: x[0] > 0, stream), 4*n))
                print "pre:", preres
                preres.sort(key=lambda x: x[0], reverse=True)
                words = map(lambda x: x[1], preres)
            else: # similar
                words = ifilter(lambda x:
                    0 < min([
                            editdist.distance(x.encode('latin1', 'replace'),
                                              y.encode('latin1', 'replace'))
                            / float(max(len(y), len(x)))
                                for y in control]) < .26, words)

        if Settings.get('str_clear') == 'r': # replace = clear
            self.clear()

        self.addList(islice(words, n))
Example No. 5
    def matchtitle(self, gtitle, mtitle):

        short_key = mtitle.lower()
        key_title = gtitle.lower()

        exactmatched = False

        if short_key == key_title:
            exactmatched = True
        if not exactmatched:  # no exact match; fall back to computing the Levenshtein distance
            x = ''
            y = ''
            for ch in short_key:
                if ch <= chr(127):
                    x = x + ch
            for ch in key_title:
                if ch <= chr(127):
                    y = y + ch
            short_key = x
            key_title = y
            ed = editdist.distance(short_key, key_title)
            print ed
            if ed < 15:  # tunable threshold
                exactmatched = True

        return exactmatched
Example No. 6
def is_similar(query, targets, n):
    """Tests target set of sequences to the query.

    Args:
        query (str): query sequence
        targets (set): unique sequences
        n (int): allowable mismatches when comparing a query to a given sequence of the targets

    Returns:
        bool

    >>> import editdist
    >>> s = "ACTGA"
    >>> ts_1 = {"ACTGG"}
    >>> ts_2 = {"ACTCC", "ACTGG"}
    >>> ts_3 = {"ACTCC", "ACTTT"}
    >>> n = 1
    >>> is_similar(s, ts_1, n)
    True
    >>> is_similar(s, ts_2, n)
    True
    >>> is_similar(s, ts_3, n)
    False
    """
    if targets:
        for target in targets:
            if editdist.distance(target, query) <= n:
                return True
    return False
Example No. 7
def get_from_ref_dict(L, v_in):

    assert False  # note: this assert unconditionally disables the function below
    
    best_edit_dist = 1000000000000
    best_name = None
    
    for v_key, tv in L.iteritems():
        if v_key == v_in:
            return tv

        d = editdist.distance(v_in, v_key)

        min_l = min(len(v_in), len(v_key))
        
        if v_in[:min_l] == v_key[:min_l]:
            d = 0

        if d < best_edit_dist:
            best_edit_dist = d
            best_name = tv

        if d == 0:
            break

    print "%s >>>>>> %s" % (v_in, best_name)
    
    return best_name
Example No. 8
 def f(rec):
     ''' Filter function for cutsite'''
     
     cutsite = rec.seq[6: 6 + cutsite_length].tostring()
     cutsite_dist = ed.distance(target_cutsite, cutsite)
     
     return cutsite_dist <= mindist
Example No. 9
def seq_matcher(name1, name2):
    name1 = unicode(
        unicodedata.normalize('NFKD', name1).encode('ascii', 'ignore'),
        'utf-8')
    name2 = unicode(name2, 'utf-8')
    name2 = unicode(
        unicodedata.normalize('NFKD', name2).encode('ascii', 'ignore'),
        'utf-8')

    soundex = fuzzy.Soundex(4)
    name1 = soundex(name1)
    name2 = soundex(name2)

    # dmeta = fuzzy.DMetaphone()
    # name1 = dmeta(name1)[0]
    # name2 = dmeta(name2)[0]

    # name1 = fuzzy.nysiis(name1)
    # name2 = fuzzy.nysiis(name2)

    m = SequenceMatcher(None, name1, name2)
    # Calculate an edit distance
    # print 'm',m.ratio()
    e = editdist.distance(name1, name2)
    # print 'e',e
    sm = StringMatcher(seq1=name1, seq2=name2)
    # return e
    # print sm.distance()
    return sm.distance()
Example No. 10
 def train(self, lemma2tokens={}):
     """
     Train the align by pairing each pair of tokens under a given lemma,
     which have an edit dist to each other < maximum alignment distance
     (i.e. also identical tokens are aligned to not over-abstract)
     """
     # declare containers for the alignment vectors:
     self.train_feature_dicts = []
     self.train_labels = []
     for lemma in lemma2tokens:
         tokens = lemma2tokens[lemma]
         for t1 in tokens:
             for t2 in tokens:
                 # combine and align items inside lemma that are close enough (or identical!):
                 if (t1 == t2) or (editdist.distance(t1, t2) < self.max_align_dist):
                     curr_dicts, curr_labels = self.transliterator.transliterate(t1, t2)
                     self.train_feature_dicts.extend(curr_dicts)
                     self.train_labels.extend(curr_labels)
     # transform the alignment vectors dicts into sklearn format
     self.vectorizer = DictVectorizer()
     self.train_X = self.vectorizer.fit_transform(self.train_feature_dicts)
     self.labelEncoder = preprocessing.LabelEncoder()
     self.train_y = self.labelEncoder.fit_transform(self.train_labels)
     self.clf = SGDClassifier(loss="log", penalty="l2")
     print("Training the SGD classifier for the alignator...")
     self.clf.fit(self.train_X, self.train_y)
     return
Example No. 11
 def test_02__fuzz(self):
     for i in range(0, 32) + range(128, 1024, 128):
         for j in range(0, 32):
             a = randstring(i)
             b = randstring(j)
             dist = editdist.distance(a, b)
             self.assert_(dist >= 0)
Example No. 14
def cal_distance(x1, x2, method='euclidean'):
    if method == 'euclidean':
        diff = x1 - x2
        sq_diff = diff * diff
        return(np.sqrt(np.sum(sq_diff)))
    elif method == 'editdist':
        return(editdist.distance(x1, x2))
Example No. 16
def similarity_unordered(text1, text2):
    """Calculates the similarity between two short strings.

    This is done by sorting the tokens in the string and then calculating the Levenshtein Edit Distance
    via http://www.mindrot.org/projects/py-editdist/

    Returns a value between 0 and 1.

    >>> similarity_unordered('Ich bin müde', 'Ich bin müde')
    1.0
    >>> similarity_unordered('Ich bin müde', 'müde bin ich')
    1.0
    >>> similarity_unordered('Ich bin müde', 'Ich bin rüde')
    0.92307692307692313
    >>> similarity_unordered('Ich bin müde', 'Ich bin prüde')
    0.85714285714285721
    >>> similarity_unordered('Ich bin müde', 'Wenn dein starker Arm es will, stehen alle Räder still.')
    0.10909090909090913
    """

    if not editdist:
        # Module not installed, default to 1
        return 1.0
    tokens1 = ' '.join(sorted(tokenize_text(text1)))
    tokens2 = ' '.join(sorted(tokenize_text(text2)))
    maxlen = float(max([len(tokens1), len(tokens2)]))
    distance = editdist.distance(tokens1.encode('utf-8'), tokens2.encode('utf-8'))
    divisor = max([distance, maxlen])
    if not divisor:
        print repr(text1), repr(text2), maxlen, distance
        print repr(tokens1), repr(tokens2), maxlen, distance
        return 0
    return 1 - (distance / divisor)
Example No. 17
def main(argv):
    words = set()

    for filename in argv[1:]:
        with open(filename) as f:
            for line in f:
                for word in line.strip().split(' '):
                    # TODO: remove puncts?
                    words.add(word)

    words = list(words)
    random.shuffle(words)

    pos = random.randint(0, len(words) - 1)
    chosen_word = words[pos]
    words = words[:pos] + words[pos+1:]
    sys.stdout.write(chosen_word + ' ')

    while words:
        best_dist = 10000000000
        best_pos = None
        for i in xrange(0, len(words)):
            word = words[i]
            dist = editdist.distance(chosen_word, word)
            if dist < best_dist:
                #print word, dist
                best_dist = dist
                best_pos = i

        pos = best_pos
        chosen_word = words[pos]
        words = words[:pos] + words[pos+1:]
        sys.stdout.write(chosen_word + ' ')

    sys.stdout.write('\n')
Example No. 18
    def matchtitle(self,gtitle,mtitle):

        short_key = mtitle.lower()
        key_title = gtitle.lower()
          
        exactmatched = False                             

        if short_key==key_title:
            exactmatched = True
        if not exactmatched: # no exact match; fall back to computing the Levenshtein distance
            x = ''
            y = ''
            for ch in short_key:
              if ch<=chr(127):
                x = x+ch
            for ch in key_title:
              if ch<=chr(127):
                y = y+ch
            short_key = x
            key_title = y
            ed = editdist.distance(short_key,key_title)
            print ed                    
            if ed < 15: # tunable threshold
              exactmatched = True

        return exactmatched
Example No. 19
def apply_baseline(train_toks, train_lems, test_toks, test_lems):
    print("Calculating baseline")
    train_dict = {}

    for tok, lem in zip(train_toks, train_lems):
        if tok not in train_dict:
            train_dict[tok] = {}
        if lem not in train_dict[tok]:
            train_dict[tok][lem] = 0
        train_dict[tok][lem] += 1

    silver_lemmas = []
    for test_tok, test_lem in zip(test_toks, test_lems):
        # shortcut:
        if test_tok in train_dict:
            k = test_tok
        else:
            candidates = train_dict.keys()
            distances = [(editdist.distance(test_tok, c), c)
                         for c in candidates]
            k = min(distances, key=itemgetter(0))[1]
        silver_lem = max(train_dict[k].iteritems(), key=itemgetter(1))[0]
        silver_lemmas.append(silver_lem)

    return silver_lemmas
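A minimal usage sketch of the baseline above (toy data; assumes `editdist` and `operator.itemgetter` are imported as in the snippet):

train_toks = ["running", "ran", "runs"]
train_lems = ["run", "run", "run"]
# "runing" is unseen, but only 1 edit from "running", so it borrows that lemma
print apply_baseline(train_toks, train_lems, ["runing"], ["run"])   # ['run']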
Example No. 20
	def pickRecurse(self, track, info):
		if len(info) == 0: #if this is the last level in the trie, find the earliest year
			closest = min(self)
			return self[closest]
		else: #if not, find the key with the closest distance
			closest = min([(distance(x, info[0]), x) for x in self])[1]
			return self[closest].pickRecurse(track, info[1:])
Example No. 21
	def findBestResponse(self, input):
		bestDistance = maxint
		for query in self.convoDict:
			tempDist = distance(query, input)
			if tempDist < bestDistance and len(self.convoDict[query]) > 0:
				bestResponses = self.convoDict[query]
				bestDistance = tempDist
		return choice(bestResponses)
Example No. 22
def trim_loc(a, b):
    """find best edit distance and return its index else return length of a"""
    la, lb = len(a), len(b)
    dists = []
    for i in xrange(0, la-lb+1):
        dists.append(ed.distance(a[i:i+lb], b))
    best = min(dists)
    # 20% mismatch okay for now
    return dists.index(best) if best < .2*lb else la
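Illustrative calls (again assuming `ed` is py-editdist):

print trim_loc("AAACGTAAA", "CGT")   # 3 -- best window starts at index 3 with distance 0
print trim_loc("AAAAAAAAA", "CGT")   # 9 -- nothing beats the 20% budget, so len(a) is returned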
Example No. 23
def find_filename(string, match):
    editD = 1000
    fn = ''
    for f in match:
        dis = editdist.distance(string, match[f])
        if dis < editD:
            editD = dis
            fn = f
    return fn  # return the best-matching key, not the last one examined
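A usage sketch (the dict shape is an assumption read off the code: it appears to map filenames to display strings, returning the closest key):

match = {"report_v1.txt": "quarterly report", "notes.txt": "meeting notes"}
print find_filename("quartely report", match)   # 'report_v1.txt' -- closest value wins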
Example No. 24
    def calculate_majorseq(self, clusterids=None, table_prefix=None, seq_start_idx=6):
        """ Calculate the majority sequence observed in clusters and what percentage of the
        cluster it makes up. Note that this can differ from the representative sequence.

        Also calculates a measure of self-similarity, measuring the percentage of reads
        1 edit distance away, 2 edit distances away, etc.
        """

        if table_prefix is None:
            members_table_name = 'members'
            cluster_table_name = 'clusters'
        else:
            members_table_name = table_prefix + '_members'
            cluster_table_name = table_prefix + '_clusters'

        if clusterids is None:
            # Find Last cluster id 
            c = self.con.execute(''' SELECT COUNT(*) FROM {0}'''.format(cluster_table_name))
            clusterid_max = c.fetchone()['count(*)']
            clusterids = range(1, clusterid_max + 1)

        for cid in clusterids:

            cluster = self.get_cluster_by_id(cid, items=['seqid', 'seq'], table_prefix=table_prefix)

            # Fetch all unique seq data and find most common 
            cluster.get_unique_seq(seq_start_idx=seq_start_idx, db=self)
            majorSeq = cluster.unique_seqs.most_common()[0][0]

            if majorSeq != cluster.rep_seq[seq_start_idx:]:
                majorSeqIsRepSeq = False
            else:
                majorSeqIsRepSeq = True

            majorSeqPerc = (cluster.unique_seqs.most_common()[0][1] / float(cluster.size)) * 100

            # Calculate metric for self similarity 
            selfsimilarity = []
            # First work out lev distance between top 5 unique seqs
            top5seqs = cluster.unique_seqs.most_common()[:5]

            # selfsimilarity = [( cumulative_percentage, edit distance), ... (  )]
            for idx, (seq, count) in enumerate(top5seqs):
                if idx != 0:
                    perc = (int((count / float(cluster.size)) * 100) * 100) / 100.0
                    d = ed.distance(majorSeq, seq)
                    selfsimilarity.append((perc, d))

            # Update info for cluster
            with self.con as con:

                sql_query = '''UPDATE {0} SET majorSeq = ?, majorSeqIsRepSeq = ?,
                                majorSeqPerc = ?, selfsimilarity = ? WHERE clusterid = ?'''.format(
                    cluster_table_name)

                con.execute(sql_query, (majorSeq, majorSeqIsRepSeq, majorSeqPerc,
                                        str(selfsimilarity), cid))
Example No. 25
def fuzzy_wellbc_match(obs_wellbc, well_barcodes, start_pos, end_pos):
    '''
    This function takes a read and searches for supplied barcode sequences.
    
    Parameters
    ----------
    obs_wellbc: str, fastq sequence before the first instance of AD1
        e.g. ATGCATG
    
    well_barcodes: list, expected well barcodes
        e.g. ATGCATG
    
    start_pos: int, limits the string search space of the obs_wellbc
    
    end_pos: int, limits the string search space of the obs_wellbc
    
    Returns
    -------
    The expected barcode found in the obs_wellbc OR 'mismatch' (if no barcode is found) 
    
    
    '''

    assert type(start_pos) == int, 'start_pos should be an int'
    assert type(end_pos) == int, 'end_pos should be an int'

    #initializing obs_wellbc variable and set of expected well barcodes
    FASTQ, bc_set = obs_wellbc.upper(), set(well_barcodes)

    #limit search for exact matches to [: end_pos]
    matches = set(FASTQ[n:n + 8] for n in range(0, end_pos)
                  if FASTQ[n:n + 8] in bc_set)

    #DEPRECATED
    #[matches.add(FASTQ[n:n+8]) for n in range(0, end_pos) if FASTQ[n:n+8] in bc_set]

    #RETURNS EXPECTED BARCODE SEQUENCE IF UNIQUE MATCH FOUND
    if len(matches) == 1:
        return (0, list(matches)[0])  #return the single best match

        #BRUTE FORCE SEARCH OF obs_wellbc subsequence
    else:
        matches = set()
        for bars in well_barcodes:  #Differentiate between more than 1 exact match, or find fuzzy matches
            BARS = bars.upper()  #added a bit of ADAPTOR1 to make the mappings more stringent
            edist = editdist.distance(FASTQ, BARS)
            delta_8 = abs(len(FASTQ) -
                          8)  #correcting for difference in string seq lengths
            if edist < 2:
                return (edist, BARS)
            else:
                if edist + delta_8 < 2:  #looser thresholds performed poorly and only added maybe a couple hundred reads out of a million
                    matches.add((edist, bars))
        if len(matches) > 0:
            return (len(matches), ";".join([i[1] for i in matches]))
        return (8, "mismatch")
Example No. 26
 def f(rec):
     ''' Filter function for cutsite'''
     
     cutsite = rec.seq[6: 6 + cutsite_length].tostring() 
     if cutsite.endswith(overhang):
         return True
     else:
         overhang_dist = ed.distance(cutsite[-overhang_length:], overhang)
         return overhang_dist <= mindist
Example No. 27
def extract_translations(dict_csv_file):
    reader = csv.reader(open(dict_csv_file), dialect=csv.excel)
    headers = {}
    translations_by_worker = {}
    worker_stats = {}
    for i, header in enumerate(reader.next()):
        headers[header] = i
    for row in reader:
        workerID = row[headers['WorkerId']]
        status = row[headers['AssignmentStatus']]
        if status == 'Approved':
            for i in range(1, 13):
                word = row[headers['Input.word_' + str(i)]].decode('utf8')
                translation = row[headers['Answer.translation_' + str(i) +
                                          '_1']].decode('utf8')
                if not word in translations_by_worker:
                    translations_by_worker[word] = {}
                translations_by_worker[word][workerID] = translation
                if (i <= 2):
                    gold = row[headers['Input.translation_' +
                                       str(i)]].decode('utf8')
                    try:
                        edit_distance = float(
                            editdist.distance(gold.lower(),
                                              translation.lower())) / len(gold)
                    except:
                        edit_distance = 1
                    if not workerID in worker_stats:
                        worker_stats[workerID] = {}
                        worker_stats[workerID]['num_translations'] = 0
                        worker_stats[workerID]['total_edit_distance'] = 0
                    worker_stats[workerID]['num_translations'] += 1
                    worker_stats[workerID][
                        'total_edit_distance'] += edit_distance
    # calculate the performance of each worker
    for workerID in worker_stats:
        num_translations = worker_stats[workerID]['num_translations']
        total_edit_distance = worker_stats[workerID]['total_edit_distance']
        avg_edit_distance = total_edit_distance / num_translations
        worker_stats[workerID]['avg_edit_distance'] = avg_edit_distance
    # extract the best translations
    best_translations = {}
    for word in translations_by_worker:
        best_translation = ''
        best_edit_distance = 1000
        for workerID in translations_by_worker[word]:
            if worker_stats[workerID][
                    'avg_edit_distance'] <= best_edit_distance:
                best_translation = translations_by_worker[word][workerID]
                best_edit_distance = worker_stats[workerID][
                    'avg_edit_distance']
        if best_translation != '':
            best_translation = best_translation.replace(' ', '_')
            best_translations[word] = best_translation
    print len(best_translations.keys())
    return best_translations
Example No. 28
	def matchAuthors_strict_v1(self, google_author_string, authors, debug_output=False):
		'''If the two author string matched, return True
		@return: boolean
		@param: 
		- google_author_string, e.g.
			… DeSmedt, W Du, W Kent, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org
			R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org
			 
		- authors, e.g.
			Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan

		'''
		ignore_sign = '&hellip;'
#		ignore_sign = '…'

		# process google part
		mark = google_author_string.find(' - ')
		if mark != -1:
			google_author_string = google_author_string[:google_author_string.find(' - ')]
		google_author_string = re.sub("(<(.*?)>)", "", google_author_string)
		google_author_string = re.sub("[^A-Za-z0-9,\s%s]" % ignore_sign, "", google_author_string)
		google_author_string = re.sub("\\s+", " ", google_author_string)
		google_author_string = google_author_string.strip()

		ignore_left = google_author_string.startswith(ignore_sign)
		ignore_right = google_author_string.endswith(ignore_sign)
		compact_google_str = self.__trans_to_compact(google_author_string, ignore_sign)
#		print '--- ', compact_google_str

		# process author part
		compact_authors = self.__trans_to_compact(authors, ignore_sign);
#		print ',,, ', compact_authors

		# compare
		cmp_gc = ''
		cmp_db = ''
		if ignore_left and not ignore_right:  # and compact_authors.endswith(compact_google_str):
			cmp_gc = compact_google_str
			cmp_db = compact_authors[-len(compact_google_str):]
		elif not ignore_left and ignore_right:  # and compact_authors.startswith(compact_google_str):
			cmp_gc = compact_google_str
			cmp_db = compact_authors[:len(compact_google_str)]
		elif ignore_left and ignore_right and compact_authors.find(compact_google_str) != -1:
			return True  # todo
		elif not ignore_left and not ignore_right:  # and compact_authors == compact_google_str:
			cmp_gc = compact_google_str
			cmp_db = compact_authors
		else:
			return False

		edd = editdist.distance(cmp_gc, cmp_db)
		if edd > 0:
			if debug_output:
				print '[ERR] editdist for "%s" and "%s" is %s' % (cmp_gc, cmp_db, edd)
		if edd <= 2:
			return True
Example No. 29
def main(argv):
    filenames = argv[1:]

    words = []

    for filename in tqdm(filenames):
        with open(filename, 'r') as f:
            for line in f:
                bits = line.strip().split()
                for bit in bits:
                    words.extend(bit.split('--'))

    sentences = []
    sentence = []
    for word in tqdm(words):
        if word.startswith(('"', "'")):
            word = word[1:]
        if word.endswith(('"', "'")):
            word = word[:-1]
        sentence.append(word)
        if word not in ('Mr.', 'Mrs.', 'Dr.') and word.endswith(
            ('.', '!', '?')):
            sentences.append(sentence)
            sentence = []

    sentences.append(sentence)

    for sentence in sentences:
        distances = {}  # frozenset of two (word, pos) tuples -> distance
        for (pos1, word1) in enumerate(sentence):
            for (pos2, word2) in enumerate(sentence):
                if word1 == word2:
                    continue
                if MIN_LENGTH:
                    if len(word1) < MIN_LENGTH or len(word2) < MIN_LENGTH:
                        continue
                dist = editdist.distance(word1, word2)
                pair = frozenset([(word1, pos1), (word2, pos2)])
                if pair in distances:
                    assert distances[pair] == dist
                distances[pair] = dist

        smallest_distance = 100000000
        smallest_pair = None
        for pair, distance in distances.iteritems():
            if distance < smallest_distance:
                smallest_distance = distance
                smallest_pair = pair

        if smallest_pair is not None:
            smallest_pair = list(smallest_pair)
            (word1, pos1) = smallest_pair[0]
            (word2, pos2) = smallest_pair[1]
            sentence[pos2] = word1
            sentence[pos1] = word2
            print ' '.join(sentence)
Example No. 30
def same(d1, d2, value_of_same=0.1):
    # dis = editdist.distance(d1.upper(), d2.upper().encode('utf-8'))
    # editdist accepts two str or two unicode arguments, but on a type mismatch
    # it converts via ascii, which fails for non-ascii input
    dis = editdist.distance(unicode_to_str(remove_uneeded(d1.upper())),
                            unicode_to_str(remove_uneeded(d2.upper())))
    longest_len = max(len(d1), len(d2))
    levenshtein_to_len = (1.0 * dis) / longest_len
    return levenshtein_to_len < value_of_same
Example No. 31
def tokens_are_similar(t1,t2):
  similarity = 1 - 1.0 * editdist.distance(t1, t2) / max(len(t1), len(t2))
  if similarity == 1 or (len(t1) > 3 and similarity > 0.5):
    return True

#  dist = embeddings_dist(t1, t2)
#  if dist < 0.001:
#    print 'embeddings think those are similar: {0}, {1}'.format(t1, t2)
#    return True
  
  return False
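For example, with the formula above:

print tokens_are_similar("network", "networks")   # True  -- similarity 0.875, and len("network") > 3
print tokens_are_similar("cat", "car")            # False -- tokens of length <= 3 must match exactly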
Example No. 32
        def cutsite_filter(rec):
            """ Filter function for cutsite """

            cutsite = rec.seq[midtag_length: midtag_length + cutsite_length].tostring()

            for target_site in target_cutsites:
                cutsite_dist = editdist.distance(target_site, cutsite)
                if cutsite_dist <= max_edit_dist:
                    return True

            return False
Example No. 33
def fuzzy_wellbc_match(obs_wellbc, well_barcodes, start_pos, end_pos):
    '''
    This function takes a read and searches for supplied barcode sequences.
    
    Parameters
    ----------
    obs_wellbc: str, fastq sequence before the first instance of AD1
        e.g. ATGCATG
    
    well_barcodes: list, expected well barcodes
        e.g. ATGCATG
    
    start_pos: int, limits the string search space of the obs_wellbc
    
    end_pos: int, limits the string search space of the obs_wellbc
    
    Returns
    -------
    The expected barcode found in the obs_wellbc OR 'mismatch' (if no barcode is found) 
    
    
    '''
    
    assert type(start_pos) == int, 'start_pos should be an int'
    assert type(end_pos) == int, 'end_pos should be an int'
    
    #initializing obs_wellbc variable and set of expected well barcodes
    FASTQ, bc_set = obs_wellbc.upper(), set(well_barcodes) 
    
    #limit search for exact matches to [: end_pos]
    matches = set(FASTQ[n:n+8] for n in range(0, end_pos) if FASTQ[n:n+8] in bc_set)
    
    #DEPRECATED
    #[matches.add(FASTQ[n:n+8]) for n in range(0, end_pos) if FASTQ[n:n+8] in bc_set]
    
    
    #RETURNS EXPECTED BARCODE SEQUENCE IF UNIQUE MATCH FOUND
    if len(matches) == 1: return (0, list(matches)[0])  #return the single best match

    #BRUTE FORCE SEARCH OF obs_wellbc subsequence
    else: 
        matches = set()
        for bars in well_barcodes:  #Differentiate between more than 1 exact match, or find fuzzy matches
            BARS = bars.upper()  #added a bit of ADAPTOR1 to make the mappings more stringent
            edist = editdist.distance(FASTQ, BARS)
            delta_8 = abs(len(FASTQ) - 8)  #correcting for difference in string seq lengths
            if edist < 2:
                return (edist, BARS)
            else:
                if edist + delta_8 < 2:  #looser thresholds performed poorly and only added maybe a couple hundred reads out of a million
                    matches.add((edist, bars))
        if len(matches) >0: return (len(matches), ";".join([i[1] for i in matches]) )
        return (8, "mismatch")
Example No. 34
        def overhang_filter(rec):
            ''' Filter function for cutsite'''

            cutsite = rec.seq[midtag_length: midtag_length + cutsite_length].tostring()

            for i, pat in enumerate(overhang_patterns):

                dist = editdist.distance(target_cutsites[i], cutsite)
                if dist <= max_edit_dist:
                    if cutsite.endswith(pat):
                        return True

            return False
Example No. 35
def output_word(word, words):
    best_x = None
    best_dist = 1000000000
    for x, candidate in enumerate(words):
        dist = editdist.distance(word, candidate)
        if dist < best_dist:
            best_dist = dist
            best_x = x
            if best_dist == 0:
                break
    chosen = words.pop(best_x)
    sys.stdout.write(chosen + ' ')
    sys.stdout.flush()  # 'cos it's a bit pokey :)
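A small usage sketch (toy word list):

words = ["dog", "cats", "fish"]
output_word("cat", words)          # writes "cats " to stdout (edit distance 1)
assert words == ["dog", "fish"]    # the chosen word was popped from the list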
Example No. 36
def surname_compatibility(sa, sb):
    name_comparison_print('|-- Comparing surnames: %s %s'% (sa,sb))
    MAX_ALLOWED_SURNAME_DISTANCE_PERCENT = 0.33
    sa = clean_name_string(sa, replacement='', keep_whitespace=False, trim_whitespaces=True)
    sb = clean_name_string(sb, replacement='', keep_whitespace=False, trim_whitespaces=True)
    dist = distance(sa, sb)
    ml = float(max(len(sa),len(sb)))
    name_comparison_print('|--- dist:%s, ml:%s' % (dist,ml))

    if ml==0 or dist/ml > MAX_ALLOWED_SURNAME_DISTANCE_PERCENT:
        return 0.0
    else:
        return 1.-float(dist)/max(len(sa),len(sb))
Example No. 37
def test_levenshtein(class_data, query):
	all_levenshtein = []
	for i, class_datum in enumerate(class_data):
		class_title = class_datum.get('full_title')
		all_levenshtein.append((i, editdist.distance(query, class_title.encode('utf-8'))))
	all_levenshtein_sorted = sorted(all_levenshtein,key=lambda x: x[1])
	top_lev = []
	for i, tup in enumerate(all_levenshtein_sorted):
		pos, lev = tup
		print "{0}: '{1}' with levenshtein distance of {2}".format(i, class_data[pos].get('full_title'), lev)
		top_lev.append(class_data[pos].get('title'))
		if i == 9:
			break
Example No. 39
 def _min_names_screwup_list(nalo, nalt):
     nalo = list(nalo)
     nalt = list(nalt)
     sl = []
     for n in nalo:
         maxs = max(len(n), max((len(k) for k in nalt)))
         all_scr = [distance(n,k) for k in nalt]
         mins = min(all_scr)
         sl.append((mins,maxs))
         nalt.pop(all_scr.index(mins))
         if len(nalt) < 1:
             break
     return sl
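For instance, assuming `distance` is `editdist.distance` as elsewhere on this page:

print _min_names_screwup_list(["John", "Ronald"], ["Jon", "Ronald"])
# [(1, 6), (0, 6)] -- d("John", "Jon") = 1, then "Ronald" matches its twin exactly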
Example No. 41
def get_primer(query, primers, n):
    """return the primer name and the length to trim."""
    primer = ""
    distance = n + 1
    for name, target in primers.iteritems():
        d = ed.distance(query[:len(target)], target)
        if d < distance:
            distance = d
            primer = name
    if distance < n:
        return primer, len(primers[primer])
    else:
        return False, False
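For instance (toy primer table; `ed` is py-editdist as above):

primers = {"fwd": "ACGTAC", "rev": "TTGCAA"}
print get_primer("ACGAACGGGGTTTT", primers, 2)   # ('fwd', 6) -- one mismatch, trim 6 bases
print get_primer("CCCCCCCCCC", primers, 2)       # (False, False) -- nothing within 2 edits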
Example No. 42
 def get_levenshtein_candidates(self, test_token="token", token2lemmas={}):
     """
     Function returns an initial, rough selection of levenshtein candidates
     """
     candidates = []
     for train_token in token2lemmas:
         train_lemmas = token2lemmas[train_token]
         # calculate the edit distance between the test_token and all seen tokens
         edit_dist = editdist.distance(test_token, train_token)
         # append the training item as a candidate if it is close enough:
         if edit_dist <= self.max_lev_dist:
             candidates.append([train_token, train_lemmas, edit_dist])
     if candidates:
         return candidates
Example No. 43
 def cutsite_filter(rec):
     ''' Filter function for cutsite '''
     
     fname = self.current_file
     
     if cutsite_filter.target_file is None or cutsite_filter.target_file != fname:
         cutsite_filter.target_file = fname
         tags = self.get_data4file(fname, fields=['MIDtag'])
         cutsite_filter.MIDlength =  len(tags[0][0])
     
     cutsite = rec.seq[cutsite_filter.MIDlength: cutsite_filter.MIDlength + cutsite_length].tostring()
     cutsite_dist = ed.distance(target_cutsite, cutsite)
     
     return cutsite_dist <= max_edit_dist
Example No. 44
def compareNamesLc(name, myText):
    myTextOrig = myText
    nameOrig = name
    nameIsAcronym = 0
    myTextOrigList = myText.split()
    if name.isupper():
        nameIsAcronym = 1
    else:
        myText = string.lower(myText).strip()
        name = string.lower(name).strip()
    nameList = name.split()
    lengthName = len(nameList)
    myTextList = myText.split()
    lengthText = len(myTextList)
    resultHits = []
    for t in range(0, lengthText - lengthName + 1):
        testName = " ".join(myTextList[t:t + lengthName])
        if testName == name:
            # exact match
            if nameIsAcronym:
                testNameOrig = " ".join(myTextOrigList[t:t + lengthName])
                if testNameOrig.isupper():  # match only with acronyms!
                    resultHits.append([t, t + lengthName])
            else:
                resultHits.append([t, t + lengthName])
        else:
            # fuzzy match
            charactersName = list(name)
            charactersMyText = list(testName)
            maxDistance = max(1,
                              len(charactersName) / 7)  # just some heuristic
            if len(charactersName) < 2 or abs(
                    len(charactersName) - len(charactersMyText)) > maxDistance:
                continue
            distanceNames = editdist.distance(name, testName)
            if distanceNames > maxDistance:
                continue
            else:
                if nameIsAcronym:  # only exact matches for acronyms
                    testNameOrig = " ".join(myTextOrigList[t:t + lengthName])
                    if testNameOrig.isupper():  # match only with acronyms!
                        logger.info("found similar but not equal names: " +
                                    name + " - " + testNameOrig)
                        resultHits.append([t, t + lengthName])
                else:
                    logger.info("found similar but not equal names: " + name +
                                " - " + testName)
                    resultHits.append([t, t + lengthName])
    return resultHits
Example No. 45
def match_index(t,idx_d,idx_len=None,mismatch_allowed=1):
    '''given an index read sequence a dictionary of form {"<read_sequence>":"<index_number>" ...}
    returns <index_number> if the best match is the only index within mismatch_allowed

    '''

    # removed in variable length setup
    #if idx_len is None:
    #    idx_len = list(set([len(k) for k in idx_d]))[0]
	
    tagdist = sorted([(distance(t_this,t[:len(t_this)]),t_this) for t_this in idx_d.keys()])
    if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed:
        return idx_d[tagdist[0][1]]
    else:
        return None
Example No. 46
def match_index(t, idx_d, idx_len=None, mismatch_allowed=1):
    '''given an index read sequence a dictionary of form {"<read_sequence>":"<index_number>" ...}
    returns <index_number> if the best match is the only index within mismatch_allowed

    DOES NOT check to make sure all indices are idx_len (even if left to get idx_len, i.e. no idx_len supplied)
    '''

    if idx_len is None:
        idx_len = list(set([len(k) for k in idx_d]))[0]

    tagdist = sorted([(distance(t_this, t[:idx_len]), t_this)
                      for t_this in idx_d.keys()])
    if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed:
        return idx_d[tagdist[0][1]]
    else:
        return None
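A quick sketch of the disambiguation behaviour (assuming `distance` is `editdist.distance`, imported by the surrounding module):

idx_d = {"ACGT": 1, "TGCA": 2}
print match_index("ACGTTTTT", idx_d)   # 1 -- exact match; "TGCA" is 4 edits away
print match_index("ACCTTTTT", idx_d)   # 1 -- one mismatch, still unambiguous
print match_index("GGGGTTTT", idx_d)   # None -- no tag within 1 mismatch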
Example No. 47
def compare_strings(str1, str2):
    """Compares 2 strings with the Levenshtein distance and returns a normalized
    value between 0.0 and 1.0 (meaning totally different and exactly the same
    respectively."""
    if is_editdist_loaded:
        if str1 == str2:
            return 1.0
        max_len = max(len(str1), len(str2))
        if max_len == 0:
            return 0.0
        distance = editdist.distance(str1, str2)
        return (max_len - distance) / float(max_len)
    else:
        # the edit distance module is not loadable, we have to fail the comparison
        # all the strings will be treated as completely different
        return 0.0
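For example, when the editdist extension imported successfully (`is_editdist_loaded` is True):

print compare_strings("color", "colour")   # 0.8333... -- one edit over max length 6
print compare_strings("same", "same")      # 1.0 -- equality short-circuits
print compare_strings("abc", "xyz")        # 0.0 -- every character differs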
Example No. 48
def clean_list(L, sub_dict):

    i = 0
    while i + 1 < len(L):
        s1 = L[i]
        s2 = L[i + 1]

        if s1 == s2[:len(s1)]:

            suffix = s2[len(s1):]
            if suffix.strip().startswith(':'):
                sub_dict[L[i + 1].strip().lower()] = L[i]
                del L[i + 1]
                continue

            elif suffix.startswith(' '):
                sub_dict[L[i].strip().lower()] = L[i + 1]
                del L[i]
                continue

        d = editdist.distance(s1.lower(), s2.lower())

        if (s1.lower() == s2.lower() or d <= 2
                or s1.lower().replace(' ', '') == s2.lower().replace(' ', '')):

            if len(s1) > len(s2):
                sub_dict[L[i + 1].strip().lower()] = L[i]
                del L[i + 1]
            else:
                sub_dict[L[i].strip().lower()] = L[i + 1]
                del L[i]

            continue

        try:
            if re.match(s1.replace('.', '[A-Za-z]+'), s2) is not None:
                sub_dict[L[i].strip().lower()] = L[i + 1]
                del L[i]
                continue
        except:
            pass

        i += 1
Example No. 49
def initials_compatibility(ia, ib):
    max_n_initials = max(len(ia), len(ib))
    initials_intersection = set(ia).intersection(set(ib))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(ia).union(set(ib))
    n_initials_union = len(initials_union)
    initials_distance = distance("".join(ia), "".join(ib))

    name_comparison_print('|-- Comparing initials, %s %s' % (ia, ib))
    name_comparison_print('|--- initials distance %s' % (initials_distance))

    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    name_comparison_print('|--- initials c %s' % (initials_c))

    if len(ia) > len(ib):
        alo = ia
        alt = ib
    else:
        alo = ib
        alt = ia
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    name_comparison_print('|--- initials screwup, %s ' % (initials_screwup))
    name_comparison_print('|--- initials distance, %s' % (initials_distance))

    return max(
        0.0, 0.8 * initials_c + 0.1 * (1 - initials_distance) + 0.1 *
        (1 - initials_screwup))
Example No. 50
    def matchAuthors_strict_v1(self,
                               google_author_string,
                               authors,
                               debug_output=False):
        '''If the two author string matched, return True
		@return: boolean
		@param: 
		- google_author_string, e.g.
			… DeSmedt, W Du, W Kent, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org
			R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org
			 
		- authors, e.g.
			Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan

		'''
        ignore_sign = '&hellip;'
        #		ignore_sign = '…'

        # process google part
        mark = google_author_string.find(' - ')
        if mark != -1:
            google_author_string = google_author_string[:google_author_string.
                                                        find(' - ')]
        google_author_string = re.sub("(<(.*?)>)", "", google_author_string)
        google_author_string = re.sub("[^A-Za-z0-9,\s%s]" % ignore_sign, "",
                                      google_author_string)
        google_author_string = re.sub("\\s+", " ", google_author_string)
        google_author_string = google_author_string.strip()

        ignore_left = google_author_string.startswith(ignore_sign)
        ignore_right = google_author_string.endswith(ignore_sign)
        compact_google_str = self.__trans_to_compact(google_author_string,
                                                     ignore_sign)
        #		print '--- ', compact_google_str

        # process author part
        compact_authors = self.__trans_to_compact(authors, ignore_sign)
        #		print ',,, ', compact_authors

        # compare
        cmp_gc = ''
        cmp_db = ''
        if ignore_left and not ignore_right:  # and compact_authors.endswith(compact_google_str):
            cmp_gc = compact_google_str
            cmp_db = compact_authors[-len(compact_google_str):]
        elif not ignore_left and ignore_right:  # and compact_authors.startswith(compact_google_str):
            cmp_gc = compact_google_str
            cmp_db = compact_authors[:len(compact_google_str)]
        elif ignore_left and ignore_right and compact_authors.find(
                compact_google_str) != -1:
            return True  # todo
        elif not ignore_left and not ignore_right:  # and compact_authors == compact_google_str:
            cmp_gc = compact_google_str
            cmp_db = compact_authors
        else:
            return False

        edd = editdist.distance(cmp_gc, cmp_db)
        if edd > 0:
            if debug_output:
                print '[ERR] editdist for "%s" and "%s" is %s' % (cmp_gc,
                                                                  cmp_db, edd)
        if edd <= 2:
            return True
Example No. 51
def cluster(inPairs, bcLen, minRealNum=3, minRealFrac=0.1, minMergeFactor=6):
    uniqueIDs = {}
    totalCnt = 0
    prefixLen = bcLen / 2
    misMatchCnt = [0] * (bcLen + 1)
    distCnt = [0, 0, 0, 0]

    # garbage collector barcode
    allNBarcode = "N" * bcLen

    # group reads by barcode
    for (readID, barcode) in inPairs:
        if barcode not in uniqueIDs:
            uniqueIDs[barcode] = set()
        uniqueIDs[barcode].add(readID)
        totalCnt += 1

    # count how many times each unique barcode occurs, and get the read count for the most frequent barcode
    largestUniq = 0
    uniqBCCnts = {}
    barcodeParent = {}
    childBarcodes = {}
    for barcode in uniqueIDs:
        uniqBCCnts[barcode] = len(uniqueIDs[barcode])
        barcodeParent[barcode] = "_UNKNOWN_"  #ambiguous barcode
        childBarcodes[barcode] = []
        if uniqBCCnts[barcode] > largestUniq:
            largestUniq = uniqBCCnts[barcode]

    if allNBarcode not in uniqBCCnts:
        uniqBCCnts[allNBarcode] = 0
        barcodeParent[allNBarcode] = "_SELF_"
        childBarcodes[allNBarcode] = []

    # iteration 1: mark barcodes as real, or merge them with other barcodes if they are within 1 bp of a real barcode
    prefixHash = {}
    suffixHash = {}
    sortedBarcodeList = sorted(uniqBCCnts.iteritems(),
                               key=lambda x: x[1],
                               reverse=True)
    for (bcA, bcACnt) in sortedBarcodeList:
        prefix = bcA[:prefixLen]
        suffix = bcA[0 - prefixLen:]
        if prefix not in prefixHash:
            prefixHash[prefix] = []
        if suffix not in suffixHash:
            suffixHash[suffix] = []
        prefixHash[prefix].append(bcA)
        suffixHash[suffix].append(bcA)
        if bcA.find("N") == -1 and uniqBCCnts[
                bcA] > minRealFrac * largestUniq and uniqBCCnts[
                    bcA] >= minRealNum:
            barcodeParent[bcA] = "_SELF_"  # this is a real barcode
            continue
        for realBCList in (prefixHash[prefix], suffixHash[suffix]):
            for bcB in realBCList:
                if barcodeParent[bcB] != "_SELF_":
                    continue
                (similar, misMatchPos) = isSimilar(bcB, bcA, bcLen)
                if similar:
                    barcodeParent[bcA] = bcB
                    childBarcodes[bcB].append(bcA)
                    misMatchCnt[misMatchPos] += bcACnt
                    distCnt[1] += bcACnt
                    break
            if barcodeParent[bcA] != "_UNKNOWN_":
                break  # already assigned a parent in prefix list

    # iteration 2: mark barcodes as real, or merge them with other barcodes if they are within 1 bp of another real or merged barcode
    level2Parent = {}
    level3Parent = {}
    for (bcA, bcACnt) in sortedBarcodeList:
        prefix = bcA[:prefixLen]
        suffix = bcA[0 - prefixLen:]
        if barcodeParent[bcA] != "_UNKNOWN_":
            continue
        for realBCList in (prefixHash[prefix], suffixHash[suffix]):
            for bcB in realBCList:
                if barcodeParent[bcB] == "_SELF_":
                    (similar, misMatchPos) = isSimilar(bcB, bcA, bcLen)
                    if similar:
                        barcodeParent[bcA] = bcB
                        childBarcodes[bcB].append(bcA)
                        misMatchCnt[misMatchPos] += bcACnt
                        distCnt[1] += bcACnt
                        break
                    continue
                elif barcodeParent[bcB] != "_UNKNOWN_":
                    (similar, misMatchPos) = isSimilar(bcB, bcA, bcLen)
                    if similar:  # checking if bcA is within 1 bp of bcB
                        if bcA not in level2Parent:
                            level2Parent[bcA] = set()
                            distCnt[2] += bcACnt
                        level2Parent[bcA].add(
                            barcodeParent[bcB]
                        )  # do not make this parent yet: doing so will cause >2 mismatch links to parents

            # already assigned a parent: no need to check any further
            if barcodeParent[bcA] != "_UNKNOWN_" or bcA in level2Parent:
                break

        # do a complete global alignment if we do not still find similarity
        if barcodeParent[bcA] == "_UNKNOWN_" and bcA not in level2Parent:
            for (bcB, bcBCnt) in sortedBarcodeList:
                if barcodeParent[bcB] != "_SELF_":
                    continue
                if bcBCnt < minMergeFactor * bcACnt:
                    break
                editDistance = editdist.distance(bcB, bcA)
                if editDistance <= 2:
                    if bcA not in level2Parent:
                        level2Parent[bcA] = set()
                        distCnt[2] += bcACnt
                    level2Parent[bcA].add(bcB)
                    break
                elif len(bcA) == bcLen - 3:
                    if bcA == bcB[3:] or bcA == bcB[0:bcLen - 3]:
                        if bcA not in level3Parent:
                            level3Parent[bcA] = set()
                            distCnt[3] += bcACnt
                        level3Parent[bcA].add(bcB)
                        break
        if barcodeParent[
                bcA] == "_UNKNOWN_" and bcA not in level2Parent and bcA not in level3Parent:
            barcodeParent[
                bcA] = "_SELF_"  # not within 1-bp of any child of any real barcode: this must be real as well

    # clean up and make level2 parent as the full parent
    for bcA in uniqBCCnts:
        if bcA in level2Parent:
            bcAParent = list(level2Parent[bcA])[
                0]  # arbitrarily pick the first one if multiple level2 parents
            barcodeParent[bcA] = bcAParent
            childBarcodes[bcAParent].append(bcA)
            distCnt[2] += uniqBCCnts[bcA]
        elif bcA in level3Parent:
            bcAParent = list(level3Parent[bcA])[
                0]  # arbitrarily pick the first one if multiple level3 parents
            barcodeParent[bcA] = bcAParent
            childBarcodes[bcAParent].append(bcA)
            distCnt[3] += uniqBCCnts[bcA]

    #DEBUG
    #for (bcA, bcACnt) in sortedBarcodeList:
    #   if barcodeParent[bcA] == "_SELF_":
    #      distCnt[0] += bcACnt
    #      print ("\t".join((bcA,str(bcACnt))))
    #      for bcB in childBarcodes[bcA]:
    #         print("\t\t" + "\t".join((bcB, str(uniqBCCnts[bcB]))))
    #clusterInfo = []
    #clusterInfo.append(totalCnt)
    #clusterInfo.extend(distCnt)
    #clusterInfo.extend(misMatchCnt)
    #print("cluster stats:\t" + "\t".join((str(x) for x in clusterInfo)))

    # output
    readDict = {}
    for (bcA, bcACnt) in sortedBarcodeList:
        if bcA == allNBarcode:
            continue
        if barcodeParent[bcA] == "_SELF_":
            readDict[bcA] = []
            readDict[bcA].extend(uniqueIDs[bcA])
            for bcB in childBarcodes[bcA]:
                readDict[bcA].extend(uniqueIDs[bcB])

    # reformat output, and hack around a bug in the algorithm above
    mts = {}
    for (mt, readIds) in readDict.iteritems():
        numReads = len(readIds)
        for readId in readIds:
            if readId in mts:  # ERROR! - read assigned to more than one MT!!!!
                #print("umi_cluster: bug in barcode clustering - read assigned to more than one MT centroid, readId: {}, MT1: {}, MT2: {}".format(readId,mts[readId],mt))
                continue
            mts[readId] = (mt, numReads)

    # done
    return mts
Example No. 52
def compare_names(origin_name, target_name):
    '''
    Compare two names.
    '''
    AUTHORNAMES_UTILS_DEBUG = bconfig.AUTHORNAMES_UTILS_DEBUG
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    if AUTHORNAMES_UTILS_DEBUG:
        print "\nComparing: ", origin_name, ' ', target_name
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    if AUTHORNAMES_UTILS_DEBUG:
        print "|- split no: ", no
        print "|- split nt: ", nt

    score = 0.0

    surname_dist = distance(no[0], nt[0])
    if AUTHORNAMES_UTILS_DEBUG:
        print "|- surname distance: ", surname_dist

    if surname_dist > 0:
        artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = artifact_removal.sub("", no[0])
        fn2 = artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(
                0.0, 0.5 -
                (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    if AUTHORNAMES_UTILS_DEBUG:
        print '||- surname score: ', score

    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    if AUTHORNAMES_UTILS_DEBUG:
        print '|- initials only: ', initials_only
        print '|- only initials available: ', only_initials_available

    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(
            origin_name, target_name)
    if AUTHORNAMES_UTILS_DEBUG:
        print "|- equal composites: ", names_are_equal_composites

    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)

    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        initials_distance = initials_distance / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)\
            + 0.15 * initials_distance) * (score)
    if AUTHORNAMES_UTILS_DEBUG:
        print "|- initials sets: ", no[1], " ", nt[1]
        print "|- initials distance: ", initials_distance
        print "|- initials c: ", initials_c
        print "|- initials screwup: ", initials_screwup
        print "||- initials score: ", score

    composites_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # compare given-name tokens aligned from the right; each pair contributes
        # its edit distance normalized by the longer token's length
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
                              for i, k in enumerate(reversed(nalo))
                              if nlo - 1 - i < nlt]
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list]) \
                            / len(names_screwup_list)

    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup
    if AUTHORNAMES_UTILS_DEBUG:
        print "|- max names screwup: ", max_names_screwup
        print "|- avg screwup: ", avg_names_screwup
        print "||- names score: ", score
        print "|- names composites: ", composits_eq
        print "|- same gender: ", gender_eq
        print "|- synonims: ", vars_eq
        print "|- substrings: ", substr_eq

    if vars_eq:
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)]
                  for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2]]
        if AUTHORNAMES_UTILS_DEBUG:
            print "|-- synmap: ", synmap
        for i in synmap:
            # a synonym pair at the same position in both name lists earns a
            # larger boost than one at mismatched positions
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        if AUTHORNAMES_UTILS_DEBUG:
            print "|-- synmap: empty"
    if AUTHORNAMES_UTILS_DEBUG:
        print "|-- synmap score: ", score

    if substr_eq and not initials_only:
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2]
                 for j in nt[2]]
        ssmap = [i for i in ssmap if i[2]]
        if AUTHORNAMES_UTILS_DEBUG:
            print "|-- substr map: ", ssmap
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        if AUTHORNAMES_UTILS_DEBUG:
            print "|-- substr map: empty"

    if AUTHORNAMES_UTILS_DEBUG:
        print "|-- substring score: ", score

    if composites_eq and not initials_only:
        if AUTHORNAMES_UTILS_DEBUG:
            print "|-- composite names"
        score = score + (1 - score) * 0.2
    else:
        if AUTHORNAMES_UTILS_DEBUG:
            print "|-- not composite names"
    if AUTHORNAMES_UTILS_DEBUG:
        print "|-- composite score: ", score

    if not gender_eq:
        score = score / 3.
        if AUTHORNAMES_UTILS_DEBUG:
            print "|-- apply gender penalty"
    else:
        if AUTHORNAMES_UTILS_DEBUG:
            print "|--   no  gender penalty"

    if AUTHORNAMES_UTILS_DEBUG:
        print "|-- gender score: ", score

    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        if AUTHORNAMES_UTILS_DEBUG:
            print "|- surname trim: ", score
    else:
        if AUTHORNAMES_UTILS_DEBUG:
            print "|- no surname trim: ", score

    if initials_only and not only_initials_available:
        score = score * .9
        if AUTHORNAMES_UTILS_DEBUG:
            print "|- initials only penalty: ", score, initials_only, only_initials_available
    else:
        if AUTHORNAMES_UTILS_DEBUG:
            print "|- no initials only penalty", initials_only, only_initials_available

    if AUTHORNAMES_UTILS_DEBUG:
        print "||- final score:  ", score

    return score
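
The surname step of compare_names above maps a zero edit distance (or a
punctuation-only difference) to a full score of 1.0 and decays linearly with
distance otherwise. Below is a standalone sketch of just that step; the
pure-Python levenshtein helper is an assumption standing in for the compiled
distance() call used above.

import re

MAX_ALLOWED_SURNAME_DISTANCE = 2

def levenshtein(a, b):
    # plain dynamic-programming edit distance; stands in for editdist.distance
    prev = range(len(b) + 1)
    for i, ca in enumerate(a):
        cur = [i + 1]
        for j, cb in enumerate(b):
            cur.append(min(prev[j + 1] + 1, cur[j] + 1, prev[j] + (ca != cb)))
        prev = cur
    return prev[-1]

def surname_score(s1, s2):
    dist = levenshtein(s1, s2)
    if dist == 0:
        return 1.0
    # ignore punctuation/spacing artifacts before giving up on an exact match
    artifact_removal = re.compile("[^a-zA-Z0-9]")
    if artifact_removal.sub("", s1) == artifact_removal.sub("", s2):
        return 1.0
    return max(0.0, 0.5 - float(dist) / MAX_ALLOWED_SURNAME_DISTANCE)

print surname_score("Mueller", "Mueller")  # 1.0
print surname_score("O'Brien", "OBrien")   # 1.0 after artifact removal
print surname_score("Smith", "Smyth")      # 0.0 (0.5 - 1/2)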
Example No. 53
def assign_read_to_indiv(line,indiv_data,mismatch_allowed=1, \
  indiv_reads_out_pattern=None,fhdict=None,passfh=None,read2_has_idx=None, \
  trim_Q2=False,min_readlen=None,lnum=4,output_lnum=4,baseQ_in=None,baseQ_out=None):
    '''given a fastq line (actually a list of [read_name,seq,qual_str]), and an indiv_data object (see get_individual_data_for_lane)

    assigns the read to an individual based on the index tag, strips the index sequence and quality positions,
    converts quality to list of integers, and returns the sampleid, sequence and quality
    
    if a pattern is specified for output (like "/path/to/per-indiv-data/%s_s_1_1_sequence.txt")
    will also generate per-individual fastqs.
    
    using a single fhdict and passfh is highly recommended (i.e. creating beforehand and passing as arguments),
    but will be generated if absent.

    FUTURE PLANS:
    if min_readlen is set, will "pass" reads shorter than min_readlen
    if trim_Q2 is True, will remove all terminal quality-2 bases.
    If this reduces a read to fewer than min_readlen good bases, sends it to pass
    
    
    returns indiv,read,qual
    
    Paired-Ends (PE) HANDLING:
    if line and indiv_reads_out_pattern are 2-tuples, treats reads as paired-end.
    This requires that read2_has_idx be either True or False
    if False, both reads handled per the index bases of line[0]
    if True, both reads are assessed for index bases; if they DO NOT DISAGREE, both reads are handled per the consensus

    fhdict keys for PE (line is 2-tuple) are 2-tuples (<indiv>,<readnum>) i.e. (BW001,1)

    if passfh supplied, must also be 2-tuple

    returns indiv, (read1, read2), (q1, q2)
    '''

    idxlen = len(indiv_data.keys()[0])  # index tags are assumed to share one length

    if isinstance(line, tuple) and len(line) == 2:
        if (isinstance(indiv_reads_out_pattern, tuple)
                and len(indiv_reads_out_pattern)
                == 2) or indiv_reads_out_pattern is None:
            if read2_has_idx is not None:
                if indiv_reads_out_pattern is not None:
                    if fhdict is None:
                        fhdict = {}
                    if passfh is None:
                        passfh = [
                            smartopen(p % 'pass', 'w')
                            for p in indiv_reads_out_pattern
                        ]

                indiv = None
                heads = [l[0] for l in line]
                ss = [l[1] for l in line]
                qstrs = [l[2] for l in line]

                if baseQ_in is None:
                    bqs = list(
                        set([
                            get_baseQ(qs) for qs in qstrs
                            if get_baseQ(qs) is not None
                        ]))
                    if len(bqs) == 1:
                        baseQ_in = bqs[0]
                    else:
                        raise ValueError, 'bqs: %s' % bqs
                if baseQ_out is None:
                    baseQ_out = baseQ_in

                if len(set([h.split()[0][:-1] for h in heads])) != 1:
                    raise ValueError, 'read headers not identical prior to last character; %s' % heads

                if read2_has_idx:  #check that indices are concordant
                    ts = [s[:idxlen] for s in ss]
                    tqs = [qstr[:idxlen] for qstr in qstrs]
                    tagdists = [
                        sorted([(distance(t_this, t), t_this)
                                for t_this in indiv_data.keys()]) for t in ts
                    ]
                    try:
                        indiv_cand = [indiv_data[tagdist[0][1]]['sampleid'] for tagdist in tagdists \
                                      if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed]
                    except KeyError:  # fall back to the alternate sample id field
                        indiv_cand = [indiv_data[tagdist[0][1]]['sampleid2'] for tagdist in tagdists \
                                      if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed]
                    if len(set(indiv_cand)) == 1:
                        indiv = indiv_cand[0]
                        read = [s[idxlen:] for s in ss]
                        qual = [[ord(c) - baseQ_in for c in qstr[idxlen:]]
                                for qstr in qstrs]

                else:  #dump both reads per the first
                    t = ss[0][:idxlen]  #tag from read1
                    ts = [t] * 2  # hack for getting tag into both reads, below
                    tqs = [qstrs[0][:idxlen]] * 2
                    tagdist = sorted([(distance(t_this, t), t_this)
                                      for t_this in indiv_data.keys()])
                    # accept only an unambiguous hit: best tag within the
                    # mismatch budget, runner-up outside it
                    if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed:
                        indiv = indiv_data[tagdist[0][1]]['sampleid']
                        read = [ss[0][idxlen:], ss[1]]
                        qual = [[ord(c) - baseQ_in for c in qstrs[0][idxlen:]],
                                [ord(c) - baseQ_in for c in qstrs[1]]]

                if indiv is None:
                    read = ss
                    qual = [[ord(c) - baseQ_in for c in qstr]
                            for qstr in qstrs]
                    if passfh is not None:
                        for id, s, q, fh in zip(heads, read, qual, passfh):
                            fh.write(
                                as_fq_line(
                                    id,
                                    s,
                                    q,
                                    baseQ_out,
                                    output_lnum,
                                ))
                else:
                    if indiv_reads_out_pattern is not None:
                        for h, t, tq, s, q, rn, pat in zip(
                                heads, ts, tqs, read, qual, [1, 2],
                                indiv_reads_out_pattern):
                            newhead = '%s %s:%s' % (h, t, tq)
                            try:
                                fhdict[(indiv, rn)].write(
                                    as_fq_line(newhead, s, q, baseQ_out,
                                               output_lnum))
                            except KeyError:
                                fhdict[(indiv,
                                        rn)] = smartopen(pat % indiv, 'w')
                                fhdict[(indiv, rn)].write(
                                    as_fq_line(newhead, s, q, baseQ_out,
                                               output_lnum))

                qual = [numpy.array(q, dtype=int) for q in qual]

            else:
                raise ValueError, 'read2_has_idx cannot be None for PE reads'
        else:
        raise ValueError, 'PE handling invoked, but indiv_reads_out_pattern does not match; must be 2-tuple or None, is: %s' % indiv_reads_out_pattern
    else:

        if indiv_reads_out_pattern is not None:
            if fhdict is None:
                fhdict = {}
            if passfh is None:
                passfh = smartopen(indiv_reads_out_pattern % 'pass', 'w')

        head, s, qstr = line

        if baseQ_in is None:
            if get_baseQ(qstr) is None:
                raise ValueError, 'could not determine qual base (33 or 64): %s' % qstr
            else:
                baseQ_in = get_baseQ(qstr)
        if baseQ_out is None:
            baseQ_out = baseQ_in

        t = s[:idxlen]

        tagdist = sorted([(distance(t_this, t), t_this)
                          for t_this in indiv_data.keys()])
        # accept only an unambiguous hit: best tag within the mismatch budget,
        # runner-up outside it
        if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed:
            indiv = indiv_data[tagdist[0][1]]['sampleid']
            read = s[idxlen:]
            qual = [ord(c) - baseQ_in for c in qstr[idxlen:]]
            if indiv_reads_out_pattern is not None:
                newhead = '%s:%s:%s' % (head, t, qstr[:idxlen])
                try:
                    fhdict[indiv].write(
                        as_fq_line(newhead, read, qual, baseQ_out,
                                   output_lnum))
                except KeyError:
                    fhdict[indiv] = smartopen(indiv_reads_out_pattern % indiv,
                                              'w')
                    fhdict[indiv].write(
                        as_fq_line(newhead, read, qual, baseQ_out,
                                   output_lnum))
        else:
            indiv = None
            read = s
            qual = [ord(c) - baseQ_in for c in qstr]
            if passfh is not None:
                passfh.write(as_fq_line(head, s, qual, baseQ_out, output_lnum))

        qual = numpy.array(qual, dtype=int)
    return indiv, read, qual
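
The demultiplexing rule used throughout assign_read_to_indiv is compact enough to
isolate. Here is a sketch under assumptions (fixed-length index tags, a Hamming-style
tag_dist standing in for the compiled distance() call; the sample names are
illustrative):

def tag_dist(a, b):
    # Hamming distance for equal-length index tags; stands in for distance()
    return sum(x != y for x, y in zip(a, b))

def assign_tag(seq, indiv_data, mismatch_allowed=1):
    '''return the sampleid owning seq's index tag, or None if ambiguous/too far'''
    idxlen = len(indiv_data.keys()[0])
    t = seq[:idxlen]
    tagdist = sorted([(tag_dist(t_this, t), t_this) for t_this in indiv_data.keys()])
    # accept only if the best tag is within budget and the runner-up is not
    if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed:
        return indiv_data[tagdist[0][1]]['sampleid']
    return None

indiv_data = {'AACC': {'sampleid': 'BW001'}, 'GGTT': {'sampleid': 'BW002'}}
print assign_tag('AACCTTTTGGGG', indiv_data)  # BW001 (exact tag match)
print assign_tag('AACTTTTTGGGG', indiv_data)  # BW001 (one mismatch, unambiguous)
print assign_tag('AGCTTTTTGGGG', indiv_data)  # None (two mismatches to both tags)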
Example No. 54
    def test_01__reversed_test_vectors(self):
        for b, a, score in test_vectors:
            self.assertEqual(editdist.distance(a, b), score)
Example No. 55
def string_match_score(p1, p2, field):
    s1 = p1[field]
    s2 = p2[field]
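    # note: normalizing by len(s1) makes the score asymmetric in its arguments,
    # and an empty s1 raises ZeroDivisionError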
    return editdist.distance(s1.lower(), s2.lower()) / float(len(s1))
Example No. 56
    def matchPub(self, golist, mylist, aid):
        data = []
        fdr = open('./output/data.txt', 'r')
        for i in fdr.readlines():
            data.append(i)
        fdr.close()
        cursor_my = self.conn_my.cursor()
        print 'matching'
        '''
        automatically updates the database;
        returns a list of paper dicts containing the updating information
        '''
        table_mon = self.db_mon[self.table_mon]

        godics = golist
        mytitles = mylist[0]
        aid = mylist[1]
        print_not_matched = False
        pubs_matched = []
        pubs_not_matched = []
        #fw1 = open('C:\\Python27\\tutorial\\tutorial\\test\\%dmatched.txt'%aid,'w')
        #fw2 = open('C:\\Python27\\tutorial\\tutorial\\test\\%dfailed.txt'%aid,'w')
        fw3 = open('./output/%dmulmatched.txt' % aid, 'w')
        t = 0

        for mydic in mytitles:
            ncitation = mydic['ncitation']
            mytitle = mydic['title']
            mytitleCleaned = self.cleanGoogleTitle(mytitle)
            short_key = mytitleCleaned[1]
            matchedlist = []
            pid = mydic['pid'][0]
            for godic in godics[:]:  # iterate over a copy; matched items are removed below
                start = time.time()
                gotitle = godic['title']
                _gotitle = ''
                for cha in gotitle:
                    if cha <= chr(127):  # keep ASCII characters only
                        _gotitle = _gotitle + cha
                gotitleCleaned = self.cleanGoogleTitle(gotitle)
                key_title = gotitleCleaned[1]
                has_dot = gotitleCleaned[2]
                exactmatched = False
                if has_dot:

                    if key_title.find(short_key) != -1:
                        exactmatched = True
                        matchedlist.append(godic)
                        godics.remove(godic)

                else:

                    if key_title == short_key:
                        exactmatched = True
                        matchedlist.append(godic)
                        godics.remove(godic)

                if not exactmatched:  # no exact key match; fall back to edit distance
                    ed = editdist.distance(short_key, key_title)

                    if ed < 10:  # tunable absolute cutoff
                        # allow edits up to 10% of the candidate key length
                        looseValue = 0.1 * float(len(key_title))

                        if looseValue > ed:
                            matchedlist.append(godic)
                            godics.remove(godic)
                end = time.time()
                if (end - start) != 0:  # count comparisons with measurable elapsed time
                    t += 1

            if len(matchedlist) == 1:

                try:
                    pubs_matched.append({
                        'title': matchedlist[0]['title'],
                        'pid_in_mysql': pid,
                        'citation': matchedlist[0]['citation'],
                        'essay_others': matchedlist[0]['essay_others']
                    })
                    #fw1.write('title1:%s title2:%s citation:%s ncitation:%s pid%d\n'%(mytitle,matchedlist[0]['title'],matchedlist[0]['citation'],ncitation,pid[0]))
                except KeyError:
                    pass
                #fw1.write('title1:%s citation:%s ncitation:%s pid%d\n'%(mytitle,matchedlist[0]['citation'],ncitation,pid[0]))
            elif len(matchedlist) >= 2:
                # treat multiple identical candidates as a single match
                same = all(m == matchedlist[0] for m in matchedlist)
                if same:
                    try:
                        pubs_matched.append({
                            'title': matchedlist[0]['title'],
                            'pid_in_mysql': pid,
                            'citation': matchedlist[0]['citation'],
                            'essay_others': matchedlist[0]['essay_others']
                        })
                    except KeyError:
                        pass

                else:
                    for paper in matchedlist:
                        godics.append(paper)
                        fw3.write(
                            'title1:%s citation:%s ncitation:%s pid%d\n' %
                            (mytitle, matchedlist[0]['citation'], ncitation,
                             pid))

            else:
                #fw2.write('title:%s citation:-1\n'%mytitle)
                #pubs_not_matched.append({'title':matchedlist[0]['title'],'citation':matchedlist[0]['citation'],'essay_others':godic['essay_others']})
                continue
        #fw1.close()
        #fw2.close()
        fw3.close()
        fdw = open('./output/data.txt', 'w')
        log = '%d %d %d %f %d \n' % (aid, len(pubs_matched), len(mytitles),
                                     float(len(pubs_matched)) /
                                     float(len(mytitles)), t)
        print log
        data.append(log)
        for i in data:
            fdw.write(i)
        fdw.close()
        pubs_matched.extend(godics)
        return pubs_matched
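
A compressed sketch of the matching policy in matchPub (exact comparison of the
cleaned title keys first, then a Levenshtein fallback whose budget is 10% of the
candidate key length, capped at an absolute cutoff of 10). clean_key here is a
simplified stand-in for the class's cleanGoogleTitle helper:

import editdist

def clean_key(title):
    # stand-in for cleanGoogleTitle: lowercase, alphanumerics only
    return ''.join(c for c in title.lower() if c.isalnum())

def titles_match(mine, candidate):
    k1, k2 = clean_key(mine), clean_key(candidate)
    if k1 == k2:
        return True  # exact match on the cleaned keys
    ed = editdist.distance(k1, k2)
    # fallback: accept edits up to 10% of the candidate key length
    return ed < 10 and ed < 0.1 * len(k2)

print titles_match('A Survey of Name Matching', 'A survey of name-matching')  # True
print titles_match('A Survey of Name Matching', 'Deep Learning')              # False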