async def choose(self, ctx, *, choices=""):
    """Choose between one of various supplied things.

    Syntax:

    * choose x, y, z - Choose between x, y, and z.
    """
    # Split on commas, trim surrounding whitespace and drop blank entries, so
    # input such as "a,,b" or "a," can never produce an empty choice (which
    # would then be passed to ctx.send as an empty message).
    choices = [entry.strip() for entry in choices.split(",")]
    choices = [entry for entry in choices if entry]

    if len(choices) <= 1:
        raise commands.UserInputError(("Not enough choices! "
                                       "Separate choices with commas, e.g. "
                                       "`choose A cat, A bear, A python`"))

    # Are they all the same?
    if len(set(choices)) == 1:
        raise commands.UserInputError("They're all the same, I can't choose!")

    choice = None

    # Loaded choice. The program biases in favor of pythons: the closest
    # match to "python" (edit distance 0, then 1, then 2) always wins.
    for distance in range(0, 3):
        for choice_loaded in choices:
            if utils.levenshtein("python", choice_loaded.lower()) == distance:
                python = (f"{choice_loaded}, obviously",
                          f"{choice_loaded}, duh",
                          choice_loaded)
                choice = systemrandom.choice(python)
                break
        if choice:
            break

    # Couldn't find a python, so now the program actually chooses randomly.
    if not choice:
        choice = systemrandom.choice(choices)

    logger.info(f"Chose {choice}")
    await ctx.send(choice)
def dl_fna(species_name):
    """Download the .fna file for a species if necessary; return its path.

    The accession comes from ``dl_gbk(species_name)``. If
    ``data/<species_name>/<accession>.fna`` already exists it is returned
    directly. Otherwise the directories under ``/genomes/Bacteria/`` on
    ``host`` are tried in order of edit distance to ``species_name`` until
    one contains an entry starting with the accession.

    Returns the local path of the file, or ``None`` if it was not found.
    """
    accession = dl_gbk(species_name)
    # Single-argument print calls produce the same output on Python 2 and 3.
    print("accession: %s" % accession)
    fna_name = accession + ".fna"
    print("fna_name: %s" % fna_name)
    target_path = os.path.join("data", species_name, fna_name)
    if os.path.isfile(target_path):
        print("found fna: %s" % target_path)
        return target_path

    print("didn't find fna for: %s downloading" % species_name)
    host.chdir('/genomes/Bacteria/')
    dir_list = host.listdir(host.curdir)
    # Try the directories whose names are closest to the species name first.
    sorted_dir_list = sorted(
        dir_list, key=lambda fname: levenshtein(species_name, fname))
    for dir_name in sorted_dir_list:
        print("trying: %s" % dir_name)
        try:
            host.chdir('/genomes/Bacteria/' + dir_name + '/')
            sub_dir_list = host.listdir(host.curdir)
            if find(lambda name: name.startswith(accession), sub_dir_list):
                host.download(fna_name, target_path)
                return target_path
        except Exception:
            # Best effort: a directory that cannot be read or downloaded
            # from is skipped. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt / SystemExit stop the scan.
            continue
    print("Couldn't find fna for: %s" % species_name)
    return None
def correct(self, string):
    """Look up candidate corrections for ``string`` in ``self.dictionary``.

    Candidates are ``string`` itself plus the strings produced by
    ``utils.generate_deletes(string, self.max_distance)``, processed from
    longest to shortest.

    Returns a dict mapping each kept correction to a
    ``(self.dictionary[word][1], distance)`` tuple, keeping only entries whose
    distance does not exceed the smallest distance found.

    NOTE(review): the early exit below returns an empty *list* while every
    other path returns a dict - confirm callers cope with both.
    NOTE(review): ``self.dictionary[key]`` is used as a pair: ``[0]`` is
    iterated as a collection of longer words, ``[1]`` is compared against 0;
    presumably (suggestions, frequency) - confirm.
    """
    # The input is too long for any stored word to be within reach.
    if (len(string) - self.max_length) > self.max_distance:
        return []

    corrections_dict = {}
    min_correct_len = float("inf")

    # Unique candidates, longest (fewest deletions) first.
    queue = sorted(
        list(
            set([string] + utils.generate_deletes(string, self.max_distance))),
        key=len,
        reverse=True,
    )

    while len(queue) > 0:
        q_item = queue.pop(0)

        # Stop once the remaining candidates need more deletions than the
        # best distance found so far.
        if (len(corrections_dict) > 0) and (
                (len(string) - len(q_item)) > min_correct_len):
            break

        if (q_item in self.dictionary) and (q_item not in corrections_dict):
            if self.dictionary[q_item][1] > 0:
                # The candidate itself is a stored word; its distance is the
                # number of characters that were deleted to reach it.
                corrections_dict[q_item] = (
                    self.dictionary[q_item][1],
                    len(string) - len(q_item),
                )
                if len(string) == len(q_item):
                    # Zero deletions: the input itself is a stored word.
                    break
                elif (len(string) - len(q_item)) < min_correct_len:
                    min_correct_len = len(string) - len(q_item)

            for sc_item in self.dictionary[q_item][0]:
                if sc_item not in corrections_dict:
                    if len(q_item) == len(string):
                        # NOTE(review): dead store - overwritten on the next
                        # line before it is read.
                        item_dist = len(sc_item) - len(q_item)
                    item_dist = utils.levenshtein(sc_item, string)
                    if item_dist > min_correct_len:
                        # Worse than the best match so far: ignore.
                        pass
                    elif item_dist <= self.max_distance:
                        corrections_dict[sc_item] = (
                            self.dictionary[sc_item][1],
                            item_dist,
                        )
                        if item_dist < min_correct_len:
                            min_correct_len = item_dist
                    # Drop everything that is no longer among the closest.
                    corrections_dict = {
                        k: v
                        for k, v in corrections_dict.items()
                        if v[1] <= min_correct_len
                    }
    return corrections_dict
def check_vendor(vendor_name, vendor_email) -> Optional[str]:
    """Return the canonical name of the known vendor matching ``vendor_name``.

    A vendor matches when any of its aliases is within an edit distance of
    less than 2 (case-insensitive) of ``vendor_name``. As a side effect every
    record visited gets its own ``vendor_name`` added to its alias list.
    ``vendor_email`` is accepted but not used.

    Returns ``None`` when no vendor matches.
    """
    for record in existing_vendor_list:
        canonical = record["vendor_name"]
        aliases = record["alias"]
        # The canonical name always counts as one of the aliases.
        if canonical not in aliases:
            aliases.append(canonical)
        if any(
                levenshtein(vendor_name, alias, ignore_case=True) < 2
                for alias in aliases):
            return canonical
    return None
def get_vendor_record(vendor_name, vendor_email) -> Optional[dict]:
    """Return the record of the known vendor matching ``vendor_name``.

    A vendor matches when any of its aliases is within an edit distance of
    less than 2 (case-insensitive) of ``vendor_name``. As a side effect every
    record visited gets its own ``vendor_name`` added to its alias list.
    ``vendor_email`` is accepted but not used.

    Returns ``None`` when no vendor matches.
    """
    for record in existing_vendor_list:
        canonical = record["vendor_name"]
        aliases = record["alias"]
        # The canonical name always counts as one of the aliases.
        if canonical not in aliases:
            aliases.append(canonical)
        if any(
                levenshtein(vendor_name, alias, ignore_case=True) < 2
                for alias in aliases):
            return record
    return None
def get_metropolitan_index(cname):
    """Return ``(index, name)`` for the known city closest to ``cname``.

    ``cname`` is lower-cased and stripped of compass suffixes (north, south,
    east, west in Indonesian) before being compared by edit distance against
    the names in ``cities_and_towns``. A city is accepted only when its
    distance is below 2; the returned index is never lower than 2.

    Returns ``(2, '')`` when nothing is close enough or no cities are known.
    """
    cname = cname.lower()
    # "timur" is the Indonesian word for east; the historical spelling
    # "timor" is kept so previously stripped names still behave the same.
    for suffix in ("utara", "selatan", "timur", "timor", "barat"):
        cname = cname.replace(' ' + suffix, '')

    if not cities_and_towns:
        return 2, ''

    # Only the best candidate is needed, so take the minimum instead of
    # sorting everything; ties are broken by name, then index, exactly as a
    # sort of the same tuples would.
    best_distance, best_name, best_index = min(
        (levenshtein(name, cname), name, index)
        for name, population, index in cities_and_towns)

    if best_distance < 2:
        return max(best_index, 2), best_name
    return 2, ''
def eval(self, data, max_iter=np.inf):
    """Run the model over ``data`` and score or dump its predictions.

    When ``self.mode == "train"`` the first five ground-truth / prediction
    pairs are printed and ``(loss_avg, cer, wer)`` is returned, where ``cer``
    is the summed edit distance over the summed target length and ``wer`` is
    the share of samples whose prediction differs from the target, both as
    percentages. Otherwise each prediction and its target are written to
    ``self.opt.out`` (prediction first, one per line) and ``None`` is
    returned.

    :param data: dataset handed to ``torch.utils.data.DataLoader``.
    :param max_iter: maximum number of batches to evaluate; clamped to the
        number of batches the loader provides.

    NOTE(review): the progress print checks ``self.opt.mode`` while the final
    branch checks ``self.mode`` - confirm both attributes are intended.
    """
    data_loader = torch.utils.data.DataLoader(
        data,
        batch_size=self.opt.batchSize,
        num_workers=int(self.opt.workers),
        pin_memory=True,
        collate_fn=dataset.collatedict())
    self.model.eval()
    gts = []
    decoded_preds = []
    val_iter = iter(data_loader)
    tc = 0  # total target characters
    wc = 0  # summed edit distance
    ww = 0  # samples predicted wrongly
    tw = 0  # total samples
    loss_avg = utils.averager()
    max_iter = min(max_iter, len(data_loader))
    with torch.no_grad():
        for i in range(max_iter):
            if self.opt.mode == 'test':
                print('%d / %d' % (i, len(data_loader)), end='\r')
            # Use the built-in next(): iterators only guarantee __next__,
            # not the Python 2 style .next() method.
            output_dict = self.forward_sample(next(val_iter))
            batch_size = output_dict['batch_size']
            preds = F.log_softmax(output_dict['probs'], 2)
            preds_size = Variable(
                torch.IntTensor([preds.size(0)] * batch_size))
            cost = self.get_loss({
                'preds': preds,
                'batch_size': batch_size,
                'preds_size': preds_size,
                'params': output_dict['params'],
            })
            loss_avg.add(cost)
            decoded_pred = self.decoder(preds, preds_size)
            gts += list(output_dict['gt'])
            decoded_preds += list(decoded_pred)

    if self.mode == "train":
        pcounter = 0
        for target, pred in zip(gts, decoded_preds):
            # Show a handful of examples for a quick visual check.
            if pcounter < 5:
                print('Gt: ', target)
                print('Pred: ', pred)
                pcounter += 1
            if target != pred:
                ww += 1
            tw += 1
            wc += utils.levenshtein(target, pred)
            tc += len(target)
        wer = (ww / tw) * 100
        cer = (wc / tc) * 100
        return loss_avg, cer, wer
    else:
        # The context manager closes the file even if a write fails.
        with open(self.opt.out, 'w') as f:
            for target, pred in zip(gts, decoded_preds):
                f.write('{}\n{}\n'.format(pred, target))
        print('Generated predictions for {} samples'.format(
            self.test_data.nSamples))
        return
def get_wer(refs, hyps):
    """Return the corpus-level word error rate of ``hyps`` against ``refs``.

    Both arguments are equally long sequences of whitespace-tokenisable
    strings. The result is the summed token-level edit distance divided by
    the total number of reference tokens.
    """
    assert len(refs) == len(hyps)
    edit_total = 0.0
    token_total = 0
    for reference, hypothesis in zip(refs, hyps):
        reference_tokens = reference.split()
        edit_total += levenshtein(reference_tokens, hypothesis.split())
        token_total += len(reference_tokens)
    return edit_total / token_total
def soundmatch(sounda, soundb, maxpoints):
    """Given two sounds (suffixes or prefixes), calculate a score for
    their similarity.

    The score is ``maxpoints`` scaled by one minus the edit distance divided
    by the length of the longer sound: identical sounds score ``maxpoints``.
    Two empty sounds score 0.
    """
    longest = max(len(sounda), len(soundb))
    # Both sounds are empty: nothing to compare, and the ratio below would
    # divide by zero.
    if longest == 0:
        return 0
    # Force true division so the ratio is not truncated to 0 or 1 when the
    # module runs with Python 2 integer-division semantics.
    frac = utils.levenshtein(sounda, soundb) / float(longest)
    return maxpoints * (1 - frac)
def expand(sentence):
    """Expand the abbreviation found in ``sentence``.

    ``clean_abbr`` splits the sentence into the cleaned sentence, the
    abbreviation and a position that is passed on to ``insert_va``. The
    candidate expansions come from ``dic[abbr]``:

    * no candidates: returns ``("null", -1)``;
    * one candidate: returns it (after ``insert_va``) with score ``0``;
    * several: ``evaluate`` predicts an expansion and the candidate closest
      to the prediction by edit distance is chosen, or ``"null"`` when even
      the best one is more than 2 edits away. The score is ``score.item()``.
    """
    sen, abbr, and_pos = clean_abbr(sentence)
    candidates = dic[abbr]

    if len(candidates) == 0:
        return "null", -1

    if len(candidates) == 1:
        return insert_va(candidates[0], and_pos), 0

    if len(candidates) >= 2:
        pred, score, _elapsed = evaluate(sen)
        chosen = ''
        # A candidate only wins if it beats the length of the prediction.
        best_distance = len(pred)
        for candidate in candidates:
            distance = levenshtein(candidate, pred)
            if distance < best_distance:
                chosen = candidate
                best_distance = distance
        if best_distance > 2:
            chosen = "null"
        return insert_va(chosen, and_pos), score.item()
def get_emoji(ctx, expression: str):
    """Doesn't really work that well."""
    # Resolution order: a numeric expression is treated as an emoji ID;
    # anything else is matched by name, preferring the smallest edit distance
    # (at most 3) and, among equals, the emoji listed first. Returns None when
    # nothing is close enough.
    bot = ctx.bot
    try:
        return bot.get_emoji(int(expression))
    except Exception:
        pass

    wanted = expression.strip(":").lower()
    closest = None
    closest_distance = 4
    for emoji in bot.emojis:
        distance = utils.levenshtein(wanted, emoji.name.lower())
        if distance == 0:
            # An exact match can't be beaten; stop immediately.
            return emoji
        if distance < closest_distance:
            closest = emoji
            closest_distance = distance
    return closest
def spell_correction(misspelled_word: str):
    """Return the closest candidate spellings of ``misspelled_word``.

    Candidates are the keys of ``mapping(soundex(misspelled_word))``. Each is
    scored by its edit distance to the misspelled word, and all candidates
    sharing the smallest distance are returned as one comma-separated string.
    An empty string is returned when there are no candidates.
    """
    my_dict = mapping(soundex(misspelled_word))
    # No candidates at all: there is no minimum to look up.
    if not my_dict:
        return ''

    # The candidate dict is updated in place, as before, so its values hold
    # the distances afterwards.
    for k in my_dict:
        my_dict[k] = levenshtein(misspelled_word, k)

    best = min(my_dict.values())
    return ','.join(f'{k}' for k, dist in my_dict.items() if dist == best)
def printSearch(search_category, search_package, maxDist=2):
    """Print the package called ``search_package`` or, failing that, similar ones.

    Every installable package (optionally restricted to ``search_category``;
    an empty string means all categories) is compared with the search term:

    * an exact, case-insensitive name match is reported on its own;
    * otherwise packages whose name contains the term, whose name is within
      ``maxDist`` edits of it, or whose short description contains it are
      listed, closest first.

    For each reported package the homepage, description, latest version and
    installed versions/revisions are printed.
    """
    installable = portage.PortageInstance.getInstallables()
    similar = []
    match = None
    try:
        package_re = re.compile(".*%s.*" % search_package, re.IGNORECASE)
    except re.error:
        # The term is not a valid regular expression (e.g. "c++"): match it
        # literally instead of aborting the search.
        package_re = re.compile(".*%s.*" % re.escape(search_package),
                                re.IGNORECASE)

    for _p in installable:
        if search_category == "" or search_category == _p.category:
            package = portage.PortageInstance.getPackageInstance(
                _p.category, _p.package)
            if not package:
                continue
            levDist = utils.levenshtein(search_package.lower(),
                                        package.package.lower())
            if levDist == 0:
                match = (levDist, package)
                break
            elif package_re.match(package.package):
                # Substring hits rank ahead of pure edit-distance hits.
                similar.append((levDist - maxDist, package))
            elif len(package.package) > maxDist and levDist <= maxDist:
                similar.append((levDist, package))
            else:
                if package_re.match(package.subinfo.shortDescription):
                    # Description-only hits go last.
                    similar.append((100, package))

    if match is None:
        if similar:
            print("Emerge was unable to find %s, similar packages are:" %
                  search_package)
            similar.sort(key=lambda x: x[0])
        else:
            print("Emerge was unable to find %s" % search_package)
    else:
        print("Package %s found:" % search_package)
        similar = [match]

    for levDist, package in similar:
        EmergeDebug.debug((package, levDist), 1)
        print(package)
        print("\t Homepage: %s" % package.subinfo.homepage)
        print("\t Description: %s" % package.subinfo.shortDescription)
        print("\t Latest version: %s" % package.subinfo.defaultTarget)
        installed = False
        for pack in InstallDB.installdb.getInstalledPackages(
                package.category, package.package):
            if pack.getVersion():
                installed = True
                print("\t Installed versions: %s" % pack.getVersion())
            if pack.getRevision():
                print("\t Installed revision: %s" % pack.getRevision())
        if not installed:
            print("\t Installed versions: None")
def keywords(text, n=15):
    """
    extract most relevant keywords from given text

    steps:
        1. tokenize text by words
        2. apply syntactic filter (keep tokens tagged 'NN' or 'JJ')
        3. compute pairwise levenshtein distance
        4. create graph weighted by the levenshtein distances
        5. compute pagerank

    - text: string consisting of a few sentences
    - n: number of keywords to extract

    Returns a ``(keywords, scores)`` pair of tuples ordered by descending
    score; both are empty when the text yields no ranked words.
    """
    import nltk

    synctatic_filter = ['NN', 'JJ']

    # tokenizing by words
    words = word_tokenize(text)

    # pos-tagging
    tagged = nltk.pos_tag(words)

    # applying syntactic filter
    filtered = [word.lower() for word, tag in tagged
                if tag in synctatic_filter]

    # every pair of words becomes an edge weighted by their edit distance
    weighted_edges = [
        (first, second, utils.levenshtein(first, second))
        for first, second in combinations(filtered, 2)
    ]

    # create graph
    G = nx.Graph()
    G.add_weighted_edges_from(weighted_edges)

    # calculate pagerank
    pr = nx.pagerank(G, alpha=0.85)

    # TextRank-style ranking of the words
    ranking = Counter(pr)

    # top n keywords
    top = ranking.most_common(n)
    if not top:
        # Fewer than two candidate words: no edges, hence nothing ranked.
        # Unpacking zip(*[]) would raise, so return empty results instead.
        return (), ()
    top_words, scores = zip(*top)

    return top_words, scores
def calcsimilarity(known, table, id1, id2, comparison):
    """Store similarity scores between ``comparison`` and every ``known`` row.

    Rows are indexed positionally: ``[0]`` is the value written to the
    ``id1``/``id2`` columns, ``[1]`` is a string of tags separated by ``/``,
    ``,`` or ``;``, and ``[2]`` is compared for equality (equal means
    similarity 1.0). Otherwise the similarity is derived from the normalised
    hamming/levenshtein distances between the tag lists.

    Pairs scoring above 0.33 that are not yet in ``table`` are inserted.
    Returns True when at least one row was inserted, else False.

    NOTE(review): ``table``, ``id1`` and ``id2`` are interpolated into the SQL
    text - they must never come from untrusted input.
    """
    tokens = re.split("[/,;]", comparison[1])
    ret = False
    for j in xrange(0, len(known)):
        # NOTE(review): ``i`` is not defined in this function; unless a
        # module-level ``i`` exists this raises NameError - confirm.
        if i == j:
            continue
        if comparison[2] == known[j][2]:
            similarity = 1.0
        else:
            compared_genre = re.split("[/,;]", known[j][1])
            # distance: tag from the other row -> normalised distance (0..1)
            distance = {}
            sametags = 0
            for a in tokens:
                if not a:
                    continue
                for b in compared_genre:
                    if not b or b in distance:
                        continue
                    if len(a) == len(b):
                        h = hamming(a, b) / float(len(a))
                        if h:
                            distance[b] = h
                        else:
                            # Zero distance: the two tags are identical.
                            sametags = sametags + 1
                    else:
                        distance[b] = levenshtein(a, b) / \
                            float(max(len(a), len(b)))
            if distance:
                # geometric mean + weighted equal tags
                # NOTE(review): both operands of the last division are ints;
                # under Python 2 (this block uses xrange) it truncates to 0
                # whenever ``distance`` is non-empty - confirm intent.
                similarity = 1.0 - (
                    reduce(lambda x, y: x * y, distance.values())) ** \
                    (1.0 / len(distance)) + \
                    (sametags / (sametags + len(distance)))
            else:
                similarity = 0.0
        if similarity > 0.33:
            # Only insert pairs that are not stored yet.
            if not db.execute(
                    "select * from %s where %s = ? and %s = ?" %
                    (table, id1, id2),
                    (comparison[0], known[j][0])).fetchall():
                db.execute(
                    "insert or ignore into %s "
                    "(%s, %s, similarity) values ( ?, ?, ?)" %
                    (table, id1, id2),
                    (comparison[0], known[j][0], similarity))
                ret = True
    return ret
def printSearch(search_category, search_package,maxDist = 2):
    """Print the package called ``search_package`` or, failing that, similar ones.

    Every installable package (optionally restricted to ``search_category``;
    an empty string means all categories) is compared with the search term:

    * an exact, case-insensitive name match is reported on its own;
    * otherwise packages whose name contains the term, whose name is within
      ``maxDist`` edits of it, or whose short description contains it are
      listed, closest first.

    For each reported package the homepage, description, latest version and
    installed versions/revisions are printed.
    """
    installable = portage.PortageInstance.getInstallables()
    similar = []
    match = None
    try:
        package_re = re.compile(".*%s.*" % search_package, re.IGNORECASE)
    except re.error:
        # The term is not a valid regular expression (e.g. "c++"): match it
        # literally instead of aborting the search.
        package_re = re.compile(".*%s.*" % re.escape(search_package),
                                re.IGNORECASE)

    for _p in installable:
        if search_category == "" or search_category == _p.category:
            package = portage.PortageInstance.getPackageInstance(
                _p.category, _p.package)
            if not package:
                continue
            levDist = utils.levenshtein(search_package.lower(),
                                        package.package.lower())
            if levDist == 0:
                match = (levDist, package)
                break
            elif package_re.match(package.package):
                # Substring hits rank ahead of pure edit-distance hits.
                similar.append((levDist - maxDist, package))
            elif len(package.package) > maxDist and levDist <= maxDist:
                similar.append((levDist, package))
            else:
                if package_re.match(package.subinfo.shortDescription):
                    # Description-only hits go last.
                    similar.append((100, package))

    if match is None:
        if similar:
            print("Emerge was unable to find %s, similar packages are:" %
                  search_package)
            similar.sort(key=lambda x: x[0])
        else:
            print("Emerge was unable to find %s" % search_package)
    else:
        print("Package %s found:" % search_package)
        similar = [match]

    for levDist, package in similar:
        EmergeDebug.debug((package, levDist), 1)
        print(package)
        print("\t Homepage: %s" % package.subinfo.homepage)
        print("\t Description: %s" % package.subinfo.shortDescription)
        print("\t Latest version: %s" % package.subinfo.defaultTarget)
        installed = False
        for pack in InstallDB.installdb.getInstalledPackages(
                package.category, package.package):
            if pack.getVersion():
                installed = True
                print("\t Installed versions: %s" % pack.getVersion())
            if pack.getRevision():
                print("\t Installed revision: %s" % pack.getRevision())
        if not installed:
            print("\t Installed versions: None")
def printSearch(search_category, search_package,maxDist = 2):
    """Print the package called ``search_package`` or, failing that, similar ones.

    Every installable ``(category, package, version)`` triple (optionally
    restricted to ``search_category``; an empty string means all categories)
    is compared with the lower-cased search term:

    * an exact name match is reported on its own;
    * otherwise packages whose name contains the term, whose name is within
      ``maxDist`` edits of it, or whose short description contains it are
      listed, closest first.

    For each reported package the homepage, description, latest version and
    installed version are printed.
    """
    installable = portage.PortageInstance.getInstallables()
    similar = []
    match = None
    try:
        package_re = re.compile(".*%s.*" % search_package.lower())
    except re.error:
        # The term is not a valid regular expression (e.g. "c++"): match it
        # literally instead of aborting the search.
        package_re = re.compile(
            ".*%s.*" % re.escape(search_package.lower()))

    for category, package, version in installable:
        if search_category == "" or search_category == category:
            meta = portage.PortageInstance.getMetaData(
                category, package, version)
            levDist = utils.levenshtein(search_package.lower(),
                                        package.lower())
            if levDist == 0:
                match = (levDist, category, package, version, meta)
                break
            elif package_re.match(package.lower()):
                # Substring hits rank ahead of pure edit-distance hits.
                similar.append(
                    (levDist - maxDist, category, package, version, meta))
            elif len(package) > maxDist and levDist <= maxDist:
                similar.append((levDist, category, package, version, meta))
            else:
                if "shortDescription" in meta:
                    if package_re.match(meta["shortDescription"].lower()):
                        # Description-only hits go last.
                        similar.append(
                            (100, category, package, version, meta))

    if match is None:
        if similar:
            print("Emerge was unable to find %s, similar packages are:" %
                  search_package)
            # Sort on the leading fields only: the trailing ``meta`` mapping
            # is not orderable on Python 3 and would raise on a tie.
            similar.sort(key=lambda entry: entry[:4])
        else:
            print("Emerge was unable to find %s" % search_package)
    else:
        print("Package %s found:" % search_package)
        similar = [match]

    for levDist, category, package, version, meta in similar:
        utils.debug((category, package, version, levDist), 1)
        description = ""
        if "shortDescription" in meta:
            description = meta["shortDescription"]
        homepage = ""
        if "homepage" in meta:
            homepage = meta["homepage"]
        print("%s/%s" % (category, package))
        print("\t Homepage: %s" % homepage)
        print("\t Description: %s" % description)
        print("\t Latest version: %s" % version)
        print("\t Installed version: %s" %
              InstallDB.installdb.findInstalled(category, package))
def test(self):
    """Evaluate one test batch and print the mean edit distance per example.

    A single batch of ``self.batch_size`` examples is fetched from
    ``self.data``, decoded by running ``self.decoded`` in ``self.sess``, and
    compared against the ground truth with ``levenshtein``. Returns ``None``.
    """
    with self.sess.as_default():
        example_count = 0
        total_error = 0

        batch_x, batch_y, batch_length = self.data.get_next_test_batch(
            self.batch_size)

        # Not fed to the graph; kept because the call was part of the
        # original flow and its side effects (if any) are unknown here.
        data_targets = sparse_tuple_from(batch_y)

        # The decoded output must be bound to the name used below; it was
        # previously stored under another name, causing a NameError.
        decoded = self.sess.run([self.decoded],
                                feed_dict={
                                    self.inputs: batch_x,
                                    self.seq_len: batch_length
                                })

        example_count += len(batch_y)
        total_error += np.sum(
            levenshtein(ground_truth_to_word(batch_y),
                        ground_truth_to_word(decoded)))

        # Single-argument print call: same output on Python 2 and 3.
        print("Error on test set: {}".format(total_error / example_count))
    return None
def getByDistance(self, name, tolerance = 10):
    """Return the version whose name is closest to ``name``.

    Names are compared case-insensitively by Levenshtein distance. Only
    versions strictly closer than ``tolerance`` qualify; among equally close
    versions the one appearing first in ``self.versions`` wins. Returns None
    when no version is close enough.
    """
    best_version = None
    best_distance = 999
    for candidate in self.versions:
        distance = levenshtein(candidate.name.lower(), name.lower())
        # Skip anything outside the tolerance or no better than the current
        # best (a tie keeps the earlier version).
        if distance >= tolerance or distance >= best_distance:
            continue
        best_distance = distance
        best_version = candidate
    return best_version
def getByDistance(self, name, tolerance=10):
    """Return the version whose name is closest to ``name``.

    Names are compared case-insensitively by Levenshtein distance. Only
    versions strictly closer than ``tolerance`` qualify; among equally close
    versions the one appearing first in ``self.versions`` wins. Returns None
    when no version is close enough.
    """
    best_version = None
    best_distance = 999
    for candidate in self.versions:
        distance = levenshtein(candidate.name.lower(), name.lower())
        # Skip anything outside the tolerance or no better than the current
        # best (a tie keeps the earlier version).
        if distance >= tolerance or distance >= best_distance:
            continue
        best_distance = distance
        best_version = candidate
    return best_version
def printSearch(search_package, maxDist=2):
    """Log the package called ``search_package`` or, failing that, similar ones.

    When the term contains a ``/`` it is compared against package paths,
    otherwise against package names. An exact, case-insensitive match is
    reported on its own; otherwise packages whose path contains the term,
    whose name/path is within ``maxDist`` edits of it, or whose description
    or tags contain it are listed, closest first.
    """
    searchPackageLower = search_package.lower()
    isPath = "/" in searchPackageLower
    with CraftTimer.Timer("Search", 0):
        similar = []
        match = None
        try:
            package_re = re.compile(f".*{search_package}.*", re.IGNORECASE)
        except re.error:
            # The term is not a valid regular expression (e.g. "c++"):
            # match it literally instead of aborting the search.
            package_re = re.compile(f".*{re.escape(search_package)}.*",
                                    re.IGNORECASE)

        for searchPackage in packages():
            packageString = searchPackage.path if isPath else searchPackage.name
            # The length difference is a cheap lower bound on the edit
            # distance; compute the real distance only when it could still
            # be within maxDist.
            levDist = abs(len(searchPackageLower) - len(packageString))
            if levDist <= maxDist:
                levDist = utils.levenshtein(searchPackageLower,
                                            packageString.lower())
            if levDist == 0:
                match = (levDist, searchPackage)
                break
            elif package_re.match(searchPackage.path):
                # Substring hits rank ahead of pure edit-distance hits.
                similar.append((levDist - maxDist, searchPackage))
            elif len(packageString) > maxDist and levDist <= maxDist:
                similar.append((levDist, searchPackage))
            else:
                if package_re.match(searchPackage.description) or \
                        package_re.match(searchPackage.tags):
                    # Description/tag-only hits go last.
                    similar.append((100, searchPackage))

        if match is None:
            if similar:
                CraftCore.log.info(
                    f"Craft was unable to find {search_package}, similar packages are:"
                )
                similar.sort(key=lambda x: x[0])
            else:
                CraftCore.log.info(
                    f"Craft was unable to find {search_package}")
        else:
            CraftCore.log.info(f"Package {search_package} found:")
            similar = [match]

        for levDist, searchPackage in similar:
            CraftCore.log.debug((vars(searchPackage), levDist))
            CraftCore.log.info(searchPackage)
def pois_v1():
    """Handle a point-of-interest search request and return JSON.

    Query parameters read from ``request.query``:

    * ``filter`` (required): the search text; the request is aborted with
      status 501 when it is missing.
    * ``municipality`` (optional): restricts results to that municipality
      when its lower-cased name is in ``municipalities_set``.
    * ``resultcount`` (optional): when given and not -1, results are sorted
      by edit distance between the filter and each result's ``name`` and
      truncated to that many entries.
    """
    global _db
    raw_filter = request.query.get('filter', None)
    # Check for a missing filter *before* decoding: unicode(None, ...) raises
    # TypeError, which previously made this abort unreachable.
    if raw_filter is None:
        abort(501, "Unfiltered searches not allowed.")
    filter_s = unicode(raw_filter, encoding='utf-8')

    result = search(database=_db, verbose=False, query=filter_s)

    municipality = request.query.get('municipality', None)
    if municipality:
        if municipality.lower() in municipalities_set:
            municipality_key = None
            for k, v in municipalities.iteritems():
                if v.lower() == municipality.lower():
                    municipality_key = k
                    break
            if not municipality_key:
                abort(501, "Unknown municipality: %s." % municipality)
            else:
                result = [r for r in result
                          if r['municipality_id'] == municipality_key]

    try:
        result_count = int(request.query.get('resultcount', -1))
        if int(result_count) != -1:
            for r in result:
                distance = levenshtein(filter_s, r['name'])
                r['edit_distance'] = distance
            result.sort(key=lambda x: x['edit_distance'])
            result = list(islice(result, result_count))
    except Exception:
        # Any failure while applying resultcount is reported as a bad
        # parameter, as before, but KeyboardInterrupt / SystemExit are no
        # longer swallowed.
        abort(501, "Cannot parse resultcount:%s." %
              request.query.get('resultcount'))

    response.content_type = 'application/json'
    return json.dumps(result, ensure_ascii=False)
def test(self):
    """Evaluate all test batches and print the mean edit distance per example.

    Each batch yielded by the data manager is decoded by running the decoder
    in the session and compared against the ground truth with
    ``levenshtein``. Returns ``None``.
    """
    with self.__session.as_default():
        print('Testing')

        total_error = 0
        example_count = 0
        for batch_y, batch_sl, batch_x in self.__data_manager.get_next_test_batch(
        ):
            # Not fed to the graph; kept because the calls were part of the
            # original flow and their side effects (if any) are unknown here.
            data_targets = np.asarray([
                label_to_array(lbl, config.CHAR_VECTOR) for lbl in batch_y
            ])
            data_targets = sparse_tuple_from(data_targets)

            decoded = self.__session.run([self.__decoded],
                                         feed_dict={
                                             self.__inputs: batch_x,
                                             self.__seq_len: batch_sl
                                         })

            example_count += len(batch_y)
            total_error += np.sum(
                levenshtein(ground_truth_to_word(batch_y),
                            ground_truth_to_word(decoded)))

        # The format string has a single placeholder, so pass only the
        # per-example error; it was previously computed but never shown.
        print('Error on test set: {}'.format(total_error / example_count))
    return None
def main():
    """Transcribe a random sample of audio clips and report CER/WER/SER.

    Steps, all driven by the hard-coded paths and settings below:

    1. pick up to ``samples`` random files from ``audiofolder``;
    2. load the speech model (and language model) and write one transcript
       per clip into ``transcription_folder``;
    3. load the reference and transcript texts, accumulate character-, word-
       and sentence-level error statistics via ``levenshtein``;
    4. print the rates and append them to ``result_file``.
    """
    model = '../models/output_graph.pbmm'
    alphabet = '../models/alphabet.txt'
    lm = '../models/lm.binary'
    trie = '../models/trie'

    samples = 200
    # '/Volumes/Seagate/Dataset/Coffee Shop/snr0/LibriSpeech/test_clean/wav'
    snr = 20
    audiofolder = '/Volumes/Seagate/Dataset/Coffee Shop/snr' + str(
        snr) + '/librispeech_orig_cropped/test_clean/wav'
    transcription_folder = '/Users/shibozhang/Documents/Course/DeepLearningTopics_496/dataset/Coffee Shop/snr' + str(
        snr) + '/LibriSpeech/test_clean/transcripts/'
    reference_folder = '/Users/shibozhang/Documents/Course/DeepLearningTopics_496/dataset/LibriSpeech_dataset/raw/test_clean/txt/'
    result_file = '/Users/shibozhang/Documents/Course/DeepLearningTopics_496/dataset/results.txt'

    if not os.path.exists(transcription_folder):
        os.makedirs(transcription_folder)

    audio_files = list_files_in_directory(audiofolder)
    print('number of audio clips: ', str(len(audio_files)))
    # Random subset of at most `samples` clips.
    random.shuffle(audio_files)
    audio_files = audio_files[0:samples]

    audio_list = [os.path.join(audiofolder, i) for i in audio_files]
    # Transcript file name: the audio name with its last 4 characters
    # (presumably the ".wav" extension) replaced by ".txt".
    savefiles = [
        os.path.join(transcription_folder, i[:-4] + '.txt')
        for i in audio_files
    ]

    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    for audio, savefile in zip(audio_list, savefiles):
        # `audio` starts as the file path and is rebound to the sample data.
        fin = wave.open(audio, 'rb')
        fs = fin.getframerate()
        if fs != 16000:
            print(
                'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
                .format(fs),
                file=sys.stderr)
            fs, audio = convert_samplerate(audio)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

        # NOTE(review): the duration is computed with a fixed 16000 Hz rate
        # even when the file had a different frame rate - confirm.
        audio_length = fin.getnframes() * (1 / 16000)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        transcription = ds.stt(audio, fs)
        print(transcription)
        # NOTE(review): not closed if the write raises; a `with` block would
        # be safer.
        textfile = open(savefile, 'w')
        textfile.write(transcription)
        textfile.close()
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)

    # Only the 'files' branch below is reachable with this setting.
    input_source = 'files'
    # input_source = 'str'
    separator = '\t'
    encoding = 'utf-8'

    # Both lists are built from the files present in transcription_folder,
    # so they pair up by construction.
    references = [
        os.path.join(reference_folder, i)
        for i in list_files_in_directory(transcription_folder)
    ]
    transcriptions = [
        os.path.join(transcription_folder, i[:-4] + '.txt')
        for i in list_files_in_directory(transcription_folder)
    ]
    print(len(references))
    print(len(transcriptions))
    # exit()
    # references, transcriptions = _intersection(references, transcriptions)

    ref, hyp = [], []
    if input_source == 'str':
        # NOTE(review): `reference` is never assigned in this function; this
        # branch (and the 'file' one) would raise NameError if selected.
        ref.append(reference.decode(encoding))
        hyp.append(transcription.decode(encoding))
    elif input_source == '-':
        # Tab-separated "reference<TAB>hypothesis" pairs on stdin.
        line_n = 0
        for line in sys.stdin:
            line_n += 1
            line = line.rstrip('\n').rstrip('\r').decode(encoding)
            fields = line.split(separator)
            if len(fields) != 2:
                logging.warning('Line %d has %d fields but 2 were expected',
                                line_n, len(fields))
                continue
            ref.append(fields[0])
            hyp.append(fields[1])
    elif input_source == 'file':
        ref = load_file(reference, encoding)
        hyp = load_file(transcription, encoding)
        if len(ref) != len(hyp):
            logging.error(
                'The number of reference and transcription sentences does not '
                'match (%d vs. %d)', len(ref), len(hyp))
            exit(1)
    elif input_source == 'files':
        ref = load_file_batch(references, encoding)
        hyp = load_file_batch(transcriptions, encoding)
        if len(ref) != len(hyp):
            logging.error(
                'The number of reference and transcription sentences does not '
                'match (%d vs. %d)', len(ref), len(hyp))
            exit(1)
    else:
        logging.error('INPUT FROM "%s" NOT IMPLEMENTED', input_source)
        exit(1)

    # Counters: substitutions, insertions, deletions and reference length,
    # for characters (cer_*) and words (wer_*).
    wer_s, wer_i, wer_d, wer_n = 0, 0, 0, 0
    cer_s, cer_i, cer_d, cer_n = 0, 0, 0, 0
    sen_err = 0
    for n in range(len(ref)):
        if n % 100 == 0:
            print('processing {}'.format(n))
        # update CER statistics
        _, (s, i, d) = levenshtein(ref[n], hyp[n])
        cer_s += s
        cer_i += i
        cer_d += d
        cer_n += len(ref[n])
        # update WER statistics
        _, (s, i, d) = levenshtein(ref[n].split(), hyp[n].split())
        wer_s += s
        wer_i += i
        wer_d += d
        wer_n += len(ref[n].split())
        # update SER statistics (uses the word-level counts from just above)
        if s + i + d > 0:
            sen_err += 1

    if cer_n > 0:
        print('CER: %g%%, WER: %g%%, SER: %g%%' %
              ((100.0 * (cer_s + cer_i + cer_d)) / cer_n,
               (100.0 * (wer_s + wer_i + wer_d)) / wer_n,
               (100.0 * sen_err) / len(ref)))

        # save results
        textfile = open(result_file, 'a')
        textfile.write('\n\n' + transcription_folder)
        textfile.write('\nCER: %g%%, WER: %g%%, SER: %g%%' %
                       ((100.0 * (cer_s + cer_i + cer_d)) / cer_n,
                        (100.0 * (wer_s + wer_i + wer_d)) / wer_n,
                        (100.0 * sen_err) / len(ref)))
        textfile.close()
def test_levenshtein(self):
    """"kitten" -> "kitchen" takes two edits."""
    self.assertEqual(levenshtein("kitten", "kitchen"), 2)
def generate(self, question, babelNetCache):
    """Answer ``question`` by matching it against the known question patterns.

    Patterns are strings in which "X" and/or "Y" mark where a concept goes.
    All patterns are ordered by edit distance to ``question``; for each, the
    text standing where X/Y would be is cut out of the question and compared
    with the ``c1``/``c2`` fields of every knowledge-base entry. The
    ``answer`` of the first matching entry is returned.

    Returns "I don't understand." when nothing matches.
    """
    # Q holds the patterns sorted by ascending distance; l holds the matching
    # distances at the same positions.
    Q = []
    l = []
    for r in self.question_patterns.relation_to_questions:
        for q_p in self.question_patterns[r]:
            d = utils.levenshtein(question, q_p)
            # Insertion sort: place before the first entry that is not
            # closer than this pattern, or at the end.
            q_p_pos = len(l)
            for k in range(len(l)):
                if d <= l[k]:
                    q_p_pos = k
                    break
            Q.insert(q_p_pos, q_p)
            l.insert(q_p_pos, d)

    # Consider first best T matches (currently all of them):
    T = len(Q)
    for q in Q[:T]:
        Xpos = q.find("X")
        Ypos = q.find("Y")

        if Xpos != -1 and Ypos != -1:
            # Case -- X -- Y --?
            if Xpos < Ypos:
                beforeX = q[:Xpos]
                afterX = q[Xpos+1:Ypos]
                afterY = q[Ypos+1:]
                conceptX_begin_idx = -1
                # Concept X ends where the text between X and Y begins.
                pp_afterx = question[Xpos:].find(afterX)
                if pp_afterx == -1:
                    continue
                conceptX_end_idx = Xpos + pp_afterx
                conceptY_begin_idx = -1
                conceptY_end_idx = question.find(afterY)
                if question.find(beforeX) != -1:
                    # Assumes the question starts with the pattern's prefix,
                    # so X begins at the same offset as in the pattern.
                    conceptX_begin_idx = Xpos # = len(beforeX)
                    conceptY_begin_idx = conceptX_end_idx + len(afterX)
            # Case -- Y -- X --?
            else:
                beforeY = q[:Ypos]
                afterY = q[Ypos+1:Xpos]
                afterX = q[Xpos+1:]
                conceptY_begin_idx = -1
                pp_aftery = question[Ypos:].find(afterY)
                if pp_aftery == -1:
                    continue
                conceptY_end_idx = Ypos + pp_aftery
                conceptX_begin_idx = -1
                conceptX_end_idx = question.find(afterX)
                if question.find(beforeY) != -1:
                    conceptY_begin_idx = Ypos # = len(beforeY)
                    conceptX_begin_idx = conceptY_end_idx + len(afterY)

            # Any boundary that could not be located rules this pattern out.
            if conceptX_begin_idx == -1 or conceptX_end_idx == -1 or conceptY_begin_idx == -1 or conceptY_end_idx == -1:
                continue
            conceptX = question[conceptX_begin_idx:conceptX_end_idx].lower()
            conceptY = question[conceptY_begin_idx:conceptY_end_idx].lower()

        # Only X in the question:
        elif Ypos == -1:
            beforeX = q[:Xpos]
            afterX = q[Xpos+1:]
            concept_begin_idx = -1
            concept_end_idx = -1
            if question.find(beforeX) != -1:
                concept_begin_idx = Xpos # = len(beforeX)
            if question.find(afterX) != -1:
                # Assumes the question ends with the pattern's suffix.
                concept_end_idx = len(question) - len(afterX)
            if concept_begin_idx == -1 or concept_end_idx == -1:
                continue
            conceptX = question[concept_begin_idx:concept_end_idx].lower()

        # Only Y in the question:
        elif Xpos == -1:
            beforeY = q[:Ypos]
            afterY = q[Ypos+1:]
            concept_begin_idx = -1
            concept_end_idx = -1
            if question.find(beforeY) != -1:
                concept_begin_idx = Ypos
            if question.find(afterY) != -1:
                concept_end_idx = len(question) - len(afterY)
            if concept_begin_idx == -1 or concept_end_idx == -1:
                continue
            conceptY = question[concept_begin_idx:concept_end_idx].lower()

        for elem in self.knowledgeBase.kb:
            matchX = False
            matchY = False

            # X in the question:
            if Xpos != -1:
                c1 = elem["c1"]
                if c1.count("bn:") >= 2:
                    # Entries holding more than one "bn:" id are skipped.
                    pass
                elif "::" in c1:
                    # "word::..." form: compare the part before "::".
                    idx = c1.index("::")
                    w = c1[:idx].lower()
                    if conceptX == w:
                        matchX = True
                elif "bn:" in c1:
                    # Resolve the "bn:" id through the cache.
                    # NOTE(review): the bare except also hides errors other
                    # than a missing cache entry.
                    try:
                        bn_conceptx = babelNetCache.cache[c1[c1.index("bn:"):]].lower()
                        if conceptX == bn_conceptx:
                            matchX = True
                    except:
                        pass
                elif c1.lower() == conceptX:
                    matchX = True

            # Y in the question:
            if Ypos != -1:
                c2 = elem["c2"]
                if c2.count("bn:") >= 2:
                    pass
                elif "::" in c2:
                    idx = c2.index("::")
                    w = c2[:idx].lower()
                    if conceptY == w:
                        matchY = True
                elif "bn:" in c2:
                    try:
                        bn_concepty = babelNetCache.cache[c2[c2.index("bn:"):]].lower()
                        if conceptY == bn_concepty:
                            matchY = True
                    except:
                        pass
                elif c2.lower() == conceptY:
                    matchY = True

            # With both placeholders both concepts must match; with a single
            # placeholder one match is enough.
            if Xpos != -1 and Ypos != -1:
                if matchX == True and matchY == True:
                    return elem["answer"]
            elif matchX == True or matchY == True:
                return elem["answer"]

    return "I don't understand."
def run(self):
    """Worker loop: fingerprint queued songs and update their metadata.

    While ``self.running`` is true, ``(path, title, artist, album)`` tuples
    are taken from ``self.queue``. Each file is fingerprinted with
    ``/usr/bin/fpcalc``, the fingerprint is looked up at api.acoustid.org, and
    for results scoring at least 0.7 the song row matching ``path`` gets its
    ``puid``/``mbid`` (and, when it differs, its title) updated.

    Does nothing when ``self.fpcalc`` is falsy. The database connection
    opened here is closed when the loop ends.
    """
    if not self.fpcalc:
        return
    logging.debug("fpcalc: %s" % self.fpcalc)
    self.db = dbapi.connect(self.dbpath)
    # lastrelease = ""
    lastdata = []
    lastquery = ""
    laststatus = 0
    # Bookkeeping for the request throttle below.
    starttime = time()
    stoptime = starttime + 1
    requests = 0
    while self.running:
        try:
            path, title, artist, album = self.queue.get()
        except Empty as e:
            logging.warning(e)
            continue
        except Exception as e:
            logging.error(e)
            continue
        if not path or not album:
            logging.warning("No path/album name provided")
            continue
        # Pause when the recent request rate exceeds 3 per second.
        if requests / (stoptime - starttime) > 3:
            sleep(1)
            starttime = stoptime
        logging.info("Getting infos for %s %s" % (artist, album))
        fingerprint = ''
        duration = 0
        try:
            logging.info("Analyzing %s file" % path)
            if self.fpcalc:
                logging.debug("fingerprint for %s" % path)
                # NOTE(review): the binary path is hard-coded even though
                # self.fpcalc is checked - confirm.
                fpcalc_process = subprocess.Popen(
                    ["/usr/bin/fpcalc", path],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                fpcalc_output = fpcalc_process.communicate()[0].split('\n')
                # Output lines 2 and 3, with 9 and 12 leading characters cut
                # (presumably the "DURATION=" / "FINGERPRINT=" prefixes).
                duration = fpcalc_output[1][9:]
                fingerprint = fpcalc_output[2][12:]
        except Exception as e:
            logging.error(e)
        if fingerprint:
            query = u"/v2/lookup?" \
                "client=8XaBELgH" \
                "&meta=recording+releasegroups" \
                "+tracks+puids+usermeta+compress" \
                "&duration=%s&format=json&fingerprint=%s" % \
                (duration, fingerprint)
            # NOTE(review): a repeated query is only logged; the request is
            # still sent below.
            if query == lastquery and laststatus == 200:
                logging.info("Same request already occurred - skipping")
            try:
                conn = HTTPConnection("api.acoustid.org", 80)
                conn.request("GET", query)
                response = conn.getresponse()
            except:
                continue
            puid = ""
            mb_title = ""
            mb_artists = ""
            if response.status != 200:
                continue
            try:
                lastquery = query
                laststatus = 200
                results = json.loads(response.read())
                # Only the first result is considered.
                lastdata = results["results"][0]
                logging.debug(lastdata)
                release = "releasegroups" in lastdata \
                    and len(lastdata) and lastdata["releasegroups"][0]
                recording = "recordings" in lastdata \
                    and len(lastdata) and lastdata["recordings"][0]
                score = "score" in lastdata and lastdata['score']
                logging.debug(release)
                logging.debug(recording)
                if len(lastdata):
                    logging.debug("%s results found" % len(lastdata))
                    puid = 'puids' in lastdata and lastdata["puids"][0]
                    # Prefer release-group data, fall back to the recording.
                    # NOTE(review): `recording` is already the first
                    # recording, yet it is indexed with [0] again - confirm.
                    mbid = release and release['id'] \
                        or recording \
                        and recording[0]['releasegroups'][0]['id']
                    mb_title = release and release["title"] \
                        or recording \
                        and recording[0]['title']
                    mb_artists = " ".join([
                        i['name']
                        for i in (release and release["artists"]
                                  or recording and recording[0]['artists'])
                    ])
                # NOTE(review): the body was already consumed by
                # json.loads(response.read()) above.
                logging.debug("Response status: %d %s" %
                              (response.status, response.read()))
            except Exception as e:
                continue
                # NOTE(review): unreachable - the `continue` above runs
                # first, so the error is never logged.
                logging.error(e)
            stoptime = time()
            requests = (requests + 1) % 3
            # Ignore low-confidence matches.
            if score < 0.7:
                continue
            # Normalised distances (0 = identical) between the stored and the
            # looked-up title / artist.
            if len(title) == len(mb_title):
                title_distance = hamming(title, mb_title) / float(
                    len(title))
            else:
                title_distance = levenshtein(title, mb_title) / float(
                    max(len(title), len(mb_title)))
            if len(artist) == len(mb_artists):
                author_distance = hamming(artist, mb_artists) / float(
                    len(artist))
            else:
                author_distance = levenshtein(artist, mb_artists) / float(
                    max(len(artist), len(mb_artists)))
            # if title_distance > 0.33 and author_distance > 0.5:
            logging.debug("distances: %s %s %s" % (score, title_distance,
                                                   author_distance))
            # continue
            logging.debug("puid: %s, mbid %s" % (puid, mbid))
            with self.condition:
                try:
                    song_id, album_id = self.db.execute(
                        "select id, album_id from song "
                        "where path = ?;", (path, )).fetchone()
                    self.db.execute(
                        "update song set puid = ?, mbid = ? "
                        "where id = ?", (puid, mbid, song_id))
                    # Adopt the looked-up title whenever it differs.
                    if title_distance > 0:
                        self.db.execute(
                            "update song set title = ? "
                            "where id = ?", (mb_title, song_id))
                    self.db.commit()
                except Exception as e:
                    logging.error(e)
    self.db.close()
def run(self):
    """Fingerprint queued songs and tag them from an AcoustID lookup.

    Worker loop. While ``self.running`` is true it takes
    ``(path, title, artist, album)`` tuples from ``self.queue``, runs
    ``fpcalc`` on ``path``, sends the fingerprint to ``api.acoustid.org``
    and, when the best result scores at least 0.7, stores ``puid`` and
    ``mbid`` (and the remote title, if it differs from the local one) in
    the ``song`` table of the database at ``self.dbpath``.

    Returns immediately when ``self.fpcalc`` is falsy.  The database
    connection is stored in ``self.db`` and is closed when the loop ends,
    including when it ends because of an unexpected exception.
    """
    if not self.fpcalc:
        return
    logging.debug("fpcalc: %s" % self.fpcalc)

    def _relative_distance(local, remote):
        """Return the distance between two strings scaled to their length."""
        longest = max(len(local), len(remote))
        if not longest:
            # Both strings are empty: identical, and nothing to divide by.
            return 0.0
        if len(local) == len(remote):
            return hamming(local, remote) / float(longest)
        return levenshtein(local, remote) / float(longest)

    self.db = dbapi.connect(self.dbpath)
    lastdata = []
    lastquery = ""
    laststatus = 0
    starttime = time()
    stoptime = starttime + 1
    requests = 0
    try:
        while self.running:
            try:
                path, title, artist, album = self.queue.get()
            except Empty as e:
                logging.warning(e)
                continue
            except Exception as e:
                logging.error(e)
                continue
            if not path or not album:
                logging.warning("No path/album name provided")
                continue

            # Throttle: pause when the request counter divided by the time
            # since the window start exceeds 3.  The window can be empty
            # (start == stop) right after a throttle followed by a skipped
            # song, so guard the division.
            elapsed = stoptime - starttime
            if elapsed > 0 and requests / elapsed > 3:
                sleep(1)
                starttime = stoptime

            logging.info("Getting infos for %s %s" % (artist, album))
            fingerprint = ''
            duration = 0
            try:
                logging.info("Analyzing %s file" % path)
                if self.fpcalc:
                    logging.debug("fingerprint for %s" % path)
                    # NOTE(review): the executable path is hard-coded here
                    # although self.fpcalc was checked above - confirm.
                    fpcalc_process = subprocess.Popen(
                        ["/usr/bin/fpcalc", path],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    fpcalc_output = \
                        fpcalc_process.communicate()[0].split('\n')
                    # Presumably strips the "DURATION=" / "FINGERPRINT="
                    # prefixes of output lines 2 and 3 - TODO confirm.
                    duration = fpcalc_output[1][9:]
                    fingerprint = fpcalc_output[2][12:]
            except Exception as e:
                logging.error(e)
            if not fingerprint:
                # Nothing to look up; never reuse a query from an earlier
                # song.
                logging.warning("No fingerprint for %s" % path)
                continue

            query = u"/v2/lookup?" \
                "client=8XaBELgH" \
                "&meta=recording+releasegroups" \
                "+tracks+puids+usermeta+compress" \
                "&duration=%s&format=json&fingerprint=%s" % \
                (duration, fingerprint)
            if query == lastquery and laststatus == 200:
                # NOTE(review): despite the message the lookup below is
                # still performed; left unchanged on purpose.
                logging.info("Same request already occurred - skipping")

            conn = None
            try:
                conn = HTTPConnection("api.acoustid.org", 80)
                conn.request("GET", query)
                response = conn.getresponse()
                status = response.status
                # Read the body exactly once; a second read() would return
                # nothing.
                body = response.read()
            except Exception as e:
                logging.error(e)
                continue
            finally:
                if conn is not None:
                    conn.close()

            puid = ""
            mbid = ""
            mb_title = ""
            mb_artists = ""
            if status != 200:
                continue
            try:
                lastquery = query
                laststatus = 200
                results = json.loads(body)
                lastdata = results["results"][0]
                logging.debug(lastdata)
                release = "releasegroups" in lastdata \
                    and len(lastdata) and lastdata["releasegroups"][0]
                recording = "recordings" in lastdata \
                    and len(lastdata) and lastdata["recordings"][0]
                score = "score" in lastdata and lastdata['score']
                logging.debug(release)
                logging.debug(recording)
                if len(lastdata):
                    logging.debug("%s results found" % len(lastdata))
                    puid = 'puids' in lastdata and lastdata["puids"][0]
                    # NOTE(review): `recording` is already element 0 of
                    # "recordings"; indexing it with [0] again raises if
                    # it is a dict, and the song is then skipped by the
                    # handler below.  Confirm against the AcoustID
                    # response layout.
                    mbid = release and release['id'] \
                        or recording \
                        and recording[0]['releasegroups'][0]['id']
                    mb_title = release and release["title"] \
                        or recording \
                        and recording[0]['title']
                    mb_artists = " ".join(
                        [i['name'] for i in
                         (release and release["artists"] or
                          recording and recording[0]['artists'])])
                logging.debug(
                    "Response status: %d %s" % (status, body))
            except Exception as e:
                # Log first: the original `continue` made this unreachable.
                logging.error(e)
                continue

            stoptime = time()
            requests = (requests + 1) % 3
            if score < 0.7:
                continue

            title_distance = _relative_distance(title, mb_title)
            author_distance = _relative_distance(artist, mb_artists)
            logging.debug(
                "distances: %s %s %s" % (score, title_distance,
                                         author_distance))
            logging.debug("puid: %s, mbid %s" % (puid, mbid))

            with self.condition:
                try:
                    song_id, album_id = self.db.execute(
                        "select id, album_id from song "
                        "where path = ?;", (path,)).fetchone()
                    self.db.execute(
                        "update song set puid = ?, mbid = ? "
                        "where id = ?", (puid, mbid, song_id))
                    if title_distance > 0:
                        self.db.execute(
                            "update song set title = ? "
                            "where id = ?", (mb_title, song_id))
                    self.db.commit()
                except Exception as e:
                    logging.error(e)
    finally:
        self.db.close()
def get_performace(dataset_name, data_set, correct_dir, incorrect_dir):
    """Evaluate the recogniser on ``data_set`` and dump every sample.

    Relies on the module-level ``crnn``, ``generator_g``, ``encoder`` and
    ``device`` objects.  For each sample the input image (``.png``) and the
    generated image (``.jpg``) are written below
    ``<correct_dir>/<dataset_name>`` when the lower-cased prediction equals
    the lower-cased target, otherwise below
    ``<incorrect_dir>/<dataset_name>``.  A tab-separated line (index, image
    path, target text, predicted text) is appended to the
    ``<dataset_name>.results`` file of the same directory.

    Args:
        dataset_name: name used for the output sub-directories, the result
            files and the progress display.
        data_set: dataset handed to ``torch.utils.data.DataLoader``; must
            provide ``collate_fn`` yielding
            ``(inputs, targets, synths, lengths, imgpaths)`` batches.
        correct_dir: root directory for correctly recognised samples.
        incorrect_dir: root directory for misrecognised samples.

    Returns:
        ``(acc, mean_edit_distance, confusion_mat)``: the fraction of
        case-insensitive exact matches, the summed Levenshtein distance of
        the mismatches divided by the number of processed samples, and the
        dictionary accumulated by ``utils.get_confusion_matrix``.  The two
        averages stay at zero when the dataset yields no samples.
    """
    confusion_mat = dict()
    acc = 0.0
    mean_edit_distance = 0

    # One output directory per dataset; makedirs also creates missing
    # parents, which plain mkdir would refuse to do.
    correct_out_path = os.path.join(correct_dir, dataset_name)
    incorrect_out_path = os.path.join(incorrect_dir, dataset_name)
    for out_path in (correct_out_path, incorrect_out_path):
        if not os.path.exists(out_path):
            os.makedirs(out_path)

    file_idx = 0
    num_processed_data = 0

    # Context managers close both result files even if evaluation fails.
    with open(os.path.join(correct_out_path, dataset_name + '.results'),
              'w') as correct_results_out, \
            open(os.path.join(incorrect_out_path,
                              dataset_name + '.results'),
                 'w') as incorrect_results_out:
        data_loader = torch.utils.data.DataLoader(
            data_set, batch_size=128, shuffle=False, num_workers=0,
            collate_fn=data_set.collate_fn)
        with torch.set_grad_enabled(False):
            for batch_idx, (inputs, targets, synths, lengths,
                            imgpaths) in enumerate(data_loader):
                num_processed_data += inputs.size(0)
                sys.stdout.write('\r' + str(dataset_name) + ': ' +
                                 str(num_processed_data) + '/' +
                                 str(len(data_set)))

                device_inputs = inputs.to(device)
                preds = crnn(device_inputs)
                preds_steps = torch.tensor(
                    [preds.size(0)] * preds.size(1), dtype=torch.int32)
                values, indices = preds.max(2)
                indices = indices.transpose(1, 0).contiguous().view(-1)

                # Interleave every target symbol with a 0 and double the
                # lengths to match.  Presumably this keeps encoder.decode
                # from merging repeated characters - TODO confirm.
                blank_targets = torch.empty(targets.size(0) * 2,
                                            dtype=torch.int32)
                for idx in range(targets.size(0)):
                    blank_targets[2 * idx] = targets[idx]
                    blank_targets[2 * idx + 1] = 0
                for idx in range(lengths.size(0)):
                    lengths[idx] = lengths[idx].item() * 2

                target_texts = encoder.decode(blank_targets, lengths)
                pred_texts = encoder.decode(indices, preds_steps)
                pred_synths = generator_g(crnn.encoder(device_inputs)).cpu()

                for idx in range(len(target_texts)):
                    lower_pred_text = pred_texts[idx].lower()
                    lower_target_text = target_texts[idx].lower()

                    # Scale to 0..255 and move axis 0 last before writing.
                    synth = pred_synths[idx].numpy() * 255
                    synth = np.transpose(synth, (1, 2, 0)).astype(np.uint8)

                    if lower_pred_text == lower_target_text:
                        acc += 1.0
                        out_path = correct_out_path
                        results_out = correct_results_out
                    else:
                        mean_edit_distance += utils.levenshtein(
                            lower_pred_text, lower_target_text)
                        out_path = incorrect_out_path
                        results_out = incorrect_results_out

                    str_idx = utils.idx_to_str(file_idx)
                    tmp_img = np.transpose(inputs[idx].numpy() * 255,
                                           (1, 2, 0))
                    cv2.imwrite(os.path.join(out_path, str_idx + '.png'),
                                tmp_img)
                    cv2.imwrite(os.path.join(out_path, str_idx + '.jpg'),
                                synth)
                    results_out.write(str_idx + '\t' + imgpaths[idx] + '\t' +
                                      target_texts[idx] + '\t' +
                                      pred_texts[idx] + '\n')
                    file_idx += 1

                confusion_mat = utils.get_confusion_matrix(
                    preds=pred_texts, targets=target_texts,
                    confusion_dict=confusion_mat)

    # An empty dataset would otherwise divide by zero.
    if num_processed_data:
        acc /= float(num_processed_data)
        mean_edit_distance /= float(num_processed_data)
    print("")
    print("num. of data: " + str(num_processed_data))
    return acc, mean_edit_distance, confusion_mat
def validation(epoch, network, batchSize, set_name, Set, imageHeight,
               imageWidth, labels, num_classes, log_indicator, models_path,
               valid_writer, AACHEN_init=False, AACHEN_h5_file=[],
               dataAugmentation=False):
    """Run one evaluation pass over ``Set`` and report its error rates.

    The graph is built with ``network.create``.  Weights come either from
    ``initialize_from_AACHEN`` (when ``AACHEN_init`` is true) or from the
    latest checkpoint in ``models_path``.  Every batch is decoded, each
    sample's target and output are printed and written to
    ``log_indicator``, and word errors are accumulated with
    ``levenshtein`` over the ``'|'``-separated tokens.  At the end the last
    batch summary goes to ``valid_writer`` and the totals to
    ``print_valid_results``.

    Args:
        epoch: epoch number, used in messages and as the summary step.
        network: object whose ``create(height, width, num_classes, True)``
            returns the graph, saver, placeholders and ops unpacked below.
        batchSize: number of samples requested from ``get_batch``.
        set_name: name of the set, used in messages.
        Set: 7-tuple ``(names, inputs, targets, seqLengths, heights,
            transcriptions, transcriptionLengths)``.
        imageHeight, imageWidth: dimensions passed to ``network.create``
            and used for the optional mask.
        labels: dict mapping class index to symbol.  It is modified:
            ``labels[num_classes]`` is set to ``' '`` for padding cells.
        num_classes: number of classes; also the padding value of the
            dense predictions.
        log_indicator: writable log object.
        models_path: checkpoint directory.
        valid_writer: summary writer.
        AACHEN_init: initialise the weights from ``AACHEN_h5_file``.
        AACHEN_h5_file: forwarded to ``initialize_from_AACHEN`` (never
            mutated here, so the shared default list is harmless).
        dataAugmentation: when true, inputs go through ``pack_images``.

    Returns:
        None.
    """
    (nameList, inputs, targetList, seqLengths, heights, transcriptionList,
     transcriptionsLenList) = Set
    SetSize = len(nameList)
    n_batches = ceil(SetSize / batchSize)

    # get_batch is handed copies - presumably it consumes the lists it
    # receives; verify against get_batch.
    nameList_copy = list(nameList)
    inputs_copy = list(inputs)
    targetList_copy = list(targetList)
    seqLengths_copy = list(seqLengths)
    heights_copy = list(heights)
    transcriptionList_copy = list(transcriptionList)
    transcriptionsLenList_copy = list(transcriptionsLenList)
    if dataAugmentation:
        inputs_copy = pack_images(inputs_copy, imageHeight, imageWidth)

    setTotalChars = np.sum(transcriptionsLenList)
    EDnorm = 0
    EDabs = 0
    totalCost = 0

    (graph, saver, inputs_mask_ph, seq_len_ph, targets_ph, targets_len_ph,
     learning_rate_ph, n_batches_ph, setTotalChars_ph, previousEDabs_ph,
     previousEDnorm_ph, previousCost_ph, optimizer, batch_cost, cost,
     errors, ED, predictions, merged) = network.create(
         imageHeight, imageWidth, num_classes, True)

    # network.create may return [inputs, mask] placeholders and a list of
    # savers; only the first saver is used.
    has_mask = isinstance(inputs_mask_ph, list)
    if has_mask:
        mask_ph = inputs_mask_ph[1]
        inputs_ph = inputs_mask_ph[0]
    else:
        inputs_ph = inputs_mask_ph
    if isinstance(saver, list):
        saver = saver[0]

    with tf.Session(graph=graph) as sess:
        if AACHEN_init:
            text = ('\nInitializing weights from AACHEN framework\n')
            print(text)
            log_indicator.write(text)
            init = tf.global_variables_initializer()
            feed_dict = initialize_from_AACHEN(graph, AACHEN_h5_file,
                                               log_indicator)
            sess.run(init, feed_dict=feed_dict)
        else:
            saver.restore(sess=sess,
                          save_path=tf.train.latest_checkpoint(models_path))

        # Build the sparse->dense conversion once.  Creating it per batch
        # adds a new op (plus constants) to the graph on every iteration.
        dense_predictions = tf.sparse_tensor_to_dense(
            predictions[0], default_value=num_classes)
        # Padding cells carry index num_classes; render them as blanks.
        labels[num_classes] = ' '

        valid_start = time.time()
        prev_percent = -1
        text = '\n' * 4 + "Muestras epoch " + str(
            epoch) + " in " + set_name + " set.\n"
        print(text)
        log_indicator.write(text)

        word_errors = 0
        num_words = 0
        summary = None
        for batch in range(n_batches):
            (BatchNameList, BatchInputs, BatchTargetSparse, BatchSeqLengths,
             BatchHeights, BatchTranscriptions, BatchTransLen) = get_batch(
                 batchSize, nameList_copy, inputs_copy, targetList_copy,
                 seqLengths_copy, heights_copy, transcriptionList_copy,
                 transcriptionsLenList_copy)
            feed = {
                inputs_ph: BatchInputs,
                targets_ph: BatchTargetSparse,
                targets_len_ph: BatchTransLen,
                seq_len_ph: BatchSeqLengths,
                n_batches_ph: n_batches,
                setTotalChars_ph: setTotalChars,
                previousEDabs_ph: EDabs,
                previousEDnorm_ph: EDnorm,
                previousCost_ph: totalCost
            }
            if has_mask:
                # 1 inside each image's height x sequence length, 0 outside.
                mask = np.zeros(
                    [len(BatchNameList), imageHeight, imageWidth, 1])
                for img in range(len(BatchNameList)):
                    mask[img, :BatchHeights[img],
                         :BatchSeqLengths[img], 0] = 1
                feed[mask_ph] = mask

            (summary, batchCost, totalCost, [EDnorm, EDabs], BatchOutput,
             errors_output) = sess.run(
                 [merged, batch_cost, cost, ED, dense_predictions, errors],
                 feed)

            for ind in range(len(BatchNameList)):
                obtained_transcription = ' '.join(
                    list(map(labels.get, list(BatchOutput[ind])))).strip()
                text = (str('| Name:').ljust(10) +
                        str(BatchNameList[ind]).rjust(15) + ' | ' +
                        str("Target:").ljust(10) +
                        ''.join(BatchTranscriptions[ind]).rjust(100) +
                        " |\n" +
                        str('| Errors: ').ljust(10) +
                        str(errors_output[ind]).rjust(15) + ' | ' +
                        str("Output:").ljust(10) +
                        str(obtained_transcription).rjust(100) +
                        ' |\n' + '-' * 88 + '\n')
                print(text)
                log_indicator.write(text)

                # Words are the '|'-separated tokens once all whitespace
                # has been removed.
                target_words = ''.join(
                    BatchTranscriptions[ind].split()).split('|')
                output_words = ''.join(
                    obtained_transcription.split()).split('|')
                word_errors += levenshtein(target_words, output_words)
                num_words += len(target_words)

            batch_end = time.time()
            time_elapsed = floor(1000 * (batch_end - valid_start)) / 1000
            prev_percent = floor(10000 * (batch + 1) / n_batches) / 100
            remaining_time = floor(
                1000 * (100 * (time_elapsed + eps) /
                        (prev_percent + eps) - time_elapsed)) / 1000
            print('Epoch ' + str(epoch) + '. Evaluated ' +
                  str(len(BatchNameList)) + ' sequences in batch ' +
                  str(batch + 1) + '/' + str(n_batches) +
                  '. Cost Function: ' + str(batchCost) +
                  '.\nTime elapsed: ' +
                  seconds_to_days_hours_min_sec(time_elapsed) +
                  '. Remaining time: ' +
                  seconds_to_days_hours_min_sec(remaining_time) + '\n')
            print('[' + int(prev_percent) * '|' +
                  (100 - int(prev_percent)) * ' ' + '] ' +
                  str(prev_percent) + '%\n')

        # An empty set has no words and no summary; report zero instead of
        # failing.
        WER = word_errors / num_words if num_words else 0.0
        if summary is not None:
            valid_writer.add_summary(summary, epoch)
        print_valid_results(epoch, set_name, SetSize, totalCost,
                            [EDnorm, EDabs], WER, log_indicator)
batch_size = int(true_len.shape[0]) output = model(img) #[w, bs, 1782] torch.cuda.empty_cache() seq_len = torch.tensor([output.shape[0]] * output.shape[1]) loss = criterion(log_probs=output, targets=targets, input_lengths=seq_len, target_lengths=true_len) valid_loss += loss.cpu().item() output = output.detach().permute( 1, 0, 2) #[bs, seq_len, |vocs|+1(blank)] decoded, max_probs = decoder.decode(output, true_len) for i in range(targets.shape[0]): target = targets[i][:true_len[i]].cpu().numpy().tolist() distance = levenshtein(target, decoded[i]) target = ' '.join(list(map(str, target))) # target = decoder.convert_np_to_string(target) val_edit += distance val_len += true_len[i].cpu().item() if idx == 0: dist_list.append(distance) target_list.append(target) # writer.add_text('target', target_list[0]) pred = ' '.join(list(map(str, decoded[0]))) writer.add_text( 'Result', 'Decode: {} \n Target: {}'.format(pred, target_list[0]), epoch) # print('targets:',target_list[i]) # print('decoded:',decoded[i])
bashCommand = "/exp/sw/kaldi/tools/sctk/bin/rover -f 1 -a 0 -c {} -h {} ctm -h {} ctm -o {} -m maxconf".format( conf, service_ctm_path, ds2_ctm_path, output_ctm_path) process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) output, error = process.communicate() fname2transcript = get_transcripts(output_ctm_path) dump_dir = '../dumps/decode/{}/char/'.format(accent) with open(join(dump_dir, 'ref_wrds_{}.txt'.format(part))) as fd: refs = fd.read().splitlines() total_wer = 0.0 total_tokens = 0 for ref, fname in zip(refs, valid_fnames): hyp = fname2transcript[fname] total_wer += levenshtein(ref.split(), hyp.split()) total_tokens += len(ref.split()) curr_wer = total_wer / total_tokens if curr_wer < best_wer: best_wer = curr_wer best_conf = conf print('Best NULL Conf: {}, WER: {}'.format(best_conf, best_wer)) else: output_ctm_path = join(rover_dir, 'out_{}.ctm'.format(part)) bashCommand = "/exp/sw/kaldi/tools/sctk/bin/rover -f 1 -a 0 -c {} -h {} ctm -h {} ctm -o {} -m maxconf".format( args.conf, service_ctm_path, ds2_ctm_path, output_ctm_path) process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
gt = line.strip() if gt in train_vocab: continue if opt.lower: gt = gt.lower() pred = pred.lower() if opt.alnum: pattern = re.compile('[\W_]+') gt = pattern.sub('', gt) pred = pattern.sub('', pred) # pdb.set_trace() # gt = # print('before') if gt != pred: ww += 1 wc += levenshtein(gt, pred) word_lens.append(len(gt)) print(gt, pred, wc) tc += len(gt) tw += 1 else: for i, line in enumerate(f): if i % 2 == 0: pred = line.strip() else: gt = line.strip() gt = clean(gt) pred = clean(pred) gt_w = gt.split() pred_w = pred.split() for j in range(len(gt_w)):