Example #1
 async def choose(self, ctx, *, choices=""):
     """Choose between one of various supplied things.
     
     Syntax:
     
     * choose x, y, z - Choose between x, y, and z.
     """
     choices = choices.split(",")
     if len(choices) <= 1:
         raise commands.UserInputError(("Not enough choices! "
                                        "Separate choices with commas, e.g. "
                                        "`choose A cat, A bear, A python`"))
     # Eliminate leading and trailing whitespace.
     for index in range(0, len(choices)):
         choices[index] = choices[index].strip()
     # Are they all the same?
     if len(set(choices)) == 1:
         raise commands.UserInputError("They're all the same, I can't choose!")
     choice = None
     # Loaded choice. The program biases in favor of pythons.
     for distance in range(0, 3):
         for choice_loaded in choices:
             if utils.levenshtein("python", choice_loaded.lower()) == distance:
                 python = (f"{choice_loaded}, obviously",
                           f"{choice_loaded}, duh",
                           choice_loaded)
                 choice = systemrandom.choice(python)
                 break
         if choice:
             break
     # Couldn't find a python, so now the program actually chooses randomly.
     if not choice:
         choice = systemrandom.choice(choices)
     logger.info(f"Chose {choice}")
     await ctx.send(choice)
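Every example on this page calls an external levenshtein helper (utils.levenshtein or a bare levenshtein) that is not shown. For reference, here is a minimal sketch of the classic dynamic-programming edit distance these calls appear to assume (unit cost for insertions, deletions, and substitutions); because it works on generic sequences, it also handles the token lists used in some later examples.

def levenshtein(a, b):
    """Minimal sketch: unit-cost edit distance between two sequences."""
    previous = list(range(len(b) + 1))
    for i, item_a in enumerate(a, start=1):
        current = [i]
        for j, item_b in enumerate(b, start=1):
            current.append(min(previous[j] + 1,                        # deletion
                               current[j - 1] + 1,                     # insertion
                               previous[j - 1] + (item_a != item_b)))  # substitution
        previous = current
    return previous[-1]

With this sketch, levenshtein("kitten", "kitchen") returns 2, matching the unit test in Example #27.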
Example #2
def dl_fna(species_name):
    """Dl fna if necessary, return filename"""
    accession = dl_gbk(species_name)
    print "accession:",accession
    fna_name = accession + ".fna"
    print "fna_name:",fna_name
    target_path = os.path.join("data",species_name,fna_name)
    if os.path.isfile(target_path):
        print "found fna:",target_path
        return target_path
    print "didn't find fna for:",species_name,"downloading"
    host.chdir('/genomes/Bacteria/')
    dir_list = host.listdir(host.curdir)
    sorted_dir_list = sorted(dir_list,key=lambda fname:levenshtein(species_name,fname))
    for dir_name in sorted_dir_list:
        print "trying:",dir_name
        try:
            host.chdir('/genomes/Bacteria/' + dir_name + '/')
            sub_dir_list = host.listdir(host.curdir)
            if find(lambda name:name.startswith(accession),sub_dir_list):
                host.download(fna_name,target_path)
                return target_path
        except:
            continue
    print "Couldn't find fna for:",species_name
    return None
Example #3
    def correct(self, string):
        if (len(string) - self.max_length) > self.max_distance:
            return []

        corrections_dict = {}
        min_correct_len = float("inf")
        queue = sorted(
            list(
                set([string] +
                    utils.generate_deletes(string, self.max_distance))),
            key=len,
            reverse=True,
        )

        while len(queue) > 0:
            q_item = queue.pop(0)

            if (len(corrections_dict) > 0) and (
                (len(string) - len(q_item)) > min_correct_len):
                break

            if (q_item in self.dictionary) and (q_item
                                                not in corrections_dict):
                if self.dictionary[q_item][1] > 0:
                    corrections_dict[q_item] = (
                        self.dictionary[q_item][1],
                        len(string) - len(q_item),
                    )
                    if len(string) == len(q_item):
                        break

                    elif (len(string) - len(q_item)) < min_correct_len:
                        min_correct_len = len(string) - len(q_item)

                for sc_item in self.dictionary[q_item][0]:
                    if sc_item not in corrections_dict:
                        if len(q_item) == len(string):
                            item_dist = len(sc_item) - len(q_item)

                        item_dist = utils.levenshtein(sc_item, string)

                        if item_dist > min_correct_len:
                            pass

                        elif item_dist <= self.max_distance:
                            corrections_dict[sc_item] = (
                                self.dictionary[sc_item][1],
                                item_dist,
                            )
                            if item_dist < min_correct_len:
                                min_correct_len = item_dist

                        corrections_dict = {
                            k: v
                            for k, v in corrections_dict.items()
                            if v[1] <= min_correct_len
                        }

        return corrections_dict
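Example #3 also relies on a utils.generate_deletes helper, the SymSpell-style candidate-generation step, which is not shown here. A hedged sketch of what such a helper might look like, assuming it returns every string reachable from the input by deleting up to max_distance characters:

def generate_deletes(word, max_distance):
    """Sketch: strings reachable from `word` by 1..max_distance character deletions."""
    deletes = set()
    frontier = {word}
    for _ in range(max_distance):
        next_frontier = set()
        for candidate in frontier:
            for i in range(len(candidate)):
                next_frontier.add(candidate[:i] + candidate[i + 1:])
        deletes.update(next_frontier)
        frontier = next_frontier
    return list(deletes)

For instance, generate_deletes("the", 1) yields the three single-deletion variants "he", "te", and "th".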
Example #4
def check_vendor(vendor_name, vendor_email) -> Optional[str]:
  for vr in existing_vendor_list:
    vn = vr["vendor_name"]
    if vn not in vr["alias"]:
      vr["alias"].append(vn)
    for v in vr["alias"]:
      if levenshtein(vendor_name, v, ignore_case=True) < 2:
        return vn
  return None
Example #5
def get_vendor_record(vendor_name, vendor_email) -> Optional[dict]:
  for vr in existing_vendor_list:
    vn = vr["vendor_name"]
    if vn not in vr["alias"]:
      vr["alias"].append(vn)
    for v in vr["alias"]:
      if levenshtein(vendor_name, v, ignore_case=True) < 2:
        return vr
  return None
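Examples #4 and #5 pass an ignore_case=True keyword to levenshtein, which a plain two-argument implementation (such as the sketch after Example #1) does not accept. If that flag is unavailable, a small hedged wrapper gives the same case-insensitive behaviour:

def levenshtein_ignore_case(a, b):
    # Lower-case both strings before measuring the distance.
    return levenshtein(a.lower(), b.lower())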
Example #6
def get_metropolitan_index(cname):
	cname = cname.lower()
	for suffix in ("utara", "selatan", "timor", "barat"):
		cname = cname.replace(' ' + suffix, '')
	results = sorted([(levenshtein(name, cname), name, index) for name, population, index in cities_and_towns])
	if results[0][0] < 2:
		return max(results[0][2], 2), results[0][1]
	else:
		return 2, ''
Example #7
    def eval(self, data, max_iter=np.inf):
        data_loader = torch.utils.data.DataLoader(data, batch_size=self.opt.batchSize,
                                                num_workers=int(self.opt.workers),
                                                pin_memory=True,
                                                collate_fn=dataset.collatedict())
        self.model.eval()
        gts = []
        decoded_preds = []
        val_iter = iter(data_loader)
        tc = 0
        wc = 0
        ww = 0
        tw = 0
        loss_avg = utils.averager()
        max_iter = min(max_iter, len(data_loader))
        with torch.no_grad():
            # print('-------Current LR-----')
            # for param_group in self.optimizer.param_groups:
            #     print(param_group['lr'])
            # print('---------------------')
            for i in range(max_iter):
                if self.opt.mode == 'test':
                    print('%d / %d' % (i, len(data_loader)), end='\r')
                output_dict = self.forward_sample(next(val_iter))
                batch_size = output_dict['batch_size']
                preds = F.log_softmax(output_dict['probs'], 2)
                preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
                cost = self.get_loss({'preds': preds, 'batch_size': batch_size,
                                      'preds_size': preds_size, 'params':output_dict['params']})
                loss_avg.add(cost)
                decoded_pred = self.decoder(preds, preds_size)
                gts += list(output_dict['gt'])
                decoded_preds += list(decoded_pred)

        if self.mode == "train":
            pcounter = 0
            for target, pred in zip(gts, decoded_preds):
                if pcounter < 5:
                    print('Gt:   ', target)
                    print('Pred: ', pred)
                    pcounter += 1
                if target!=pred:
                    ww += 1
                tw += 1
                wc += utils.levenshtein(target, pred)
                tc += len(target)
            wer = (ww / tw)*100
            cer = (wc / tc)*100
            return loss_avg, cer, wer
        else:
            f = open(self.opt.out, 'w')
            for target, pred in zip(gts, decoded_preds):
                f.write('{}\n{}\n'.format(pred, target))
            f.close()
            print('Generated predictions for {} samples'.format(self.test_data.nSamples))
        return
Example #8
def get_wer(refs, hyps):

    assert len(refs) == len(hyps)
    total_wer = 0.0
    total_tokens = 0
    for ref, hyp in zip(refs, hyps):
        total_wer += levenshtein(ref.split(), hyp.split())
        total_tokens += len(ref.split())

    return total_wer / total_tokens
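get_wer splits each reference and hypothesis into tokens, so the edit distance counts word-level errors, and the result is the total number of edits divided by the total number of reference tokens. A small worked usage, assuming a levenshtein that accepts token lists (as in the sketch after Example #1):

refs = ["the cat sat on the mat", "hello world"]
hyps = ["the cat sit on mat", "hello word"]
# First pair: one substitution (sat -> sit) and one deletion (the) = 2 edits over 6 tokens.
# Second pair: one substitution (world -> word) = 1 edit over 2 tokens.
print(get_wer(refs, hyps))  # 3 edits / 8 tokens = 0.375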
Example #9
def soundmatch(sounda, soundb, maxpoints):
    """Given two sounds (suffixes or prefixes), calculate a score for their
    similarity."""

    ## if they're both empty, there's nothing to compare
    if len(sounda) == 0 and len(soundb) == 0:
        return 0

    frac = utils.levenshtein(sounda, soundb) / max(len(sounda), len(soundb))
    return maxpoints * (1 - frac)
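soundmatch normalises the edit distance by the length of the longer sound, so identical sounds earn the full maxpoints and completely different sounds of equal length earn 0. A small worked example (Python 3 division, unit-cost levenshtein):

# levenshtein("ing", "ink") == 1 and the longer input has length 3,
# so frac == 1/3 and the score is maxpoints * (1 - 1/3).
print(soundmatch("ing", "ink", 9))  # 6.0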
Example #10
def soundmatch(sounda, soundb, maxpoints):
    """Given two sounds (suffixes or prefixes), calculate a score for their
    similarity."""

    ## if they're both empty, there's nothing to compare
    if len(sounda) == 0 and len(soundb) == 0:
        return 0

    frac = utils.levenshtein(sounda, soundb) / max(len(sounda), len(soundb))
    return maxpoints * (1 - frac)
Example #11
def expand(sentence):
    sen, abbr, and_pos = clean_abbr(sentence)
    expand = ''
    if len(dic[abbr]) == 0:
        return "null", -1
    if len(dic[abbr]) == 1:
        expand = dic[abbr][0]
        expand = insert_va(expand, and_pos)
        return expand, 0
    if len(dic[abbr]) >= 2:
        pred, score, time = evaluate(sen)
        tmp = len(pred)
        for item in dic[abbr]:
            if levenshtein(item, pred) < tmp:
                expand = item
                tmp = levenshtein(item, pred)
        if tmp > 2:
            expand = "null"
        expand = insert_va(expand, and_pos)
        return expand, score.item()
Example #12
def get_emoji(ctx, expression: str):
    """Doesn't really work that well."""
    bot = ctx.bot
    try:
        return bot.get_emoji(int(expression))
    except Exception:
        pass
    expression = expression.strip(":").lower()
    for closeness in range(0, 4):
        for emoji in bot.emojis:
            if utils.levenshtein(expression, emoji.name.lower()) == closeness:
                return emoji
Example #13
def spell_correction(misspelled_word: str):
    my_dict = mapping(soundex(misspelled_word))
    for k in my_dict.keys():
        my_dict[k] = levenshtein(misspelled_word, k)

    sorted_values = sorted(my_dict.values())
    out = ''

    for i in my_dict.keys():
        if (my_dict[i] == sorted_values[0]):
            out = f'{out},{i}'

    return (out[1:])
Example #14
def printSearch(search_category, search_package, maxDist=2):
    installable = portage.PortageInstance.getInstallables()
    similar = []
    match = None
    package_re = re.compile(".*%s.*" % search_package, re.IGNORECASE)
    for _p in installable:
        if search_category == "" or search_category == _p.category:
            package = portage.PortageInstance.getPackageInstance(
                _p.category, _p.package)
            if not package:
                continue
            levDist = utils.levenshtein(search_package.lower(),
                                        package.package.lower())
            if levDist == 0:
                match = (levDist, package)
                break
            elif package_re.match(package.package):
                similar.append((levDist - maxDist, package))
            elif len(package.package) > maxDist and levDist <= maxDist:
                similar.append((levDist, package))
            else:
                if package_re.match(package.subinfo.shortDescription):
                    similar.append((100, package))

    if match == None:
        if len(similar) > 0:
            print("Emerge was unable to find %s, similar packages are:" %
                  search_package)
            similar.sort(key=lambda x: x[0])
        else:
            print("Emerge was unable to find %s" % search_package)
    else:
        print("Package %s found:" % search_package)
        similar = [match]

    for levDist, package in similar:
        EmergeDebug.debug((package, levDist), 1)
        print(package)
        print("\t Homepage: %s" % package.subinfo.homepage)
        print("\t Description: %s" % package.subinfo.shortDescription)
        print("\t Latest version: %s" % package.subinfo.defaultTarget)
        installed = False
        for pack in InstallDB.installdb.getInstalledPackages(
                package.category, package.package):
            if pack.getVersion():
                installed = True
                print("\t Installed versions: %s" % pack.getVersion())
            if pack.getRevision():
                print("\t Installed revision: %s" % pack.getRevision())
        if not installed:
            print("\t Installed versions: None")
Example #15
def keywords(text, n=15):
    """
    extract the most relevant keywords from the given text
    steps:
    1. tokenize the text by words
    2. apply a syntactic filter (keep nouns and adjectives)
    3. compute the pairwise Levenshtein distance
    4. build a graph weighted by those distances
    5. compute PageRank
    
    - text: string consisting of a few sentences
    - n: number of keywords to extract
    """
    import nltk
    syntactic_filter = ['NN', 'JJ']

    # tokenizing by words
    words = word_tokenize(text)

    # pos-tagging
    tagged = nltk.pos_tag(words)

    # applying the syntactic filter
    filtered = [i[0].lower() for i in tagged if i[1] in syntactic_filter]

    # pairwise combinations
    pairs = list(combinations(filtered, 2))

    # compute distance between every pair and set it as weight of graph edge
    weighted_edges = []

    for i in range(len(pairs)):
        # distance define as weight of edge
        weight = utils.levenshtein(pairs[i][0], pairs[i][1])
        weighted_edges.append((pairs[i][0], pairs[i][1], weight))

    # create graph
    G = nx.Graph()
    G.add_weighted_edges_from(weighted_edges)

    # calculate pagerank
    pr = nx.pagerank(G, alpha=0.85)

    # dict of TextRank ranking of levenshtein distance matrix
    ranking = Counter(pr)

    # top n keywords
    keywords, scores = list(zip(*ranking.most_common(n)))

    return keywords, scores
Example #16
 def calcsimilarity(known, table, id1, id2, comparison):
     tokens = re.split("[/,;]", comparison[1])
     ret = False
     for j in xrange(0, len(known)):
         if i == j:
             continue
         if comparison[2] == known[j][2]:
             similarity = 1.0
         else:
             compared_genre = re.split("[/,;]", known[j][1])
             distance = {}
             sametags = 0
             for a in tokens:
                 if not a:
                     continue
                 for b in compared_genre:
                     if not b or b in distance:
                         continue
                     if len(a) == len(b):
                         h = hamming(a, b) / float(len(a))
                         if h:
                             distance[b] = h
                         else:
                             sametags = sametags + 1
                     else:
                         distance[b] = levenshtein(a, b) /  \
                             float(max(len(a), len(b)))
             if distance:
                 # geometric mean + weighted equal tags
                 similarity = 1.0 - (
                     reduce(lambda x, y: x * y, distance.values())) ** \
                     (1.0 / len(distance)) + \
                     (sametags / (sametags + len(distance)))
             else:
                 similarity = 0.0
         if similarity > 0.33:
             if not db.execute(
                     "select * from %s  where %s  = ? and %s = ?" %
                 (table, id1, id2),
                 (comparison[0], known[j][0])).fetchall():
                 db.execute(
                     "insert or ignore into %s "
                     "(%s, %s, similarity) values ( ?, ?, ?)" %
                     (table, id1, id2),
                     (comparison[0], known[j][0], similarity))
                 ret = True
     return ret
Example #17
def printSearch(search_category, search_package,maxDist = 2):
        installable = portage.PortageInstance.getInstallables()
        similar = []
        match = None
        package_re = re.compile(".*%s.*" % search_package, re.IGNORECASE)
        for _p in installable:
            if search_category == "" or search_category == _p.category:
                package = portage.PortageInstance.getPackageInstance( _p.category, _p.package)
                if not package:
                    continue
                levDist = utils.levenshtein(search_package.lower(),package.package.lower())
                if levDist == 0 :
                    match = (levDist,package)
                    break
                elif package_re.match(package.package):
                    similar.append((levDist-maxDist,package))
                elif len(package.package)>maxDist and levDist <= maxDist:
                    similar.append((levDist,package))
                else:
                    if package_re.match(package.subinfo.shortDescription):
                        similar.append((100,package))

        if match == None:
            if len(similar)>0:
                print("Emerge was unable to find %s, similar packages are:" % search_package) 
                similar.sort( key = lambda x: x[0])
            else:
                print("Emerge was unable to find %s" % search_package)
        else:
            print("Package %s found:" % search_package)
            similar = [match]
        
        for levDist,package in similar:
            EmergeDebug.debug((package, levDist), 1)
            print(package)
            print("\t Homepage: %s" % package.subinfo.homepage)
            print("\t Description: %s" % package.subinfo.shortDescription)
            print("\t Latest version: %s" % package.subinfo.defaultTarget)
            installed = False
            for pack in InstallDB.installdb.getInstalledPackages(package.category,package.package):
                if pack.getVersion():
                    installed = True
                    print("\t Installed versions: %s" % pack.getVersion())
                if pack.getRevision():
                    print("\t Installed revision: %s" % pack.getRevision())
            if not installed:
                print("\t Installed versions: None")
Example #18
 def calcsimilarity(known, table, id1, id2, comparison):
     tokens = re.split("[/,;]", comparison[1])
     ret = False
     for j in xrange(0, len(known)):
         if i == j:
             continue
         if comparison[2] == known[j][2]:
             similarity = 1.0
         else:
             compared_genre = re.split("[/,;]", known[j][1])
             distance = {}
             sametags = 0
             for a in tokens:
                 if not a:
                     continue
                 for b in compared_genre:
                     if not b or b in distance:
                         continue
                     if len(a) == len(b):
                         h = hamming(a, b) / float(len(a))
                         if h:
                             distance[b] = h
                         else:
                             sametags = sametags + 1
                     else:
                         distance[b] = levenshtein(a, b) /  \
                             float(max(len(a), len(b)))
             if distance:
                 # geometric mean + weighted equal tags
                 similarity = 1.0 - (
                     reduce(lambda x, y: x * y, distance.values())) ** \
                     (1.0 / len(distance)) + \
                     (sametags / (sametags + len(distance)))
             else:
                 similarity = 0.0
         if similarity > 0.33:
             if not db.execute(
                 "select * from %s  where %s  = ? and %s = ?" %
                 (table, id1, id2),
                     (comparison[0], known[j][0])).fetchall():
                 db.execute(
                     "insert or ignore into %s "
                     "(%s, %s, similarity) values ( ?, ?, ?)" %
                     (table, id1, id2),
                     (comparison[0], known[j][0], similarity))
                 ret = True
     return ret
Example #19
def printSearch(search_category, search_package,maxDist = 2):
        installable = portage.PortageInstance.getInstallables()
        similar = []
        match = None
        package_re = re.compile(".*%s.*" % search_package.lower())
        for category,package,version in installable:
            if search_category == "" or search_category == category:
                meta = portage.PortageInstance.getMetaData( category, package, version )
                levDist = utils.levenshtein(search_package.lower(),package.lower())
                if levDist == 0 :
                    match = (levDist,category,package,version,meta)
                    break
                elif package_re.match(package.lower()):
                    similar.append((levDist-maxDist,category,package,version,meta))
                elif len(package)>maxDist and levDist <= maxDist:
                    similar.append((levDist,category,package,version,meta))
                else:
                    if "shortDescription" in meta:
                        if package_re.match(meta["shortDescription"].lower()):                        
                            similar.append((100,category,package,version,meta))
                
        if match == None:
            if len(similar)>0:
                print("Emerge was unable to find %s, similar packages are:" % search_package) 
                similar.sort()
            else:
                print("Emerge was unable to find %s" % search_package)
        else:
            print("Package %s found:" % search_package)
            similar = [match]
        
        for levDist,category,package,version,meta in similar:
            utils.debug((category,package,version,levDist),1)
            description = ""
            if "shortDescription" in meta:
                description = meta["shortDescription"]
            homepage = ""
            if "homepage" in meta:
                homepage = meta["homepage"]
            #print(levDist)
            print("%s/%s" % (category,package))
            print("\t Homepage: %s" % homepage)
            print("\t Description: %s" % description)
            print("\t Latest version: %s" % version)
            print("\t Installed version: %s" % InstallDB.installdb.findInstalled(category,package))
Example #20
 def test(self):
     with self.sess.as_default():
         example_count = 0
         total_error = 0
         batch_x, batch_y, batch_length = self.data.get_next_test_batch(
             self.batch_size)
         data_targets = sparse_tuple_from(batch_y)
         predict_str = self.sess.run([self.decoded],
                                     feed_dict={
                                         self.inputs: batch_x,
                                         self.seq_len: batch_length
                                     })
         example_count += len(batch_y)
         total_error += np.sum(
             levenshtein(ground_truth_to_word(batch_y),
                         ground_truth_to_word(predict_str)))
         print("Error on test set: {}".format(total_error / example_count))
     return None
Example #21
    def getByDistance(self, name, tolerance = 10):
        """Returns version if there is a version within Levenshtein distance of
        'tolerance' parameter for 'name' parameter. Nearest version is returned.
        Comparison is done case-insensitively. First appearing in versions.xml
        is chosen on tie. None is returned if no version is in given distance.
        """
        nearest = None
        minDistance = 999

        for version in self.versions:
            n1 = version.name.lower()
            n2 = name.lower()
            l_distance = levenshtein(n1, n2)

            if l_distance < tolerance and l_distance < minDistance:
                minDistance = l_distance
                nearest = version
        return nearest
Example #22
    def getByDistance(self, name, tolerance=10):
        """Returns version if there is a version within Levenshtein distance of
        'tolerance' parameter for 'name' parameter. Nearest version is returned.
        Comparison is done case-insensitively. First appearing in versions.xml
        is chosen on tie. None is returned if no version is in given distance.
        """
        nearest = None
        minDistance = 999

        for version in self.versions:
            n1 = version.name.lower()
            n2 = name.lower()
            l_distance = levenshtein(n1, n2)

            if l_distance < tolerance and l_distance < minDistance:
                minDistance = l_distance
                nearest = version
        return nearest
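getByDistance is a nearest-neighbour lookup: every known version name is compared case-insensitively against the query, the closest one within tolerance wins, and the first occurrence wins ties because only strictly smaller distances replace the current best. The same idea as a hedged, standalone function over plain strings:

def nearest_name(names, query, tolerance=10):
    """Sketch: case-insensitive nearest match within `tolerance`; first occurrence wins ties."""
    best, best_distance = None, tolerance
    for name in names:
        distance = levenshtein(name.lower(), query.lower())
        if distance < best_distance:
            best, best_distance = name, distance
    return best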
Example #23
def printSearch(search_package, maxDist=2):
    searchPackageLower = search_package.lower()
    isPath = "/" in searchPackageLower
    with CraftTimer.Timer("Search", 0) as timer:
        similar = []
        match = None
        package_re = re.compile(f".*{search_package}.*", re.IGNORECASE)
        for searchPackage in packages():
            packageString = searchPackage.path if isPath else searchPackage.name
            levDist = abs(len(searchPackageLower) - len(packageString))
            if levDist <= maxDist:
                levDist = utils.levenshtein(searchPackageLower,
                                            packageString.lower())
            if levDist == 0:
                match = (levDist, searchPackage)
                break
            elif package_re.match(searchPackage.path):
                similar.append((levDist - maxDist, searchPackage))
            elif len(packageString) > maxDist and levDist <= maxDist:
                similar.append((levDist, searchPackage))
            else:
                if package_re.match(searchPackage.description) or \
                        package_re.match(searchPackage.tags):
                    similar.append((100, searchPackage))

        if match is None:
            if len(similar) > 0:
                CraftCore.log.info(
                    f"Craft was unable to find {search_package}, similar packages are:"
                )
                similar.sort(key=lambda x: x[0])
            else:
                CraftCore.log.info(
                    f"Craft was unable to find {search_package}")
        else:
            CraftCore.log.info(f"Package {search_package} found:")
            similar = [match]

        for levDist, searchPackage in similar:
            CraftCore.log.debug((vars(searchPackage), levDist))
            CraftCore.log.info(searchPackage)
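Example #23 only runs the full Levenshtein computation when the length difference is already within maxDist, because the absolute length difference is a lower bound on the edit distance. That pruning pattern as a minimal sketch (reusing the levenshtein sketch after Example #1):

def within_distance(a, b, max_dist):
    # The length difference is a cheap lower bound on the edit distance,
    # so most non-matches are rejected without running the full dynamic program.
    if abs(len(a) - len(b)) > max_dist:
        return False
    return levenshtein(a, b) <= max_dist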
Example #24
def pois_v1():
    global _db

    filter_s = request.query.get('filter', None)
    if filter_s is None:
        abort(501, "Unfiltered searches not allowed.")
    filter_s = unicode(filter_s, encoding='utf-8')

    result = search(database=_db, verbose=False, query=filter_s)

    municipality = request.query.get('municipality', None)
    if municipality:
        if municipality.lower() in municipalities_set:
            municipality_key = None
            for k, v in municipalities.iteritems():
                if v.lower() == municipality.lower():
                    municipality_key = k
                    break
            if not municipality_key:
                abort(501, "Unknown municipality: %s." % municipality)
            else:
                result = [r for r in result if r['municipality_id'] == municipality_key]


    try:
        result_count = int(request.query.get('resultcount', -1))

        if int(result_count) != -1:
            for r in result:
                distance = levenshtein(filter_s, r['name'])
                r['edit_distance'] = distance

            result.sort(key=lambda x: x['edit_distance'])
            result = list(islice(result, result_count))
    except:
        abort(501, "Cannot parse resultcount:%s." % request.query.get('resultcount'))

    response.content_type = 'application/json'
    return json.dumps(result, ensure_ascii=False)
Example #25
 def test(self):
     with self.__session.as_default():
         print('Testing')
         total_error = 0
         example_count = 0
         for batch_y, batch_sl, batch_x in self.__data_manager.get_next_test_batch(
         ):
             data_targets = np.asarray([
                 label_to_array(lbl, config.CHAR_VECTOR) for lbl in batch_y
             ])
             data_targets = sparse_tuple_from(data_targets)
             decoded = self.__session.run([self.__decoded],
                                          feed_dict={
                                              self.__inputs: batch_x,
                                              self.__seq_len: batch_sl
                                          })
             example_count += len(batch_y)
             total_error += np.sum(
                 levenshtein(ground_truth_to_word(batch_y),
                             ground_truth_to_word(decoded)))
         print('Error on test set: {}'.format(total_error,
                                              total_error / example_count))
     return None
Example #26
def main():

    model = '../models/output_graph.pbmm'
    alphabet = '../models/alphabet.txt'
    lm = '../models/lm.binary'
    trie = '../models/trie'

    samples = 200

    # '/Volumes/Seagate/Dataset/Coffee Shop/snr0/LibriSpeech/test_clean/wav'
    snr = 20

    audiofolder = '/Volumes/Seagate/Dataset/Coffee Shop/snr' + str(
        snr) + '/librispeech_orig_cropped/test_clean/wav'
    transcription_folder = '/Users/shibozhang/Documents/Course/DeepLearningTopics_496/dataset/Coffee Shop/snr' + str(
        snr) + '/LibriSpeech/test_clean/transcripts/'

    reference_folder = '/Users/shibozhang/Documents/Course/DeepLearningTopics_496/dataset/LibriSpeech_dataset/raw/test_clean/txt/'
    result_file = '/Users/shibozhang/Documents/Course/DeepLearningTopics_496/dataset/results.txt'

    if not os.path.exists(transcription_folder):
        os.makedirs(transcription_folder)

    audio_files = list_files_in_directory(audiofolder)
    print('number of audio clips: ', str(len(audio_files)))
    random.shuffle(audio_files)
    audio_files = audio_files[0:samples]

    audio_list = [os.path.join(audiofolder, i) for i in audio_files]
    savefiles = [
        os.path.join(transcription_folder, i[:-4] + '.txt')
        for i in audio_files
    ]

    print('Loading model from file {}'.format(model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if lm and trie:
        print('Loading language model from files {} {}'.format(lm, trie),
              file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end),
              file=sys.stderr)

    for audio, savefile in zip(audio_list, savefiles):
        fin = wave.open(audio, 'rb')
        fs = fin.getframerate()
        if fs != 16000:
            print(
                'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
                .format(fs),
                file=sys.stderr)
            fs, audio = convert_samplerate(audio)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1 / 16000)
        fin.close()
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        transcription = ds.stt(audio, fs)
        print(transcription)
        textfile = open(savefile, 'w')
        textfile.write(transcription)
        textfile.close()

        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' %
              (inference_end, audio_length),
              file=sys.stderr)
        input_source = 'files'

    # input_source = 'str'
    separator = '\t'
    encoding = 'utf-8'

    references = [
        os.path.join(reference_folder, i)
        for i in list_files_in_directory(transcription_folder)
    ]
    transcriptions = [
        os.path.join(transcription_folder, i[:-4] + '.txt')
        for i in list_files_in_directory(transcription_folder)
    ]

    print(len(references))
    print(len(transcriptions))
    # exit()
    # references, transcriptions = _intersection(references, transcriptions)

    ref, hyp = [], []

    if input_source == 'str':
        ref.append(reference.decode(encoding))
        hyp.append(transcription.decode(encoding))
    elif input_source == '-':
        line_n = 0
        for line in sys.stdin:
            line_n += 1
            line = line.rstrip('\n').rstrip('\r').decode(encoding)
            fields = line.split(separator)
            if len(fields) != 2:
                logging.warning('Line %d has %d fields but 2 were expected',
                                line_n, len(fields))
                continue
            ref.append(fields[0])
            hyp.append(fields[1])
    elif input_source == 'file':
        ref = load_file(reference, encoding)
        hyp = load_file(transcription, encoding)
        if len(ref) != len(hyp):
            logging.error(
                'The number of reference and transcription sentences does not '
                'match (%d vs. %d)', len(ref), len(hyp))
            exit(1)
    elif input_source == 'files':
        ref = load_file_batch(references, encoding)
        hyp = load_file_batch(transcriptions, encoding)
        if len(ref) != len(hyp):
            logging.error(
                'The number of reference and transcription sentences does not '
                'match (%d vs. %d)', len(ref), len(hyp))
            exit(1)
    else:
        logging.error('INPUT FROM "%s" NOT IMPLEMENTED', input_source)
        exit(1)

    wer_s, wer_i, wer_d, wer_n = 0, 0, 0, 0
    cer_s, cer_i, cer_d, cer_n = 0, 0, 0, 0
    sen_err = 0
    for n in range(len(ref)):
        if n % 100 == 0:
            print('processing {}'.format(n))
        # update CER statistics
        _, (s, i, d) = levenshtein(ref[n], hyp[n])
        cer_s += s
        cer_i += i
        cer_d += d
        cer_n += len(ref[n])
        # update WER statistics
        _, (s, i, d) = levenshtein(ref[n].split(), hyp[n].split())
        wer_s += s
        wer_i += i
        wer_d += d
        wer_n += len(ref[n].split())
        # update SER statistics
        if s + i + d > 0:
            sen_err += 1

    if cer_n > 0:
        print('CER: %g%%, WER: %g%%, SER: %g%%' %
              ((100.0 * (cer_s + cer_i + cer_d)) / cer_n,
               (100.0 * (wer_s + wer_i + wer_d)) / wer_n,
               (100.0 * sen_err) / len(ref)))
        # save results
        textfile = open(result_file, 'a')
        textfile.write('\n\n' + transcription_folder)
        textfile.write('\nCER: %g%%, WER: %g%%, SER: %g%%' %
                       ((100.0 * (cer_s + cer_i + cer_d)) / cer_n,
                        (100.0 * (wer_s + wer_i + wer_d)) / wer_n,
                        (100.0 * sen_err) / len(ref)))
        textfile.close()
Example #27
 def test_levenshtein(self):
     word1 = "kitten"
     word2 = "kitchen"
     self.assertEqual(levenshtein(word1, word2), 2)
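The unit test above checks a single pair; a few additional hedged assertions exercise the usual edit-distance properties (identity, symmetry, and pure insertion):

 def test_levenshtein_properties(self):
     # Distance from a string to itself is zero.
     self.assertEqual(levenshtein("kitten", "kitten"), 0)
     # The distance is symmetric in its arguments.
     self.assertEqual(levenshtein("flaw", "lawn"), levenshtein("lawn", "flaw"))
     # Pure insertions: the distance equals the number of added characters.
     self.assertEqual(levenshtein("", "abc"), 3)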
Example #28
	def generate(self, question, babelNetCache):
		#min_d = sys.maxsize
		Q = []
		l = []
		for r in self.question_patterns.relation_to_questions:
			for q_p in self.question_patterns[r]:
				d = utils.levenshtein(question, q_p)
				q_p_pos = len(l)
				for k in range(len(l)):
					if d <= l[k]:
						q_p_pos = k
						break
				Q.insert(q_p_pos, q_p)
				l.insert(q_p_pos, d)

		# Consider first best T matches:
		T = len(Q)

		for q in Q[:T]:
			#print(q)
			Xpos = q.find("X")
			Ypos = q.find("Y")
			
			if Xpos != -1 and Ypos != -1:
			
				#print("BOTH_X_Y")
				
				# Case -- X -- Y --?
				if Xpos < Ypos:
					beforeX = q[:Xpos]
					afterX = q[Xpos+1:Ypos]
					afterY = q[Ypos+1:]
				
					conceptX_begin_idx = -1
					pp_afterx = question[Xpos:].find(afterX)
					if pp_afterx == -1:
						continue
					conceptX_end_idx = Xpos + pp_afterx
					conceptY_begin_idx = -1
					conceptY_end_idx = question.find(afterY)
				
					if question.find(beforeX) != -1:
						conceptX_begin_idx = Xpos # = len(beforeX)
					conceptY_begin_idx = conceptX_end_idx + len(afterX)
				
					#print("CONCEPTX_BEGIN_IDX:", conceptX_begin_idx)
					#print("CONCEPTX_END_IDX:", conceptX_end_idx)
					#print("CONCEPTY_BEGIN_IDX:", conceptY_begin_idx)
					#print("CONCEPTY_END_IDX:", conceptY_end_idx)
			
				# Case -- Y -- X --?
				else:
					beforeY = q[:Ypos]
					afterY = q[Ypos+1:Xpos]
					afterX = q[Xpos+1:]
					
					conceptY_begin_idx = -1
					pp_aftery = question[Ypos:].find(afterY)
					if pp_aftery == -1:
						continue
					conceptY_end_idx = Ypos + pp_aftery
					conceptX_begin_idx = -1
					conceptX_end_idx = question.find(afterX)
				
					if question.find(beforeY) != -1:
						conceptY_begin_idx = Ypos # = len(beforeY)
					conceptX_begin_idx = conceptY_end_idx + len(afterY)
				
					#print("CONCEPTY_BEGIN_IDX:", conceptY_begin_idx)
					#print("CONCEPTY_END_IDX:", conceptY_end_idx)
					#print("CONCEPTX_BEGIN_IDX:", conceptX_begin_idx)
					#print("CONCEPTX_END_IDX:", conceptX_end_idx)

				if conceptX_begin_idx == -1 or conceptX_end_idx == -1 or conceptY_begin_idx == -1 or conceptY_end_idx == -1:
					continue
				
				conceptX = question[conceptX_begin_idx:conceptX_end_idx].lower()
				conceptY = question[conceptY_begin_idx:conceptY_end_idx].lower()
				#print("conceptX:", conceptX)
				#print("conceptY:", conceptY)
			
			# Only X in the question:
			elif Ypos == -1:
				beforeX = q[:Xpos]
				afterX = q[Xpos+1:]
				
				concept_begin_idx = -1
				concept_end_idx = -1
				
				if question.find(beforeX) != -1:
					concept_begin_idx = Xpos # = len(beforeX)
				if question.find(afterX) != -1:
					concept_end_idx = len(question) - len(afterX)
				
				#print("ONLY_X")
				#print("CONCEPT_BEGIN_IDX:", concept_begin_idx)
				#print("CONCEPT_END_IDX:", concept_end_idx)

				if concept_begin_idx == -1 or concept_end_idx == -1:
					continue
				
				conceptX = question[concept_begin_idx:concept_end_idx].lower()
				#print("conceptX:", conceptX)

			# Only Y in the question:
			elif Xpos == -1:
				beforeY = q[:Ypos]
				afterY = q[Ypos+1:]
				
				concept_begin_idx = -1
				concept_end_idx = -1
				
				if question.find(beforeY) != -1:
					concept_begin_idx = Ypos
				if question.find(afterY) != -1:
					concept_end_idx = len(question) - len(afterY)
				
				#print("ONLY_Y")
				#print("CONCEPT_BEGIN_IDX:", concept_begin_idx)
				#print("CONCEPT_END_IDX:", concept_end_idx)
				
				if concept_begin_idx == -1 or concept_end_idx == -1:
					continue
					
				conceptY = question[concept_begin_idx:concept_end_idx].lower()
				#print("conceptY:", conceptY)
				
			for elem in self.knowledgeBase.kb:
				matchX = False
				matchY = False
			
				# X in the question:
				if Xpos != -1:
					c1 = elem["c1"]
					if c1.count("bn:") >= 2:
						pass
					elif "::" in c1:
						idx = c1.index("::")
						w = c1[:idx].lower()
						if conceptX == w:
							matchX = True
					elif "bn:" in c1:
						try:
							bn_conceptx = babelNetCache.cache[c1[c1.index("bn:"):]].lower()
							#print("bn_conceptx:", bn_conceptx)
							if conceptX == bn_conceptx:
								matchX = True
						except:
							pass
					elif c1.lower() == conceptX:
						matchX = True

				# Y in the question:
				if Ypos != -1:
					c2 = elem["c2"]
					if c2.count("bn:") >= 2:
						pass
					elif "::" in c2:
						idx = c2.index("::")
						w = c2[:idx].lower()
						if conceptY == w:
							matchY = True
					elif "bn:" in c2:
						try:
							bn_concepty = babelNetCache.cache[c2[c2.index("bn:"):]].lower()
							#print("bn_concepty:", bn_concepty)
							if conceptY == bn_concepty:
								matchY = True
						except:
							pass
					elif c2.lower() == conceptY:
						matchY = True

				if Xpos != -1 and Ypos != -1:
					if matchX == True and matchY == True:
						#print("XY - Match found with:")
						#print(elem)
						return elem["answer"]
				elif matchX == True or matchY == True:
					#print("Match found with:")
					#print(elem)
					return elem["answer"]

		return "I don't understand."
Example #29
    def run(self):

        if not self.fpcalc:
            return

        logging.debug("fpcalc: %s" % self.fpcalc)

        self.db = dbapi.connect(self.dbpath)
        # lastrelease = ""
        lastdata = []
        lastquery = ""
        laststatus = 0
        starttime = time()
        stoptime = starttime + 1
        requests = 0
        while self.running:
            try:
                path, title, artist, album = self.queue.get()
            except Empty as e:
                logging.warning(e)
                continue
            except Exception as e:
                logging.error(e)
                continue
            if not path or not album:
                logging.warning("No path/album name provided")
                continue

            if requests / (stoptime - starttime) > 3:
                sleep(1)
                starttime = stoptime

            logging.info("Getting infos for %s %s" % (artist, album))
            fingerprint = ''
            duration = 0
            try:
                logging.info("Analyzing %s file" % path)
                if self.fpcalc:
                    logging.debug("fingerprint for %s" % path)
                    fpcalc_process = subprocess.Popen(
                        ["/usr/bin/fpcalc", path],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    fpcalc_output = fpcalc_process.communicate()[0].split('\n')

                    duration = fpcalc_output[1][9:]
                    fingerprint = fpcalc_output[2][12:]
            except Exception as e:
                logging.error(e)

            if fingerprint:
                query = u"/v2/lookup?" \
                    "client=8XaBELgH" \
                    "&meta=recording+releasegroups" \
                    "+tracks+puids+usermeta+compress" \
                    "&duration=%s&format=json&fingerprint=%s" % \
                    (duration, fingerprint)
                if query == lastquery and laststatus == 200:
                    logging.info("Same request already occurred - skipping")

                try:
                    conn = HTTPConnection("api.acoustid.org", 80)
                    conn.request("GET", query)
                    response = conn.getresponse()
                except:
                    continue
                puid = ""
                mb_title = ""
                mb_artists = ""
                if response.status != 200:
                    continue
                try:
                    lastquery = query
                    laststatus = 200
                    results = json.loads(response.read())
                    lastdata = results["results"][0]
                    logging.debug(lastdata)
                    release = "releasegroups" in lastdata \
                        and len(lastdata) and lastdata["releasegroups"][0]
                    recording = "recordings" in lastdata \
                        and len(lastdata) and lastdata["recordings"][0]
                    score = "score" in lastdata and lastdata['score']
                    logging.debug(release)
                    logging.debug(recording)
                    if len(lastdata):
                        logging.debug("%s results found" % len(lastdata))
                        puid = 'puids' in lastdata and lastdata["puids"][0]
                        mbid = release and release['id'] \
                            or recording \
                            and recording[0]['releasegroups'][0]['id']
                        mb_title = release and release["title"] \
                            or recording \
                            and recording[0]['title']
                        mb_artists = " ".join([
                            i['name']
                            for i in (release and release["artists"]
                                      or recording and recording[0]['artists'])
                        ])
                    logging.debug("Response status: %d %s" %
                                  (response.status, response.read()))
                except Exception as e:
                    logging.error(e)
                    continue

                stoptime = time()
                requests = (requests + 1) % 3

                if score < 0.7:
                    continue

                if len(title) == len(mb_title):
                    title_distance = hamming(title, mb_title) / float(
                        len(title))
                else:
                    title_distance = levenshtein(title, mb_title) / float(
                        max(len(title), len(mb_title)))

                if len(artist) == len(mb_artists):
                    author_distance = hamming(artist, mb_artists) / float(
                        len(artist))
                else:
                    author_distance = levenshtein(artist, mb_artists) / float(
                        max(len(artist), len(mb_artists)))

                # if title_distance > 0.33 and author_distance > 0.5:
                logging.debug("distances: %s %s %s" %
                              (score, title_distance, author_distance))
                #     continue

                logging.debug("puid: %s, mbid %s" % (puid, mbid))
                with self.condition:
                    try:
                        song_id, album_id = self.db.execute(
                            "select id, album_id from song "
                            "where path = ?;", (path, )).fetchone()
                        self.db.execute(
                            "update song set puid = ?, mbid = ? "
                            "where id = ?", (puid, mbid, song_id))
                        if title_distance > 0:
                            self.db.execute(
                                "update song set title = ? "
                                "where id = ?", (mb_title, song_id))
                        self.db.commit()
                    except Exception as e:
                        logging.error(e)
        self.db.close()
Example #30
    def run(self):

        if not self.fpcalc:
            return

        logging.debug("fpcalc: %s" % self.fpcalc)

        self.db = dbapi.connect(self.dbpath)
        # lastrelease = ""
        lastdata = []
        lastquery = ""
        laststatus = 0
        starttime = time()
        stoptime = starttime + 1
        requests = 0
        while self.running:
            try:
                path, title, artist, album = self.queue.get()
            except Empty as e:
                logging.warning(e)
                continue
            except Exception as e:
                logging.error(e)
                continue
            if not path or not album:
                logging.warning("No path/album name provided")
                continue

            if requests / (stoptime - starttime) > 3:
                sleep(1)
                starttime = stoptime

            logging.info("Getting infos for %s %s" % (artist, album))
            fingerprint = ''
            duration = 0
            try:
                logging.info("Analyzing %s file" % path)
                if self.fpcalc:
                    logging.debug("fingerprint for %s" % path)
                    fpcalc_process = subprocess.Popen(
                        ["/usr/bin/fpcalc", path],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    fpcalc_output = fpcalc_process.communicate()[0].split('\n')

                    duration = fpcalc_output[1][9:]
                    fingerprint = fpcalc_output[2][12:]
            except Exception as e:
                logging.error(e)

            if fingerprint:
                query = u"/v2/lookup?" \
                    "client=8XaBELgH" \
                    "&meta=recording+releasegroups" \
                    "+tracks+puids+usermeta+compress" \
                    "&duration=%s&format=json&fingerprint=%s" % \
                    (duration, fingerprint)
                if query == lastquery and laststatus == 200:
                    logging.info("Same request already occurred - skipping")

                try:
                    conn = HTTPConnection("api.acoustid.org", 80)
                    conn.request("GET", query)
                    response = conn.getresponse()
                except:
                    continue
                puid = ""
                mb_title = ""
                mb_artists = ""
                if response.status != 200:
                    continue
                try:
                    lastquery = query
                    laststatus = 200
                    results = json.loads(response.read())
                    lastdata = results["results"][0]
                    logging.debug(lastdata)
                    release = "releasegroups" in lastdata \
                        and len(lastdata) and lastdata["releasegroups"][0]
                    recording = "recordings" in lastdata \
                        and len(lastdata) and lastdata["recordings"][0]
                    score = "score" in lastdata and lastdata['score']
                    logging.debug(release)
                    logging.debug(recording)
                    if len(lastdata):
                        logging.debug("%s results found" % len(lastdata))
                        puid = 'puids' in lastdata and lastdata["puids"][0]
                        mbid = release and release['id'] \
                            or recording \
                            and recording[0]['releasegroups'][0]['id']
                        mb_title = release and release["title"] \
                            or recording \
                            and recording[0]['title']
                        mb_artists = " ".join(
                            [i['name'] for i in (release and release["artists"]
                             or recording and recording[0]['artists'])])
                    logging.debug(
                        "Response status: %d %s" %
                        (response.status, response.read()))
                except Exception as e:
                    logging.error(e)
                    continue

                stoptime = time()
                requests = (requests + 1) % 3

                if score < 0.7:
                    continue

                if len(title) == len(mb_title):
                    title_distance = hamming(
                        title, mb_title) / float(len(title))
                else:
                    title_distance = levenshtein(
                        title, mb_title) / float(
                        max(len(title), len(mb_title)))

                if len(artist) == len(mb_artists):
                    author_distance = hamming(
                        artist, mb_artists) / float(len(artist))
                else:
                    author_distance = levenshtein(
                        artist, mb_artists) / float(
                        max(len(artist), len(mb_artists)))

                # if title_distance > 0.33 and author_distance > 0.5:
                logging.debug(
                    "distances: %s %s %s" %
                    (score, title_distance, author_distance))
                #     continue

                logging.debug("puid: %s, mbid %s" % (puid, mbid))
                with self.condition:
                    try:
                        song_id, album_id = self.db.execute(
                            "select id, album_id from song "
                            "where path = ?;", (path,)).fetchone()
                        self.db.execute(
                            "update song set puid = ?, mbid = ? "
                            "where id = ?", (puid, mbid, song_id))
                        if title_distance > 0:
                            self.db.execute(
                                "update song set title = ? "
                                "where id = ?", (mb_title, song_id))
                        self.db.commit()
                    except Exception as e:
                        logging.error(e)
        self.db.close()
Example #31
def get_performace(dataset_name, data_set, correct_dir, incorrect_dir):
    confusion_mat = dict()

    acc = 0.0
    mean_edit_distance = 0

    # make outputs
    correct_out_path = os.path.join(correct_dir, dataset_name)
    incorrect_out_path = os.path.join(incorrect_dir, dataset_name)
    if not os.path.exists(correct_out_path):
        os.mkdir(correct_out_path)
    if not os.path.exists(incorrect_out_path):
        os.mkdir(incorrect_out_path)

    correct_results_out = open(os.path.join(correct_out_path, dataset_name + '.results'), 'w')
    incorrect_results_out = open(os.path.join(incorrect_out_path, dataset_name + '.results'), 'w')

    file_idx = 0
    num_processed_data = 0

    data_loader = torch.utils.data.DataLoader(
        data_set, batch_size=128,
        shuffle=False, num_workers=0,
        collate_fn=data_set.collate_fn)

    with torch.set_grad_enabled(False):
        for batch_idx, (inputs, targets, synths, lengths, imgpaths) in enumerate(data_loader):
            num_processed_data += inputs.size(0)
            sys.stdout.write('\r' + str(dataset_name) + ': ' + str(num_processed_data) + '/' + str(len(data_set)))

            device_inputs = inputs.to(device)
            preds = crnn(device_inputs)
            preds_steps = torch.tensor([preds.size(0)] * preds.size(1), dtype=torch.int32)

            values, indices = preds.max(2)
            indices = indices.transpose(1, 0).contiguous().view(-1)

            blank_targets = torch.empty(targets.size(0) * 2, dtype=torch.int32)
            for idx in range(targets.size(0)):
                blank_targets[2 * idx] = targets[idx]
                blank_targets[2 * idx + 1] = 0

            for idx in range(lengths.size(0)):
                step = lengths[idx].item()
                lengths[idx] = step * 2
            target_texts = encoder.decode(blank_targets, lengths)
            pred_texts = encoder.decode(indices, preds_steps)

            pred_synths = generator_g(crnn.encoder(device_inputs))
            pred_synths = pred_synths.cpu()

            for idx in range(len(target_texts)):
                # classification
                lower_pred_text = pred_texts[idx].lower()
                lower_target_text = target_texts[idx].lower()

                synth = pred_synths[idx]
                synth = synth.numpy()
                synth = synth * 255
                synth = np.transpose(synth, (1, 2, 0))
                synth = synth.astype(np.uint8)

                if lower_pred_text == lower_target_text:
                    acc += 1.0

                    # Each dataset
                    str_idx = utils.idx_to_str(file_idx)
                    tmp_str = os.path.join(correct_out_path, str_idx + '.png')
                    tmp_img = inputs[idx].numpy()
                    tmp_img = tmp_img * 255
                    tmp_img = np.transpose(tmp_img, (1, 2, 0))
                    cv2.imwrite(tmp_str, tmp_img)

                    tmp_str = os.path.join(correct_out_path, str_idx + '.jpg')
                    cv2.imwrite(tmp_str, synth)

                    correct_results_out.write(str_idx + '\t' + imgpaths[idx] + '\t'
                                      + target_texts[idx] + '\t' + pred_texts[idx] + '\n')
                else:
                    edit_distance = utils.levenshtein(lower_pred_text, lower_target_text)
                    mean_edit_distance += edit_distance

                    # Each dataset
                    str_idx = utils.idx_to_str(file_idx)
                    tmp_str = os.path.join(incorrect_out_path, str_idx + '.png')
                    tmp_img = inputs[idx].numpy()
                    tmp_img = tmp_img * 255
                    tmp_img = np.transpose(tmp_img, (1, 2, 0))
                    cv2.imwrite(tmp_str, tmp_img)

                    tmp_str = os.path.join(incorrect_out_path, str_idx + '.jpg')
                    cv2.imwrite(tmp_str, synth)

                    incorrect_results_out.write(str_idx + '\t' + imgpaths[idx] + '\t'
                                                + target_texts[idx] + '\t' + pred_texts[idx] + '\n')

                file_idx += 1

            confusion_mat = utils.get_confusion_matrix(preds=pred_texts, targets=target_texts,
                                                       confusion_dict=confusion_mat)

    correct_results_out.close()
    incorrect_results_out.close()
    acc /= float(num_processed_data)
    mean_edit_distance /= float(num_processed_data)
    print("")
    print("num. of data: " + str(num_processed_data))

    return acc, mean_edit_distance, confusion_mat
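
The snippets in this collection rely on a Levenshtein edit-distance helper such as `utils.levenshtein` above; its implementation is not shown. A minimal sketch, assuming the usual dynamic-programming formulation over two sequences (strings or token lists):

def levenshtein(a, b):
    """Edit distance between two sequences (strings or lists of tokens)."""
    # Single-row dynamic programming: prev[j] is the distance between the
    # items of a consumed so far and the first j items of b.
    prev = list(range(len(b) + 1))
    for i, item_a in enumerate(a, start=1):
        curr = [i] + [0] * len(b)
        for j, item_b in enumerate(b, start=1):
            curr[j] = min(prev[j] + 1,                       # deletion
                          curr[j - 1] + 1,                   # insertion
                          prev[j - 1] + (item_a != item_b))  # substitution
        prev = curr
    return prev[len(b)]

Dividing the accumulated distance by a reference length, or (as above) by the number of samples, turns it into an error-rate style metric.
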
Beispiel #32
0
def validation(epoch,
               network,
               batchSize,
               set_name,
               Set,
               imageHeight,
               imageWidth,
               labels,
               num_classes,
               log_indicator,
               models_path,
               valid_writer,
               AACHEN_init=False,
               AACHEN_h5_file=[],
               dataAugmentation=False):

    nameList, inputs, targetList, seqLengths, heights, transcriptionList, transcriptionsLenList = Set

    SetSize = len(nameList)
    n_batches = ceil(SetSize / batchSize)

    nameList_copy, inputs_copy, targetList_copy, seqLengths_copy, heights_copy, transcriptionList_copy, transcriptionsLenList_copy = list(
        nameList), list(inputs), list(targetList), list(seqLengths), list(
            heights), list(transcriptionList), list(transcriptionsLenList)

    if dataAugmentation:
        inputs_copy = pack_images(inputs_copy, imageHeight, imageWidth)

    setTotalChars = np.sum(transcriptionsLenList)

    EDnorm = 0
    EDabs = 0
    totalCost = 0

    graph, saver, inputs_mask_ph, seq_len_ph, targets_ph, targets_len_ph, learning_rate_ph, n_batches_ph, setTotalChars_ph, previousEDabs_ph, previousEDnorm_ph, previousCost_ph, optimizer, batch_cost, cost, errors, ED, predictions, merged = network.create(
        imageHeight, imageWidth, num_classes, True)

    if type(inputs_mask_ph) == list:
        mask_ph = inputs_mask_ph[1]
        inputs_ph = inputs_mask_ph[0]
    else:
        inputs_ph = inputs_mask_ph

    if type(saver) == list:
        saver = saver[0]

    with tf.Session(graph=graph) as sess:

        if AACHEN_init:
            text = ('\nInitializing weights from AACHEN framework\n')
            print(text)
            log_indicator.write(text)
            init = tf.global_variables_initializer()

            feed_dict = initialize_from_AACHEN(graph, AACHEN_h5_file,
                                               log_indicator)
            sess.run(init, feed_dict=feed_dict)

        else:
            saver.restore(sess=sess,
                          save_path=tf.train.latest_checkpoint(models_path))

        valid_start = time.time()
        prev_percent = -1

        text = '\n' * 4 + "Samples for epoch " + str(
            epoch) + " in " + set_name + " set.\n"
        print(text)
        log_indicator.write(text)

        word_errors = 0
        num_words = 0

        for batch in range(n_batches):
            BatchNameList, BatchInputs, BatchTargetSparse, BatchSeqLengths, BatchHeights, BatchTranscriptions, BatchTransLen = get_batch(
                batchSize, nameList_copy, inputs_copy, targetList_copy,
                seqLengths_copy, heights_copy, transcriptionList_copy,
                transcriptionsLenList_copy)

            feed = {
                inputs_ph: BatchInputs,
                targets_ph: BatchTargetSparse,
                targets_len_ph: BatchTransLen,
                seq_len_ph: BatchSeqLengths,
                n_batches_ph: n_batches,
                setTotalChars_ph: setTotalChars,
                previousEDabs_ph: EDabs,
                previousEDnorm_ph: EDnorm,
                previousCost_ph: totalCost
            }

            if type(inputs_mask_ph) == list:
                mask = np.zeros(
                    [len(BatchNameList), imageHeight, imageWidth, 1])
                for img in range(len(BatchNameList)):
                    mask[img, :BatchHeights[img], :BatchSeqLengths[img],
                         0] = np.ones(
                             [BatchHeights[img], BatchSeqLengths[img]])
                feed[mask_ph] = mask

            summary, batchCost, totalCost, [
                EDnorm, EDabs
            ], BatchOutpusSparse, errors_output = sess.run(
                [merged, batch_cost, cost, ED, predictions[0], errors], feed)

            BatchOutput = sess.run(
                tf.sparse_tensor_to_dense(tf.SparseTensor(
                    BatchOutpusSparse.indices, BatchOutpusSparse.values,
                    BatchOutpusSparse.dense_shape),
                                          default_value=num_classes))
            labels[num_classes] = ' '
            for ind in range(len(BatchNameList)):
                obtained_transcription = ' '.join(
                    list(map(labels.get, list(BatchOutput[ind])))).strip()
                text = ('| Name:'.ljust(10) + str(BatchNameList[ind]).rjust(15) +
                        ' | ' + 'Target:'.ljust(10) +
                        ''.join(BatchTranscriptions[ind]).rjust(100) + ' |\n' +
                        '| Errors: '.ljust(10) + str(errors_output[ind]).rjust(15) +
                        ' | ' + 'Output:'.ljust(10) +
                        obtained_transcription.rjust(100) + ' |\n' +
                        '-' * 88 + '\n')
                print(text)
                log_indicator.write(text)

                # Transcriptions mark word boundaries with '|', so strip the
                # per-character spaces and split on '|' to compare word lists.
                word_errors += levenshtein(
                    ''.join(BatchTranscriptions[ind].split()).split('|'),
                    ''.join(obtained_transcription.split()).split('|'))
                num_words += len(''.join(
                    BatchTranscriptions[ind].split()).split('|'))

            batch_end = time.time()
            time_elapsed = floor(1000 * (batch_end - valid_start)) / 1000
            prev_percent = floor(10000 * (batch + 1) / n_batches) / 100
            remaining_time = floor(
                1000 * (100 * (time_elapsed + eps) /
                        (prev_percent + eps) - time_elapsed)) / 1000
            print('Epoch ' + str(epoch) + '. Evaluated ' +
                  str(len(BatchNameList)) + ' sequences in batch ' +
                  str(batch + 1) + '/' + str(n_batches) + '. Cost Function: ' +
                  str(batchCost) + '.\nTime elapsed: ' +
                  seconds_to_days_hours_min_sec(time_elapsed) +
                  '. Remaining time: ' +
                  seconds_to_days_hours_min_sec(remaining_time) + '\n')
            print('[' + int(prev_percent) * '|' +
                  (100 - int(prev_percent)) * ' ' + '] ' + str(prev_percent) +
                  '%\n')

        WER = word_errors / num_words
        valid_writer.add_summary(summary, epoch)

        print_valid_results(epoch, set_name, SetSize, totalCost,
                            [EDnorm, EDabs], WER, log_indicator)
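
Beispiel #32 counts word errors by splitting transcriptions on '|', which appears to be the word-boundary symbol in its label set. A stripped-down, hypothetical `word_error_rate` helper following the same accumulation (and reusing the `levenshtein` sketch above) could look like this:

def word_error_rate(targets, outputs):
    """Corpus WER over '|'-delimited transcriptions (assumed convention)."""
    word_errors = 0
    num_words = 0
    for target, output in zip(targets, outputs):
        # Drop the per-character spaces, then split on the word marker.
        target_words = ''.join(target.split()).split('|')
        output_words = ''.join(output.split()).split('|')
        word_errors += levenshtein(target_words, output_words)
        num_words += len(target_words)
    return word_errors / max(num_words, 1)
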
Beispiel #33
0
                batch_size = int(true_len.shape[0])
                output = model(img)  #[w, bs, 1782]
                torch.cuda.empty_cache()
                seq_len = torch.tensor([output.shape[0]] * output.shape[1])
                loss = criterion(log_probs=output,
                                 targets=targets,
                                 input_lengths=seq_len,
                                 target_lengths=true_len)
                valid_loss += loss.cpu().item()
                output = output.detach().permute(
                    1, 0, 2)  #[bs, seq_len, |vocs|+1(blank)]

                decoded, max_probs = decoder.decode(output, true_len)
                # Accumulate the edit distance against the reference labels and
                # the reference length so an error rate (val_edit / val_len)
                # can be reported after the loop.
                for i in range(targets.shape[0]):
                    target = targets[i][:true_len[i]].cpu().numpy().tolist()
                    distance = levenshtein(target, decoded[i])
                    target = ' '.join(list(map(str, target)))
                    # target = decoder.convert_np_to_string(target)
                    val_edit += distance
                    val_len += true_len[i].cpu().item()
                    if idx == 0:
                        dist_list.append(distance)
                        target_list.append(target)

            # writer.add_text('target', target_list[0])
            pred = ' '.join(list(map(str, decoded[0])))
            writer.add_text(
                'Result',
                'Decode: {} \n Target: {}'.format(pred, target_list[0]), epoch)
            # print('targets:',target_list[i])
            # print('decoded:',decoded[i])
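
The `decoder.decode` call in Beispiel #33 is not shown. A common greedy CTC decode collapses repeated indices and drops the blank symbol; here is a sketch under the assumption that blank is index 0 (the real decoder may differ, e.g. use beam search):

import torch

def greedy_ctc_decode(log_probs, blank=0):
    """Greedy CTC decode for a batch shaped [batch, time, num_classes].

    Returns one list of label indices per sample, with repeats collapsed
    and the blank symbol removed.
    """
    best = log_probs.argmax(dim=2)  # [batch, time]
    decoded = []
    for seq in best:
        labels = []
        prev = blank
        for idx in seq.tolist():
            if idx != blank and idx != prev:
                labels.append(idx)
            prev = idx
        decoded.append(labels)
    return decoded
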
Beispiel #34
0
            bashCommand = "/exp/sw/kaldi/tools/sctk/bin/rover -f 1 -a 0 -c {} -h {} ctm -h {} ctm -o {} -m maxconf".format(
                conf, service_ctm_path, ds2_ctm_path, output_ctm_path)
            process = subprocess.Popen(bashCommand.split(),
                                       stdout=subprocess.PIPE)
            output, error = process.communicate()

            fname2transcript = get_transcripts(output_ctm_path)
            dump_dir = '../dumps/decode/{}/char/'.format(accent)
            with open(join(dump_dir, 'ref_wrds_{}.txt'.format(part))) as fd:
                refs = fd.read().splitlines()

            total_wer = 0.0
            total_tokens = 0
            for ref, fname in zip(refs, valid_fnames):
                hyp = fname2transcript[fname]
                total_wer += levenshtein(ref.split(), hyp.split())
                total_tokens += len(ref.split())

            curr_wer = total_wer / total_tokens
            if curr_wer < best_wer:
                best_wer = curr_wer
                best_conf = conf

        print('Best NULL Conf: {}, WER: {}'.format(best_conf, best_wer))

    else:

        output_ctm_path = join(rover_dir, 'out_{}.ctm'.format(part))
        bashCommand = "/exp/sw/kaldi/tools/sctk/bin/rover -f 1 -a 0 -c {} -h {} ctm -h {} ctm -o {} -m maxconf".format(
            args.conf, service_ctm_path, ds2_ctm_path, output_ctm_path)
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
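
The loop around Beispiel #34 sweeps the ROVER NULL-confidence value and keeps the one with the lowest WER. A self-contained sketch of that selection logic, where `run_rover` and `get_transcripts` are hypothetical stand-ins for the subprocess call and CTM parser used above, and `levenshtein` is the edit-distance helper:

def pick_best_conf(confs, refs, valid_fnames, run_rover, get_transcripts):
    """Return the NULL-confidence value with the lowest corpus WER."""
    best_conf, best_wer = None, float('inf')
    for conf in confs:
        ctm_path = run_rover(conf)                   # hypothetical helper
        fname2transcript = get_transcripts(ctm_path)
        total_err, total_tokens = 0, 0
        for ref, fname in zip(refs, valid_fnames):
            hyp = fname2transcript[fname]
            total_err += levenshtein(ref.split(), hyp.split())
            total_tokens += len(ref.split())
        wer = total_err / max(total_tokens, 1)
        if wer < best_wer:
            best_conf, best_wer = conf, wer
    return best_conf, best_wer
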
Beispiel #35
0
            gt = line.strip()
            if gt in train_vocab:
                continue
            if opt.lower:
                gt = gt.lower()
                pred = pred.lower()
            if opt.alnum:
                # Raw string avoids the invalid '\W' escape warning.
                pattern = re.compile(r'[\W_]+')
                gt = pattern.sub('', gt)
                pred = pattern.sub('', pred)
                # pdb.set_trace()
                # gt =
            # print('before')
            if gt != pred:
                ww += 1
                wc += levenshtein(gt, pred)
                word_lens.append(len(gt))
                print(gt, pred, wc)
            tc += len(gt)
            tw += 1
else:
    for i, line in enumerate(f):
        if i % 2 == 0:
            pred = line.strip()
        else:
            gt = line.strip()
            gt = clean(gt)
            pred = clean(pred)
            gt_w = gt.split()
            pred_w = pred.split()
            for j in range(len(gt_w)):
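
Beispiel #35 accumulates word- and character-level counters (`ww`, `wc`, `tw`, `tc`) to report error rates. A small sketch of that per-pair accounting, under the same lowercasing and non-alphanumeric-stripping options (the function name and return shape are assumptions):

import re

def count_errors(gt, pred, lower=True, alnum=True):
    """Return (word_wrong, char_edits, char_total) for one gt/pred pair."""
    if lower:
        gt, pred = gt.lower(), pred.lower()
    if alnum:
        # Strip everything that is not a letter or a digit.
        pattern = re.compile(r'[\W_]+')
        gt, pred = pattern.sub('', gt), pattern.sub('', pred)
    word_wrong = int(gt != pred)
    char_edits = levenshtein(gt, pred) if gt != pred else 0
    return word_wrong, char_edits, len(gt)

# Word accuracy  : 1 - sum(word_wrong) / number_of_pairs
# Character error: sum(char_edits) / sum(char_total)
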