def log(wavs_test, texts_test, texts_pred, log_folder):
    """Write the evaluation report of a speech recognition run.

    Three artifacts are produced inside ``log_folder``:

    * ``result.json`` with the mean word error rate, rounded to 4 decimals,
    * a freshly created ``wav`` sub-folder holding a copy of every test file,
    * ``speechrecognition.json`` with the reference texts, the predicted
      texts and the copied wav paths (relative, ``wav/<basename>``).

    :param wavs_test: paths of the audio files that were evaluated
    :param texts_test: reference transcriptions, split on whitespace for
        scoring
    :param texts_pred: predicted transcriptions, paired with ``texts_test``
        by position
    :param log_folder: directory that receives the report files
    """
    wer = np.mean([
        calculate_wer(test.split(), pred.split())
        for test, pred in zip(texts_test, texts_pred)
    ])
    wer = np.round(wer, 4)
    result = {"WER": wer}
    content = json.dumps(result, ensure_ascii=False)
    log_file = join(log_folder, "result.json")
    write(log_file, content)

    wav_folder = join(log_folder, "wav")
    # Start from an empty wav folder. A missing folder is expected on the
    # first run, so filesystem errors from the removal are ignored -- but
    # only those: KeyboardInterrupt / SystemExit must still propagate.
    try:
        shutil.rmtree(wav_folder)
    except OSError:
        pass
    os.mkdir(wav_folder)
    for wav in wavs_test:
        shutil.copyfile(wav, join(wav_folder, basename(wav)))

    # Paths are stored relative to log_folder.
    wavs_test_new_path = [join("wav", basename(wav)) for wav in wavs_test]
    speech_recognition = {
        "texts_test": texts_test,
        "texts_pred": texts_pred,
        "wavs_test": wavs_test_new_path,
    }
    content = json.dumps(speech_recognition, ensure_ascii=False)
    log_file = join(log_folder, "speechrecognition.json")
    write(log_file, content)

    print("Result is written in {}".format(log_file))
    print("WER: {}%".format(wer * 100))
Beispiel #2
0
def convert_data(raw_folder, corpus_folder):
    """Re-save every file of ``raw_folder`` under ``corpus_folder``.

    Each entry of ``raw_folder`` is treated as a topic directory. A directory
    with the same name is created under ``corpus_folder`` and every file of
    the topic is read with ``read_utf16`` and written back with ``write``
    under the same file name.

    :param raw_folder: directory containing one sub-directory per topic
    :param corpus_folder: directory that receives the converted topics
    """
    print(raw_folder)
    for topic in listdir(raw_folder):
        print(topic)
        source_dir = join(raw_folder, topic)
        target_dir = join(corpus_folder, topic)
        mkdir(target_dir)
        for name in listdir(source_dir):
            text = read_utf16(join(source_dir, name))
            write(join(target_dir, name), text)
    def save(self, folder):
        """Write every document of the corpus into ``folder``.

        Each document is stored in a file named after its ``id``; the file
        holds the document's sentences joined by newlines.

        :param str folder: path to the output directory
        """
        try:
            mkdir(folder)
        except Exception:
            # Best effort: a failure to create the folder is ignored here.
            pass
        for doc in self.documents:
            write(join(folder, doc.id), u"\n".join(doc.sentences))
Beispiel #4
0
    def save(self, folder, format):
        """Write every document of the word-segmentation corpus to ``folder``.

        Each document is stored in a file named after its ``id``; the file
        holds the document's sentences joined by newlines.

        :param str folder: path to the output directory
        :param str format: documented as either TEXT or COLUMN; this
            method does not read the value
        """
        try:
            mkdir(folder)
        except Exception:
            # Best effort: a failure to create the folder is ignored here.
            pass
        for doc in self.documents:
            target = join(folder, doc.id)
            write(target, u"\n".join(doc.sentences))
Beispiel #5
0
    def fit(self, X, y, model_filename=None):
        """Fit FastText according to X, y.

        The training data is serialised to the scratch file ``temp.train``
        in the current working directory (one ``__label__<label> , <text>``
        line per sample), handed to ``ft.supervised`` and removed again.
        The trained model is stored in ``self.estimator``.

        Parameters
        ----------
        X : list of text
            each item is a text; newlines inside a text are replaced by
            spaces so every sample stays on one line
        y : list
            each item is indexed with ``[0]`` and only that element is used
            as the label; spaces inside it are replaced by ``-``
            NOTE(review): a plain string label would be reduced to its
            first character by this indexing -- confirm callers always pass
            lists of labels.
        model_filename : str, optional
            when given, forwarded to ``ft.supervised`` as second argument
        """
        train_file = "temp.train"
        texts = [text.replace("\n", " ") for text in X]
        labels = [item[0].replace(" ", "-") for item in y]
        lines = [
            "__label__{} , {}".format(label, text)
            for text, label in zip(texts, labels)
        ]
        try:
            write(train_file, "\n".join(lines))
            if model_filename:
                self.estimator = ft.supervised(train_file, model_filename)
            else:
                self.estimator = ft.supervised(train_file)
        finally:
            # Always clean up the scratch file, even when training fails;
            # the existence check keeps a failed write from being masked
            # by a FileNotFoundError.
            if os.path.exists(train_file):
                os.remove(train_file)
Beispiel #6
0
def save_temp(id, output):
    """Store ``output`` as ``<id>.correct`` inside ``samples_dir``.

    Every item of ``output`` becomes one line whose elements are joined by
    tabs.

    :param id: identifier used as the file name stem
    :param output: iterable of string sequences, one per line
    """
    target = join(samples_dir, "%s.correct" % id)
    rows = [u"\t".join(fields) for fields in output]
    write(target, u"\n".join(rows))
Beispiel #7
0
    text = "\n".join(lines)
    return text


def extract_sentence(content):
    """Rebuild the sentence of a tab-separated, one-token-per-line text.

    The first tab-separated field of every line is taken as the word; the
    words are joined by single spaces and prefixed with ``"# "``.

    :param str content: text with one token per line
    :return: the comment-style sentence line
    :rtype: str
    """
    words = (row.partition("\t")[0] for row in content.split("\n"))
    return "# " + " ".join(words)


if __name__ == '__main__':
    # Regression run: chunk every file of test_set and, for each file whose
    # result differs from the expected output, write the actual result and
    # an ndiff report into test_set/<model_id>/.
    test_dir = join(dirname(__file__), "test_set")
    files = [f for f in listdir(test_dir) if isfile(join(test_dir, f))]
    model_id = "1"
    result_dir = join(test_dir, model_id)
    # Start from an empty result folder. A missing folder is expected on
    # the first run; only filesystem errors are ignored so Ctrl-C still
    # stops the script.
    try:
        shutil.rmtree(result_dir)
    except OSError:
        pass
    mkdir(result_dir)
    for f in files:
        # Input and expected output are both loaded from the same file.
        sample = load_input(join(test_dir, f))
        # Chunk once and reuse the result.
        output = chunk(sample)
        actual = "\n".join("\t".join(tokens) for tokens in output)
        expected = load_output(join(test_dir, f))
        if actual != expected:
            print("\n{}".format(f))
            diff = '\n'.join(ndiff(expected.splitlines(), actual.splitlines()))
            sentence = extract_sentence(actual)
            write(join(result_dir, f), "\n".join([sentence, actual]))
            write(join(result_dir, f + ".diff"), "\n".join([sentence, diff]))
Beispiel #8
0
import requests
import json
from os.path import join

from underthesea.feature_engineering.text import Text
from underthesea.util.file_io import write

# Download the corpora listing from the local API and store it as JSON.
url = "http://localhost:8000/api/corpora/"
headers = {
    'Content-type': 'application/json',
    'Accept': 'application/json'}
# requests waits indefinitely without an explicit timeout, so bound the call.
r = requests.get(url, headers=headers, timeout=30)
# Stop on 4xx/5xx instead of writing an error payload to disk as data.
r.raise_for_status()
content = Text(json.dumps(r.json(), ensure_ascii=False))
write(join("data", "20171017.json"), content)
Beispiel #9
0
                TP += 1
            else:
                FN += 1
        else:
            if label in y_pred[i]:
                FP += 1
            else:
                TN += 1
    score[label] = {
        "TP": TP,
        "FP": FP,
        "TN": TN,
        "FN": FN,
        "accuracy": accuracy_score(TP, FP, TN, FN),
        "precision": precision_score(TP, FP, TN, FN),
        "recall": recall_score(TP, FP, TN, FN),
        "f1": f1_score(TP, FP, TN, FN),
    }

# Export the per-label scores as a spreadsheet, one row per label.
score_columns = [
    "TP", "TN", "FP", "FN", "accuracy", "precision", "recall", "f1"]
df = pd.DataFrame.from_dict(score)
df.T.to_excel("inspect/score.xlsx", columns=score_columns)

# Bundle the test data, the predictions and the scores into one report.
result = {
    "X_test": X_test,
    "y_test": y_test,
    "y_pred": y_pred,
    "score": score,
}

print(score)
content = json.dumps(result, ensure_ascii=False)
write("inspect/result.json", content)
Beispiel #10
0
def save_temp(id, output):
    """Store ``output`` as ``samples/accuracy/<id>.tmp`` next to this file.

    The items of ``output`` are joined by newlines, one item per line.

    :param id: identifier used as the file name stem
    :param output: iterable of strings
    """
    target = join(dirname(__file__), "samples", "accuracy", "%s.tmp" % id)
    write(target, u"\n".join(output))