Example No. 1
def gen_csv():

    csv_train = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if (".csv" in file and file.startswith("metadata")):
                file_dir = os.path.join(subdir, file)
                #Too many csvs we are short on memory
                #os.system("mv "+file_dir +" /data/home/GPUAdmin1/asr/M-AILABS/csvs/" + file)
                with open(file_dir) as csv_file:
                    csv_reader = csv.reader(csv_file, delimiter='|')
                    for row in csv_reader:
                        #print("filename: " + row[0])
                        #print("transcript: " + row[2])
                        filename = row[0]
                        transcript = row[2]
                        transcript = clean_sentence(transcript)
                        wav_file_dir = "/speech/M-AILABS/" + filename + ".wav"
                        if (os.path.exists(wav_file_dir)):
                            csv_train.append((wav_file_dir, transcript))

    df = pandas.DataFrame(data=csv_train)
    output_file = "/data/home/GPUAdmin1/asr/train_csvs/M-AILABS_train.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")

    #create dict from csvs
    """
Example No. 2
def gen_swc_csv(root_dir=dir):

    csv_rows = []  # avoid shadowing the csv module

    with open("transcriptions.txt", 'r') as f:
        lines = f.readlines()

    i = 0
    for line in lines:
        i += 1
        file_name, file_text = line.split(" ", 1)

        sentence = file_text.split(" ")
        if len(sentence) <= 2:
            continue

        trans = clean_sentence(file_text)
        file_path = os.path.join(root_dir, file_name + ".wav")
        csv_rows.append((file_path, trans))
        print("File " + str(i) + " / " + str(len(lines)), end='\r')

    print()
    print("Writing CSV File:")
    df = pandas.DataFrame(data=csv_rows)
    output_file = "/home/GPUAdmin1/asr/train_csvs/swc_train.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
Example No. 3
def rename_utterances_and_gen_csv(root_dir=dir):

    wav_files = os.path.join(root_dir, "wav_files")
    valid_wav = os.path.join(root_dir, "valid_wav")

    validated_tsv = os.path.join(root_dir, "validated.tsv")

    csv_data = []
    speakers_dict = get_dict_speakers()

    with open(validated_tsv) as f:
        lines = csv.reader(f, delimiter='\t')
        next(lines, None)
        i = 0
        for line in lines:
            client_id = line[0]
            speaker = speakers_dict.get(client_id)

            src = os.path.join(wav_files, line[1] + ".wav")
            dst = os.path.join(
                valid_wav,
                "spk{0:0=4d}".format(speaker) + "_utt{0:0=6d}.wav".format(i))
            shutil.copy(src, dst)

            trans = clean_sentence(line[2])
            csv_data.append((dst, trans))
            i += 1
            print("Renaming: " + str(i) + " / 277603 ", end="\r")

    sorted_csv = sorted(csv_data, key=lambda tup: tup[0])
    df = pandas.DataFrame(data=sorted_csv)
    output_file = "/speech/common_voice_de/common_voice_valid_wav.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
Example No. 4
def convert_to_wav(root_dir=dir):

    valid_wav = os.path.join(root_dir, "valid_wav")

    if not os.path.exists(valid_wav):
        os.makedirs(valid_wav)

    validated_tsv = os.path.join(root_dir, "validated.tsv")
    valid_data = []

    with open(validated_tsv) as f:
        # materialize the rows first: calling len(list(...)) on the reader
        # would exhaust it and leave nothing for the loop below
        rows = list(csv.reader(f, delimiter='\t'))[1:]  # drop the header row
        total = len(rows)
        i = 0
        for line in rows:
            i += 1
            src = os.path.join(root_dir, "clips", line[1] + ".mp3")
            dst = os.path.join(valid_wav, line[1] + ".wav")
            trans = clean_sentence(line[2])
            valid_data.append((dst, trans))
            # convert mp3 to wav, resampled to 16 kHz
            sound = AudioSegment.from_mp3(src)
            sound = sound.set_frame_rate(16000)
            sound.export(dst, format="wav")
            print("Converting files: " + str(i) + " / " + str(total), end="\r")

    df = pandas.DataFrame(data=valid_data)
    output_file = "/speech/common_voice_de/common_voice_valid_wav.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
Example No. 5
def generate_csv():
 
    paths = ["test", "dev", "train"]
    
    for path in paths:
        
        csv_rows = []  # avoid shadowing the csv module
        files = [
            f
            for f in listdir(join(directory, path))
            if isfile(join(directory, path, f))
        ]
        dir_path = os.path.join(directory, path)
        processed_files = 0
        total_files = len(files)

        for file in files:

            file_path = os.path.join(dir_path, file)
            processed_files += 1
            print("Processing " + path + " " + str(processed_files) + "/" + str(total_files), end="\r")
            if file.endswith(".xml"):
                tree = ET.parse(file_path)
                recording = tree.getroot()
                sent = recording.find("cleaned_sentence")
                sent = sent.text.lower()
                transcript = clean_sentence(sent)


                file_xml, _ = file.split(".", 1)
                found = 0
                for wav_file in files:
                    if wav_file.startswith(file_xml) and wav_file.endswith(".wav"):

                        wav_file_dir = os.path.join(dir_path, wav_file)
                        csv_rows.append((wav_file_dir, transcript))
                        found += 1
                    # cap at 5 microphone recordings per utterance; adjust or
                    # remove this check if you keep a different number of mics
                    #if found >= 2:
                    if found >= 5:
                        break

        print()
        output_file = os.path.join(directory, path + ".csv")

        with open(output_file, 'w') as f:
            for line in csv_rows:
                f.write(line[0]+","+line[1] + "\n")

        
        print("Successfully generated csv file {}.csv".format(path))
        print("=====================")
Example No. 6
def preprocess_wem(tuplist):  # inputs were formerly: (tuplist, start, limit)
    '''This function cleans and tokenizes sentences, removing punctuation and numbers and making words into lower-case stems.
    Input: a list of four-element tuples, the last element of which holds the long string of text we care about.
    (The former integer start and limit arguments, which bounded the DF row range for testing, have been dropped.)
    The function loops over four nested levels, which from high to low are: tuple, chunk, sentence, word.
    Note: This approach maintains accurate semantic distances by keeping stopwords.'''

    global mpdo  # Check if we're doing multiprocessing. If so, then mpdo=True
    global sents_combined  # Grants access to variable holding a list of lists of words, where each list of words represents a sentence in its original order (only relevant for this function if we're not using multiprocessing)
    global pcount  # Grants access to preprocessing counter

    known_pages = set()  # Initialize set of pages already seen for this school
    sents_combined = []  # Initialize list of all the school's sentences

    if isinstance(tuplist, float):
        return  # Can't iterate over floats (e.g. NaN), so exit

    #print('Parsing school #' + str(pcount)) # Print number of school being parsed

    for tup in tuplist:  # Iterate over tuples in tuplist (list of tuples)
        if tup == '' or tup[3] in known_pages:  # Check the empty case first; hashing could speed up the comparison: hashlib.sha224(tup[3].encode()).hexdigest()
            continue  # Skip empty entries and any page identical to one already seen on this school's website

        for chunk in tup[3].split('\n'):
            for sent in sent_tokenize(chunk):  # Tokenize chunk by sentences (in case >1 sentence per chunk)
                #sent = clean_sentence(sent, fast=True) # Clean and tokenize sentence
                sent = clean_sentence(sent)
                if not sent:  # If sentence is empty, continue to next sentence without appending
                    continue

                # TO DO: Chunk this by school, not just sentence
                # TO DO: Now that sentences are parsed and cleaned by spaces,
                # recombine and then parse more accurately using spacy word tokenizer

                # Save preprocessed sentence to object (if not multiprocessing)
                #sents_combined.append(sent)  # use append to keep sentences nested
                sents_combined.extend(sent)  # flat list of word tokens (un-nested version)

        known_pages.add(tup[3])

    school_sentslist.append(sents_combined)  # add sent to object

    #pcount += 1 # Add to counter

    return sents_combined
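
A minimal usage sketch for preprocess_wem, assuming the clean_sentence and sent_tokenize helpers used above are in scope and that clean_sentence returns a list of word tokens (as the later snippets suggest); the page tuples and URLs below are invented placeholders whose fourth element holds the page text:

# Hypothetical page tuples: preprocess_wem only reads tup[3], the page text.
school_sentslist = []  # module-level accumulator the function appends to

pages = [
    ("https://example.org/", "Home", 0, "Welcome to our school.\nWe value curiosity."),
    ("https://example.org/about", "About", 1, "Our mission is simple. Learn something every day."),
]

tokens = preprocess_wem(pages)
print(tokens[:10])  # flat list of cleaned, lower-cased word tokens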
Example No. 7
def gen_csv(root_dir=dir):

    csv_list = []

    trans = join(root_dir, "transcript.txt")
    error = 0
    with open(trans, 'r') as f:
        lines = csv.reader(f, delimiter='|')

        i = 0
        for line in lines:
            i += 1
            path = join(root_dir, line[0])
            text = line[2]

            clean_text = clean_sentence(text)
            csv_list.append((path, clean_text))
            print("File " + str(i) + " / 7427", end='\r')

    print()
    print("Writing CSV File:")
    df = pandas.DataFrame(data=csv_list)
    output_file = "/home/GPUAdmin1/asr/train_csvs/single_speaker.csv"
    df.to_csv(output_file, header=False, index=False, sep=",")
Example No. 8
def preprocess_wem2(ls):
    '''This function cleans and tokenizes sentences, removing punctuation and numbers and making words into lower-case stems.
    Input: a list of strings.
    Loops over every element of the input list, cleans the text, and returns one flat list of cleaned word tokens.'''

    global mpdo  # Check if we're doing multiprocessing. If so, then mpdo=True
    global sents_combined  # Grants access to variable holding a list of lists of words, where each list of words represents a sentence in its original order (only relevant for this function if we're not using multiprocessing)
    global pcount  # Grants access to preprocessing counter

    known_pages = set()  # Initialize set of known pages for a school (not used in this variant)
    sents_combined = []  # Initialize list of all the school's sentences

    #print('Parsing school #' + str(pcount)) # Print number of school being parsed

    for s in ls:  # Iterate over the strings in the input list
        for chunk in s.split('\n'):
            for sent in sent_tokenize(chunk):  # Tokenize chunk by sentences (in case >1 sentence per chunk)
                #sent = clean_sentence(sent, fast=True) # Clean and tokenize sentence
                sent = clean_sentence(sent)
                if not sent:  # If sentence is empty, continue to next sentence without appending
                    continue

                # TO DO: Chunk this by school, not just sentence
                # TO DO: Now that sentences are parsed and cleaned by spaces,
                # recombine and then parse more accurately using spacy word tokenizer

                # Save preprocessed sentence to object (if not multiprocessing)
                #sents_combined.append(sent)  # use append to keep sentences nested
                sents_combined.extend(sent)  # flat list of word tokens (un-nested version)
    school_sentslist.append(sents_combined)  # add sent to object

    return sents_combined
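
preprocess_wem2 takes a plain list of strings instead of page tuples; a small sketch under the same assumptions (school_sentslist, clean_sentence, and sent_tokenize already defined), with invented sample texts:

texts = [
    "First page of text.\nIt has two short sentences.",
    "Another page. Short and clean.",
]

tokens = preprocess_wem2(texts)
# The return value is a flat list of cleaned word tokens, not a single string.
print(len(tokens), tokens[:5])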
Example No. 9
punctstr = punctstr_make()

print("Stopwords, Unicodes, Punctuations lists creation complete!")


#word2vec computation
whole_text = []
s_count = 0 #initializing count for number of schools' texts appended
for school in df['text']:
    s_count += 1
    if s_count % 10000 == 0:
        print("Processed: ", s_count, " Schools' texts.")
    for chunk in school.split("\n"):
        for sent in sent_tokenize(chunk):
            sent = clean_sentence(sent)
            sent = [word for word in sent if word != '']
            if len(sent) > 0:
                whole_text.append(sent)

print("Text appending/processing complete!")

#defining directory locations to save word embedding model/vocab
cwd = os.getcwd()
model_path = cwd + "/wem_model_300d.bin"
vocab_path = cwd + "/wem_vocab_300d.txt"
Example No. 10
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str] = None,
    language='eng',
    remove_square_brackets=True,
    do_lower_case=True,
    min_length=20,
):
    """
    Breaks down the in_file into sentences. Each sentence will be on a separate line.
    Also replaces numbers with a simple spoken equivalent based on NUMBERS_TO_<lang> map and removes punctuation

    Args:
        in_file: path to original transcript
        out_file: path to the output file
        vocabulary: ASR model vocabulary
        language: text language
        remove_square_brackets: Set to True if square brackets [] should be removed from text.
            Text in square brackets often contains unaudibale fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
    """

    print(f'Splitting text in {in_file} into sentences.')
    with open(in_file, "r") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (
        transcript.replace("\n", " ")
        .replace("\t", " ")
        .replace("…", "...")
        .replace("»", "")
        .replace("«", "")
        .replace("\\", "")
        .replace("”", "")
        .replace("„", "")
    )
    # remove extra space
    transcript = re.sub(r' +', ' ', transcript)

    if remove_square_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)

    # Read and split transcript by utterance (roughly, sentences)
    split_pattern = "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s"

    if language == 'ru':
        lower_case_ru_letters_unicode = '\u0430-\u04FF'
        upper_case_ru_letters_unicode = '\u0410-\u042F'
        # remove the space in the middle of a lower-case abbreviation to avoid splitting it into separate sentences
        matches = re.findall(r'[a-z\u0430-\u04FF]\.\s[a-z\u0430-\u04FF]\.',
                             transcript)
        for match in matches:
            transcript = transcript.replace(match, match.replace('. ', '.'))

        split_pattern = ("(?<!\w\.\w.)(?<![A-Z" +
                         upper_case_ru_letters_unicode + "][a-z" +
                         lower_case_ru_letters_unicode + "]\.)(?<![" +
                         upper_case_ru_letters_unicode + "]\.)(?<=\.|\?|\!)\s")
    elif language not in ['ru', 'eng']:
        print(
            f'Consider using {language} unicode letters for better sentence split.'
        )

    sentences = re.split(split_pattern, transcript)
    sentences_comb = []

    # adds a short sentence to the previous one
    for i in range(len(sentences)):
        if len(sentences[i]) < min_length and len(sentences_comb) > 0:
            sentences_comb[-1] += ' ' + sentences[i].strip()
        else:
            sentences_comb.append(sentences[i].strip())

    sentences = "\n".join([s.strip() for s in sentences_comb if s])

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + '_with_punct.txt'),
              "w") as f:
        f.write(sentences)

    # substitute common abbreviations before applying lower case
    if language == 'ru':
        for k, v in RU_ABBREVIATIONS.items():
            sentences = sentences.replace(k, v)

    if do_lower_case:
        sentences = sentences.lower()

    print("Number of sentences:", len(sentences.split('\n')))

    sentences = '\n'.join(
        [clean_sentence(sentence) for sentence in sentences.split('\n')])

    # if language == 'eng':
    #     # for k, v in NUMBERS_TO_ENG.items():
    #     #     sentences = sentences.replace(k, v)
    #     # remove non acsii characters
    #     sentences = ''.join(i for i in sentences if ord(i) < 128)
    # elif language == 'ru':
    #     if vocabulary and '-' not in vocabulary:
    #         sentences = sentences.replace('-', ' ')
    #     for k, v in NUMBERS_TO_RU.items():
    #         sentences = sentences.replace(k, v)
    #     # replace Latin characters with Russian
    #     for k, v in LATIN_TO_RU.items():
    #         sentences = sentences.replace(k, v)
    #
    # # make sure to leave punctuation present in vocabulary
    # all_punct_marks = string.punctuation + "–—’“”"
    # if vocabulary:
    #     for v in vocabulary:
    #         all_punct_marks = all_punct_marks.replace(v, '')
    # sentences = re.sub("[" + all_punct_marks + "]", "", sentences).strip()

    with open(out_file, "w") as f:
        f.write(sentences)
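
A hedged usage sketch for split_text; the paths below are placeholders, and out_file should end in a four-character extension such as .txt because the sibling *_with_punct.txt file name is derived from out_file[:-4]:

# Split an English transcript into one cleaned sentence per line.
# A second file, interview_01_sentences_with_punct.txt, keeps the original
# punctuation and casing.
split_text(
    in_file="transcripts/interview_01.txt",             # placeholder input path
    out_file="transcripts/interview_01_sentences.txt",  # placeholder output path
    language="eng",
    do_lower_case=True,
    min_length=20,
)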
Example No. 11
from clean_text import clean_sentence

with open("/lm_corpus/German_sentences_8mil_filtered_maryfied.txt",
          "r") as text:
    with open("/lm_corpus/mary.txt", "w") as out_file:
        for line in text:
            sent = clean_sentence(line)
            out_file.write(sent + "\n")
Example No. 12
]
files_out = [
    rootdir + "test_csvs/cv_test.csv", rootdir + "dev_csvs/cv_dev.csv",
    rootdir + "test_csvs/tuda_test.csv", rootdir + "dev_csvs/tuda_dev.csv"
]
sentences = []
for file_dir in files:
    if (".csv" in file_dir):
        with open(file_dir) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                sentences.append(row[1] + "\n")
    else:
        with open(file_dir, "r") as text:
            for line in text:
                sent = clean_sentence(line.split(" ", 1)[1])
                sentences.append(sent + "\n")

sentences_out = []
for file_dir in files_out:
    with open(file_dir) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            sentences_out.append(row[1] + "\n")

sent_set = set(sentences)
sentences_out_set = set(sentences_out)  # set lookup is much faster than scanning the list for every sentence
with open("/data/home/GPUAdmin1/asr/corpus.txt", "w") as corpus:
    for sent in sent_set:
        if sent in sentences_out_set:
            continue
        corpus.write(sent)
Example No. 13
 # --TODO remove things between [] and ♪
 if "[" in transcript and "]" in transcript:
     transcript = clean(transcript, "[", "]")
 if "(" in transcript and ")" in transcript:
     transcript = clean(transcript, "(", ")")
 if "<" in transcript and ">" in transcript:
     transcript = clean(transcript, "<", ">")
 if "*" in transcript and "*" in transcript.split("*", 1)[1]:
     transcript = clean(transcript, "*")
 if "♪" in transcript and "♪" in transcript.split("♪", 1)[1]:
     transcript = clean(transcript, "♪")
 transcript = transcript.replace("- ", "")
 transcript = transcript.replace("-", "")
 transcript = transcript.replace('"', "")
 transcript = transcript.replace("'", "")
 transcriptclean = clean_sentence(transcript)
 # --TODO if the whole wav is non-talk, ignore it, i.e. continue
 if (
     # TODO continue after Parfum
     transcriptclean.strip() == ""
     or transcriptclean.strip() == "nina deggert thomas deggert"
     or transcriptclean.strip() == "dr friedrich kronberg"
     or transcriptclean.strip() == "operation juninacht"
     or transcriptclean.strip() == "monika schöllack"
     or transcriptclean.strip() == "spricht polnisch"
     or transcriptclean.strip() == "telefon"
     or transcriptclean.strip() == "hupen"
     or transcript.strip() == "Ostberlin 1980"
     or "hassans vater spricht arabisch" in transcriptclean
     or "singt schlaflied auf polnisch" in transcriptclean
     or "reifen quietschen" in transcriptclean
def nltk_tokenize(rootdir=dir,
                  output_root="/lm_corpus/dewiki_nltk_segmented/"):

    paths = listdir(rootdir)

    exists = os.path.isdir(output_root)
    if not exists:
        os.mkdir(output_root)

    total_paths = len(paths)
    current_path = 0

    for path in paths:

        output_dir = join(output_root, path)
        exists = os.path.isdir(output_dir)
        if not exists:
            os.mkdir(output_dir)

        files = [
            f for f in listdir(join(rootdir, path))
            if isfile(join(rootdir, path, f))
        ]

        current_path += 1
        total_files = len(files)
        processed_files = 0

        for file in files:

            file_path = join(rootdir, path, file)
            new_file_name = join(output_dir, file + ".txt")

            processed_files += 1
            print("Processing path " + path + " " + str(current_path) + "/" +
                  str(total_paths) + " Files: " + str(processed_files) + "/" +
                  str(total_files),
                  end="\r")

            with open(file_path, 'r+', encoding='utf-8') as f:
                with open(new_file_name, 'w', encoding='utf-8') as new_file:
                    doc = ""
                    skip_header = False

                    while True:
                        line = f.readline()
                        if not line:
                            doc = ""
                            break

                        if skip_header:
                            skip_header = False
                            continue

                        if "<doc id=" in line:
                            skip_header = True
                            continue
                        if not line.strip():
                            continue

                        if "</doc>" in line:
                            sentences = sent_tokenize(doc)
                            for j in range(len(sentences)):
                                clean_sent = clean_sentence(sentences[j])
                                clean_sent = ' '.join(clean_sent.split())
                                new_file.write(clean_sent + '\n')
                            doc = ""
                        else:
                            doc = doc + line
def spacy_tokenize(rootdir=dir,
                   output_root="/lm_corpus/dewiki_spacy_segmented/"):

    nlp = spacy.load('de')
    paths = listdir(rootdir)

    exists = os.path.isdir(output_root)
    if not exists:
        os.mkdir(output_root)

    total_paths = len(paths)
    current_path = 0

    for path in paths:

        output_dir = join(output_root, path)
        exists = os.path.isdir(output_dir)
        if not exists:
            os.mkdir(output_dir)

        files = [
            f for f in listdir(join(rootdir, path))
            if isfile(join(rootdir, path, f))
        ]

        current_path += 1
        total_files = len(files)
        processed_files = 0

        for file in files:

            file_path = join(rootdir, path, file)
            new_file_name = join(output_dir, file + "_spacy.txt")

            processed_files += 1
            print("Processing path " + path + " " + str(current_path) + "/" +
                  str(total_paths) + " Files: " + str(processed_files) + "/" +
                  str(total_files),
                  end="\r")

            with open(file_path, 'r+', encoding='utf-8') as f:
                with open(new_file_name, 'w', encoding='utf-8') as new_file:
                    content = f.readlines()
                    doc = ""
                    skip_header = False
                    for i in range(len(content)):

                        if skip_header:
                            skip_header = False
                            continue

                        if "<doc id=" in content[i]:
                            skip_header = True
                            continue

                        if not content[i].strip():
                            continue

                        if "</doc>" in content[i]:

                            doc = nlp(doc)
                            sentences = list(doc.sents)
                            for j in range(len(sentences)):
                                clean_sent = clean_sentence(
                                    sentences[j].text.strip())  # Span.text replaces the deprecated Span.string
                                #clean_sent = sentences[j].text.strip()
                                clean_sent = ' '.join(clean_sent.split())
                                new_file.write(clean_sent + '\n')
                            new_file.write('\n')
                            doc = ""

                        else:
                            doc = doc + content[i]