Python get_logger Examples, ekstep_data_pipelines.common.utils.get_logger Python Examples

Example #1

0

Show file

File: data_marker.py Project: TW-Speech/audio-to-speech-pipeline

from ekstep_data_pipelines.common.file_system.gcp_file_systen import GCPFileSystem
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor

from ekstep_data_pipelines.data_marker.constants import (
    CONFIG_NAME,
    FILTER_CRITERIA,
    LANDING_BASE_PATH,
    SOURCE_BASE_PATH,
)
from ekstep_data_pipelines.data_marker.data_filter import DataFilter
from ekstep_data_pipelines.data_marker.data_mover import MediaFilesMover

ESTIMATED_CPU_SHARE = 0.02

Logger = get_logger("Data marker")


class DataMarker(BaseProcessor):
    """
    1. Load Configuration
    2. Filter data based on criteria
    3. Tag/Mark data in the DB
    4. Move marked data
    """
    @staticmethod
    def get_instance(data_processor_instance, gcs_instance, **kwargs):
        # Factory hook: forwards every argument unchanged to the constructor.
        return DataMarker(data_processor_instance, gcs_instance, **kwargs)

    def __init__(self, postgres_client, gcs_instance, **kwargs):
        # The first positional argument (named data_processor_instance in
        # get_instance) is kept as the Postgres client.
        self.postgres_client = postgres_client

Example #2

0

Show file

import pandas as pd

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("DataFilter")


class DataFilter(object):
    """Lazy filters over utterance records.

    Each method returns a ``filter`` iterator over ``utterances``; nothing is
    evaluated until the caller iterates the result. Records are indexed
    positionally (fields 0, 2 and 4 are the ones read here).
    """

    def exclude_audio_ids(self, utterances, audio_ids):
        """Keep only records whose field 0 is not in ``audio_ids``."""

        def is_kept(record):
            return record[0] not in audio_ids

        return filter(is_kept, utterances)

    def exclude_speaker_ids(self, utterances, speaker_ids):
        """Keep only records whose field 0 is not in ``speaker_ids``."""

        def is_kept(record):
            return record[0] not in speaker_ids

        return filter(is_kept, utterances)

    def by_utterance_duration(self, utterances, filters):
        """Keep records whose field 2 lies within ``filters["gte"]``..``filters["lte"]``."""

        def within_bounds(record):
            return filters["lte"] >= record[2] >= filters["gte"]

        return filter(within_bounds, utterances)

    def by_snr(self, utterances, filters):
        """Keep records whose field 4 lies within ``filters["gte"]``..``filters["lte"]``."""

        def within_bounds(record):
            return filters["lte"] >= record[4] >= filters["gte"]

        return filter(within_bounds, utterances)

    def by_duration(
        self,
        utterances,

Example #3

0

Show file

File: punjabi_sanitizer.py Project: TW-Speech/audio-to-speech-pipeline

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )

from ekstep_data_pipelines.common.utils import get_logger
import re

LOGGER = get_logger("PunjabiSanitizer")


class PunjabiSanitizer(BaseTranscriptionSanitizer):

    VALID_CHARS = "[  ਼ ਂ ੍ੑ ਾ ਿ ੀ ੁ ੂ ੇ ੈ ੋੰੱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲਲ਼ਵਸ਼ਸਹਖ਼-ੜਫ਼]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook; keyword arguments are accepted but not used."""
        return PunjabiSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no state is initialised here."""
        pass

    def sanitize(self, transcription):
        # Pass the text as a lazy logging argument instead of concatenating,
        # so the message is only formatted when INFO logging is enabled.
        LOGGER.info("Sanitizing transcription:%s", transcription)
        # Trim surrounding whitespace before character replacement ...
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        # ... and again afterwards, since replacement may expose new edges.
        transcription = transcription.strip()

Example #4

0

Show file

from ekstep_data_pipelines.audio_processing.generate_hash import (
    get_hash_code_of_audio_file, )
from ekstep_data_pipelines.audio_processing.constants import (
    CONFIG_NAME,
    REMOTE_RAW_FILE,
    CHUNKING_CONFIG,
    SNR_CONFIG,
    REMOTE_PROCESSED_FILE_PATH,
    MASTER_META_DATA_FILE_PATH,
    SNR_DONE_FOLDER_PATH,
    DUPLICATE_AUDIO_FOLDER_PATH,
)
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor

Logger = get_logger("Audio Processor")


class AudioProcessor(BaseProcessor):
    """
    Class for breaking a downloaded file into smaller chunks of
    audio files as well as filtering out files with more than an acceptable level
    of Signal to Noise Ratio (or SNR)
    """

    # Local directory used by default for downloaded raw audio.
    DEFAULT_DOWNLOAD_PATH = "/tmp/audio_processing_raw"

    @staticmethod
    def get_instance(data_processor, gcs_instance, audio_commons,
                     catalogue_dao, **kwargs):
        return AudioProcessor(data_processor, gcs_instance, audio_commons,

Example #5

0

Show file

File: google_transcription_client.py Project: akbatra567/audio-to-speech-pipeline

import os

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums
from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import (
    GoogleTranscriptionClientError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("GoogleTranscriptionClient")


class GoogleTranscriptionClient(object):
    """Holds Google transcription settings read from the pipeline config."""

    @staticmethod
    def get_instance(config_dict):
        """Build a client from ``config_dict["common"]["google_transcription_client"]``.

        Either level may be missing; an empty dict is used instead, so the
        constructor defaults apply.
        """
        google_config_dict = config_dict.get("common", {}).get(
            "google_transcription_client", {})
        return GoogleTranscriptionClient(**google_config_dict)

    def __init__(self, **config_dict):
        """Store the settings, falling back to defaults for missing keys."""
        self.language = config_dict.get("language", "hi-IN")
        self.sample_rate = config_dict.get("sample_rate", 16000)
        self.channels = config_dict.get("audio_channel_count", 1)
        self.bucket = config_dict.get("bucket")  # None when not configured
        self._client = None  # no client object is created in the constructor

    def make_directories(self, path):
        """Create ``path`` (including missing parents) if it does not exist."""
        if not os.path.exists(path):
            # The logger object itself is not callable; log through .info().
            LOGGER.info("Directory %s does not already exist", path)
            # exist_ok avoids a failure if the directory appears between the
            # existence check above and this call.
            os.makedirs(path, exist_ok=True)
            LOGGER.info("Directory %s created successfully", path)

Example #6

0

Show file

File: kannada_sanitizer.py Project: TW-Speech/audio-to-speech-pipeline

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )

from ekstep_data_pipelines.common.utils import get_logger
import re

LOGGER = get_logger("KannadaTranscriptionSanitizer")


class KannadaSanitizer(BaseTranscriptionSanitizer):

    VALID_CHARS = "[ ಂ-ಃಅ-ಋಎ-ಐಒ-ನಪ-ರಲ-ಳವ-ಹಾ-ೄೆ-ೈೊ-್ೲ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook; keyword arguments are accepted but not used."""
        return KannadaSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no state is initialised here."""
        pass

    def sanitize(self, transcription):
        # Pass the text as a lazy logging argument instead of concatenating,
        # so the message is only formatted when INFO logging is enabled.
        LOGGER.info("Sanitizing transcription:%s", transcription)
        # Trim surrounding whitespace before character replacement ...
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        # ... and again afterwards, since replacement may expose new edges.
        transcription = transcription.strip()

Example #7

0

Show file

File: create_embeddings.py Project: srajat84/audio-to-speech-pipeline

from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.audio_analysis.speaker_analysis.speaker_clustering import (
    create_speaker_clusters, )
from ekstep_data_pipelines.audio_analysis.speaker_analysis.file_cluster_mapping import (
    speaker_to_file_name_map, )

Logger = get_logger("AudioSpeakerClusteringProcessor")


def create_embeddings(
    local_audio_download_path,
    dir_pattern,
    embed_file_path,
    fs_interface,
    npz_bucket_destination_path,
    source_name,
):
    """Upload the embedding file, then cluster speakers and log the count.

    ``local_audio_download_path`` and ``dir_pattern`` are accepted but not
    read by the statements below. The upload outcome is only logged; the
    clustering step runs either way.
    """
    upload_succeeded = fs_interface.upload_to_location(
        embed_file_path, npz_bucket_destination_path)
    upload_message = ("npz file uploaded to :"
                      if upload_succeeded else
                      "npz file could not be uploaded to :")
    Logger.info(upload_message + npz_bucket_destination_path)

    # The second element (noise mapping) is unpacked but not used here.
    speaker_clusters, _noise_clusters = create_speaker_clusters(
        embed_file_path, source_name)
    files_by_speaker = speaker_to_file_name_map(speaker_clusters)
    speaker_count = len(files_by_speaker)
    Logger.info("total speakers:" + str(speaker_count))

Example #8

0

Show file

import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("MalayalamTranscriptionSanitizer")


class MalayalamSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ം-ഃഅ-ഋഎ-ഐഒ-നപ-ഺാ-ൃെ-ൈൊ-്ൺ-ൾ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook: return a new sanitizer; ``kwargs`` are not used."""
        sanitizer = MalayalamSanitizer()
        return sanitizer

    def __init__(self, *args, **kwargs):
        """Accept any arguments and initialise nothing."""

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()

        if len(transcription) == 0:

Example #9

0

Show file

# import signal
import sys
import multiprocessing
import os

from concurrent.futures import ThreadPoolExecutor

from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor
from ekstep_data_pipelines.audio_embedding.create_embeddings import (
    encode_each_batch)

LOGGER = get_logger("AudioEmbeddingProcessor")

ESTIMATED_CPU_SHARE = 0.1


class AudioEmbedding(BaseProcessor):
    """
    Class to identify speaker for each utterance in a source
    """

    # Relative working directories, resolved against the current working
    # directory of the process.
    local_txt_path = "./audio_speaker_cluster/file_path/"
    local_audio_path = "./audio_speaker_cluster/audio_files/"
    embed_file_path = "./audio_speaker_cluster/embed_file_path/"

    @staticmethod
    def get_instance(data_processor, **kwargs):
        # Factory hook: forwards every argument unchanged to the constructor.
        return AudioEmbedding(data_processor, **kwargs)

    def __init__(self, data_processor, **kwargs):

Example #10

0

Show file

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )

from ekstep_data_pipelines.common.utils import get_logger
import re

LOGGER = get_logger("IndianEnglishSanitizer")


class IndianEnglishSanitizer(BaseTranscriptionSanitizer):
    """Sanitizer for Indian-English transcriptions."""

    # Character class of accepted characters: space, ASCII letters, digits
    # and the apostrophe.
    VALID_CHARS = "[ a-zA-Z0-9']"
    # The final character is the Devanagari danda. It had been stored as the
    # mis-decoded three-letter sequence produced when its UTF-8 bytes are
    # read as cp866, which made three Cyrillic letters count as punctuation
    # instead of the danda. The apostrophe is absent here because it is
    # listed in VALID_CHARS.
    PUNCTUATION = '!"#%&()*+,./;<=>?@[\\]^_`{|}~।'

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook; keyword arguments are accepted but not used."""
        return IndianEnglishSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no state is initialised here."""
        pass

    def sanitize(self, transcription):
        # Pass the text as a lazy logging argument instead of concatenating,
        # so the message is only formatted when INFO logging is enabled.
        LOGGER.info("Sanitizing transcription:%s", transcription)
        # Trim surrounding whitespace before character replacement ...
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        # ... and again afterwards, since replacement may expose new edges.
        transcription = transcription.strip()

Example #11

0

Show file

import pandas as pd

from ekstep_data_pipelines.common.utils import get_logger

from sqlalchemy import text

from ekstep_data_pipelines.common.dao.constants import (
    GET_UNIQUE_ID,
    IS_EXIST,
    COMMAND_WITH_LICENSE,
    COMMAND_WITHOUT_LICENSE,
    LICENSE,
)

LOGGER = get_logger("CatalogueDao")


class CatalogueDao:
    """Data-access helper that runs catalogue queries through a Postgres client."""

    def __init__(self, postgres_client):
        # Client object; only its ``execute_query`` method is used below.
        self.postgres_client = postgres_client

    def get_utterances(self, audio_id):
        """Return the decoded ``utterances_files_list`` for ``audio_id``.

        The first column of the first returned row is parsed as JSON. An
        empty list is returned when the query yields no rows (or a falsy
        result).
        """
        # Imported locally so this method does not depend on a module-level
        # ``json`` import being present.
        import json

        parm_dict = {"audio_id": audio_id}
        utterances = self.postgres_client.execute_query(
            "select utterances_files_list from media_metadata_staging where audio_id = :audio_id",
            **parm_dict,
        )
        # Truthiness check also covers a None result, not just an empty one.
        if not utterances:
            return []
        return json.loads(utterances[0][0])

    def get_utterances_by_source(self, source, status):

Example #12

0

Show file

import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("UrduTranscriptionSanitizer")


class UrduSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ء-آؤئ-بت-غف-قل-نؤٹپچڈڑژکگںھہیے-ۓ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook: return a new sanitizer; ``kwargs`` are not used."""
        sanitizer = UrduSanitizer()
        return sanitizer

    def __init__(self, *args, **kwargs):
        """Accept any arguments and initialise nothing."""

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()

        if len(transcription) == 0:

Example #13

0

Show file

import json
import os
import shutil
import subprocess

import pandas as pd
from ekstep_data_pipelines.audio_language_identification.audio_language_inference import (
    infer_language,
)
from ekstep_data_pipelines.audio_processing.audio_duration import calculate_duration
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("Snr")


class SNR:
    """
    Utility object for performing SNR analysis.
    """

    # NOTE(review): unit is not stated here; presumably seconds -- confirm.
    MAX_DURATION = 15

    @staticmethod
    def get_instance(initialization_dict):
        # Read the feature flag from the "audio_processor_config" section;
        # a missing section or key leaves the flag False.
        feat_language_identification = initialization_dict.get(
            "audio_processor_config", {}
        ).get("feat_language_identification", False)
        LOGGER.info(
            "Running with feat_language_identification=%s",
            str(feat_language_identification),
        )

Example #14

0

Show file

File: gujrati_sanitizer.py Project: akbatra567/audio-to-speech-pipeline

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("GujratiTranscriptionSanitizer")


class GujratiSanitizer(BaseTranscriptionSanitizer):
    """Placeholder sanitizer: ``sanitize`` performs no work and returns None."""

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook: return a new sanitizer; ``kwargs`` are not used."""
        sanitizer = GujratiSanitizer()
        return sanitizer

    def __init__(self, *args, **kwargs):
        """Accept any arguments and initialise nothing."""

    def sanitize(self, transcription):
        """Do nothing with ``transcription``; always returns None."""
        return None

Example #15

0

Show file

File: data_mover.py Project: akbatra567/audio-to-speech-pipeline

from concurrent.futures import ThreadPoolExecutor

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("MediaFilesMover")


class MediaFilesMover(object):
    """Moves media files to a landing location using a pool of threads."""

    def __init__(self, file_system, concurrency):
        # file_system must provide ``mv_file(file, target_dir)``.
        self.file_system = file_system
        # Maximum number of worker threads used per move_media_files call.
        self.concurrency = concurrency

    def move_media_files(self, files, landing_path_with_source):
        """Move every path in ``files`` under ``landing_path_with_source``.

        The two path components preceding the file name are preserved, so
        ``a/b/c/f.wav`` lands in ``<landing_path_with_source>/b/c``. The call
        blocks until all moves have finished. A failing move is logged and
        does not stop the remaining moves or raise to the caller.
        """
        Logger.info("using concurrency:%s", str(self.concurrency))
        submitted = []
        # The context manager guarantees the pool is shut down (and waited
        # for) even if building a landing path raises part-way through.
        with ThreadPoolExecutor(max_workers=self.concurrency) as worker_pool:
            for file in files:
                relative_audio_id_clean_path = "/".join(file.split("/")[-3:-1])
                landing_path = f"{landing_path_with_source}/{relative_audio_id_clean_path}"
                future = worker_pool.submit(self.file_system.mv_file, file,
                                            landing_path)
                submitted.append((file, future))

        # All futures are complete here; surface failures that would
        # otherwise be silently dropped with the future objects.
        for file, future in submitted:
            error = future.exception()
            if error is not None:
                Logger.error("Failed to move %s: %s", file, error)

Example #16

0

Show file

import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("BengaliTranscriptionSanitizer")


class BengaliSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ঁ-ঃঅ-ঋএ-ঐও-নপ-রলশ-হ়া-্ে-ৈো-ৎয়]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook: return a new sanitizer; ``kwargs`` are not used."""
        sanitizer = BengaliSanitizer()
        return sanitizer

    def __init__(self, *args, **kwargs):
        """Accept any arguments and initialise nothing."""

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()

        if len(transcription) == 0:

Example #17

0

Show file

import glob
import os
import subprocess
import collections
import contextlib
import sys
import wave
import webrtcvad
import sox

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("Chunking Util")


class ChunkingConversionUtil:

    # NOTE(review): presumably the VAD aggressiveness used when re-chunking
    # (the module imports webrtcvad) -- confirm against the usage site.
    re_chunking_aggressiveness = 3

    @staticmethod
    def get_instance():
        # Factory hook: returns a new instance; takes no configuration.
        return ChunkingConversionUtil()

    def convert_to_wav(self, input_dir, output_dir=None, ext="mp4"):

        Logger.info(f"Convert all the files in {input_dir} to wav")
        audio_paths = glob.glob(input_dir + "/*." + ext)

        Logger.info(f"Files to be completed: {audio_paths}")

        if len(audio_paths) < 1:

Example #18

0

Show file


class ACTIONS:
    # Namespace of action-name string constants (plain class attributes).
    DATA_MARKING = "data_marking"
    AUDIO_PROCESSING = "audio_processing"
    AUDIO_TRANSCRIPTION = "audio_transcription"
    AUDIO_ANALYSIS = "audio_analysis"
    AUDIO_CATALOGUER = "audio_cataloguer"


class FILE_SYSTEMS:
    # Namespace of file-system name string constants.
    GOOGLE = "google"
    LOCAL = "local"


LOGGER = get_logger("EKSTEP_PROCESSOR")
# Every action constant declared on ACTIONS, in declaration order.
ACTIONS_LIST = [
    ACTIONS.DATA_MARKING,
    ACTIONS.AUDIO_PROCESSING,
    ACTIONS.AUDIO_TRANSCRIPTION,
    ACTIONS.AUDIO_ANALYSIS,
    ACTIONS.AUDIO_CATALOGUER,
]
# Every file-system constant declared on FILE_SYSTEMS.
FILES_SYSTEMS_LIST = [FILE_SYSTEMS.GOOGLE, FILE_SYSTEMS.LOCAL]
# config_bucket = 'ekstepspeechrecognition-dev'

# Command-line parser, created at import time.
parser = argparse.ArgumentParser(
    description="Util for data processing for EkStep")

parser.add_argument(
    "-b",

Example #19

0

Show file

from azure.cognitiveservices import speech
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import (
    AzureTranscriptionClientError, )

LOGGER = get_logger("AzureTranscriptionClient")


class AzureTranscriptionClient(object):
    """Wrapper around the Azure speech SDK configured from the pipeline config."""

    @staticmethod
    def get_instance(config_dict):
        """Build a client from ``config_dict["common"]["azure_transcription_client"]``.

        Either level may be missing; an empty dict is used instead.
        """
        azure_config_dict = config_dict.get("common", {}).get(
            "azure_transcription_client", {})
        return AzureTranscriptionClient(**azure_config_dict)

    def __init__(self, **kwargs):
        """Store the credentials and create the SDK speech configuration."""
        self.speech_key = kwargs.get("speech_key")  # None when not supplied
        self.service_region = kwargs.get("service_region")
        self.language = kwargs.get("language", "hi-IN")
        self.speech_config = speech.SpeechConfig(subscription=self.speech_key,
                                                 region=self.service_region)

    def generate_transcription(self, language, source_file_path):
        """Return the recognised text for ``source_file_path``.

        ``language`` is accepted but not read by this method.

        Raises:
            AzureTranscriptionClientError: when ``speech_to_text`` raises a
                ``RuntimeError``; the original error is attached as the cause.
        """
        try:
            result = self.speech_to_text(source_file_path)
        except RuntimeError as error:
            # Explicit chaining keeps the SDK error as __cause__ in tracebacks.
            raise AzureTranscriptionClientError(error) from error
        return result.text

    def speech_to_text(self, audio_file_path):
        audio_input = speech.audio.AudioConfig(filename=audio_file_path)

Example #20

0

Show file

import os
from os import listdir
from os.path import isfile, join
from google.cloud import storage
from ekstep_data_pipelines.common.infra_commons.storage import BaseStorageInterface
from concurrent.futures import ThreadPoolExecutor
from ekstep_data_pipelines.common.infra_commons.storage.exceptions import (
    FileNotFoundException,
    PathDoesNotExist,
)
from ekstep_data_pipelines.common.utils import get_logger
from tqdm import tqdm

Logger = get_logger("GoogleStorage")


class GoogleStorage(BaseStorageInterface):
    def __init__(self, **kwargs):
        # Keyword arguments are accepted but not read here; the client slot
        # starts out empty.
        self._client = None

    def get_bucket_from_path(self, path) -> str:
        """Return the first non-empty ``/``-separated segment of ``path``.

        ``None`` is returned when ``path`` is falsy or contains no non-empty
        segment (for example only slashes).
        """
        if path:
            segments = [segment for segment in path.split("/") if segment]
            if segments:
                return segments[0]
        return None

Example #21

0

Show file

File: gcp_file_systen.py Project: akbatra567/audio-to-speech-pipeline

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("GCPFileSystem")


class GCPFileSystem:
    """File-system style helpers (ls / mv) on top of a GCP operations object."""

    def __init__(self, gcp_operations):
        # Must provide list_blobs_in_a_path, check_path_exists and move_blob.
        self.gcp_operations = gcp_operations

    def ls(self, dir_path):
        """Return the ``name`` of every blob listed under ``dir_path``."""
        return [
            blob.name
            for blob in self.gcp_operations.list_blobs_in_a_path(dir_path)
        ]

    def mv(self, source_dir, target_dir, is_dir=True):
        """Move every file listed under ``source_dir`` into ``target_dir``.

        When ``is_dir`` is true and the source path does not exist, the
        situation is logged and nothing is moved.
        """
        # NOTE(review): ``self`` is passed explicitly as the first argument
        # of a bound-method call; confirm against the signature of
        # check_path_exists whether this is intended.
        if is_dir and not self.gcp_operations.check_path_exists(
                self, source_dir):
            Logger.info("source dir does not exist:%s", source_dir)
            return

        files = self.ls(source_dir)
        for file in files:
            self.mv_file(file, target_dir)

    def mv_file(self, file, target_dir):
        """Move ``file`` to ``target_dir``, keeping only its final path segment."""
        # Build the destination from the last path segment. Substituting the
        # source directory text with str.replace would also rewrite matching
        # text inside the file name, and mangles paths with no directory part.
        file_name = file.rpartition("/")[2]
        destination_blob_name = f"{target_dir}/{file_name}"
        Logger.info("Moving file %s --> %s", file, destination_blob_name)
        self.gcp_operations.move_blob(file, destination_blob_name)

Example #22

0

Show file

File: telugu_sanitizer.py Project: akbatra567/audio-to-speech-pipeline

import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer,
)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError,
)
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("TeluguTranscriptionSanitizer")


class TeluguSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ం-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హా-ౄె-ైొ-్ౠ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        # Factory hook; keyword arguments are accepted but not used.
        return TeluguSanitizer()

    def __init__(self, *args, **kwargs):
        # Accepts and ignores any arguments; no state is initialised.
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        # Trim surrounding whitespace before character replacement ...
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        # ... and again afterwards, since replacement may expose new edges.
        transcription = transcription.strip()

Example #23

0

Show file

import multiprocessing
import os
import yaml
import shutil, glob
from os import listdir
from os.path import isfile, join
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor
import datetime
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("GCS Operations")


class CloudStorageOperations:
    @staticmethod
    def get_instance(config_dict, **kwargs):
        # Factory hook: forwards every argument unchanged to the constructor.
        gcs_instance = CloudStorageOperations(config_dict, **kwargs)
        return gcs_instance

    def __init__(self, config_dict, **kwargs):
        self.config_dict = config_dict
        # Both slots start empty; the client is filled in by the ``client``
        # property on first access.
        self._bucket = None
        self._client = None

    @property
    def client(self):
        # Reuse the cached client when one has already been created.
        if self._client:
            return self._client

        # First access: create the storage client and cache it.
        self._client = storage.Client()

Example #24

0

Show file

import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer,
)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError,
)
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("AssameseTranscriptionSanitizer")


class AssameseSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ঁ-ঃঅ-ঋএ-ঐও-চচ-নপ-যলশ-হা-ৃে-ৈো-ৎৗড়-ঢ়য়-ৠৰ-ৱ৺]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        # Factory hook; keyword arguments are accepted but not used.
        return AssameseSanitizer()

    def __init__(self, *args, **kwargs):
        # Accepts and ignores any arguments; no state is initialised.
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        # Trim surrounding whitespace before character replacement ...
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        # ... and again afterwards, since replacement may expose new edges.
        transcription = transcription.strip()

Example #25

0

Show file

    AUDIO_LANGUAGE,
)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    get_transcription_sanitizers, )
from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import (
    AzureTranscriptionClientError,
    GoogleTranscriptionClientError,
)
from ekstep_data_pipelines.common.file_utils import get_file_name
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor
import os

LOGGER = get_logger("audio_transcription")


class AudioTranscription(BaseProcessor):
    # Class-level default; no local path is set at definition time.
    LOCAL_PATH = None

    @staticmethod
    def get_instance(data_processor, gcs_instance, audio_commons,
                     catalogue_dao, **kwargs):
        # Factory hook: forwards every argument unchanged to the constructor.
        return AudioTranscription(data_processor, gcs_instance, audio_commons,
                                  catalogue_dao, **kwargs)

    def __init__(self, data_processor, gcs_instance, audio_commons,
                 catalogue_dao, **kwargs):
        # Collaborators are stored as given.
        self.data_processor = data_processor
        self.gcs_instance = gcs_instance

Example #26

0

Show file

    MAX_LOAD_DATE_FOR_MEDIA_QUERY,
    INSERT_INTO_MEDIA_TABLE_QUERY,
    INSERT_UNIQUE_SPEAKER_QUERY,
    GET_AUDIO_ID_QUERY,
    DEFULT_QUERY_FOR_INSERT_INTO_MAPPING_TABLE,
    GET_SPEAKER_ID_QUERY,
    FETCH_QUERY_WHERE_SPEAKER_IS_NULL,
    DEFAULT_INSERT_QUERY,
    DEFAULT_UPDATE_QUERY_FOR_NORMALIZED_FLAG,
    GET_LOAD_TIME_FOR_AUDIO_QUERY,
    GET_UTTERANCES_LIST_OF_AUDIO_ID,
)
from ekstep_data_pipelines.common import BaseProcessor
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("Audio_cataloguer")


class AudioCataloguer(BaseProcessor):
    """
    Processor for the audio cataloguer step; holds the data processor it
    is constructed with.
    """
    @staticmethod
    def get_instance(data_processor):
        # Factory hook: forwards the argument unchanged to the constructor.
        return AudioCataloguer(data_processor)

    def __init__(self, data_processor):

        # Stored as given for later use by the instance.
        self.data_processor = data_processor

    def process(self, **kwargs):

Example #27

0

Show file

File: generate_hash.py Project: akbatra567/audio-to-speech-pipeline

import hashlib

from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("hash_code")


def get_hash_code_of_audio_file(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in fixed-size chunks, so memory use stays bounded for
    large audio files, and the handle is always closed.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024
    md5_hash = hashlib.md5()
    with open(file_path, "rb") as audio_file:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: audio_file.read(chunk_size), b""):
            md5_hash.update(chunk)
    digest = md5_hash.hexdigest()
    LOGGER.info("Given file is %s and hash is %s", file_path, digest)
    return digest

Example #28

0

Show file

File: hindi_sanitizer.py Project: TW-Speech/audio-to-speech-pipeline

import re
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("HindiTranscriptionSanitizer")


class HindiSanitizer(BaseTranscriptionSanitizer):

    VALID_CHARS = "[ ँ-ःअ-ऋए-ऑओ-नप-रलव-ह़ा-ृे-ॉो-्0-9क़-य़ ॅ]"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Factory hook; keyword arguments are accepted but not used."""
        return HindiSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no state is initialised here."""
        pass

    def sanitize(self, transcription: str):
        # Pass the text as a lazy logging argument instead of concatenating,
        # so the message is only formatted when INFO logging is enabled.
        LOGGER.info("Sanitizing transcription:%s", transcription)
        # Remove whitespace from the start and end of the transcription.
        transcription = transcription.strip()

        # A colon anywhere in the text makes the transcription invalid.
        if ":" in transcription:
            raise TranscriptionSanitizationError("transcription has :")

Example #29

0

Show file

from ekstep_data_pipelines.audio_analysis.audio_embeddings.gender_inference import (
    load_model,
    get_prediction_from_npz_file,
)
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("analyse_speakers")


def analyse_gender(embed_file_path):
    """Return the gender predictions for the embeddings in ``embed_file_path``.

    The classifier is loaded from a path relative to the current working
    directory, and the result of ``get_prediction_from_npz_file`` is
    returned unchanged.
    """
    Logger.info("Start analyse gender")
    model_path = "ekstep_data_pipelines/audio_analysis/models/clf_svc.sav"
    classifier = load_model(model_path)
    return get_prediction_from_npz_file(classifier, embed_file_path)

Example #30

0

Show file

File: audio_duration.py Project: akbatra567/audio-to-speech-pipeline

import librosa
import sox

from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("audio_duration")


def calculate_duration(input_filepath):
    """Return the duration that ``sox`` reports for ``input_filepath``.

    The value is logged and returned unchanged.
    """
    reported = sox.file_info.duration(input_filepath)
    LOGGER.info(
        "Duration for input_filepath:%s : %s",
        input_filepath,
        str(reported),
    )
    return reported


def calculate_duration_librosa(input_filepath):
    """Return the duration of ``input_filepath`` as computed by librosa.

    The audio is decoded with ``librosa.load`` and the duration is derived
    from the loaded samples together with the sample rate that ``load``
    returned.
    """
    y, sr = librosa.load(input_filepath)
    # Keyword arguments work whether or not the installed librosa accepts
    # ``y`` positionally; passing ``sr`` ties the result to the rate the
    # samples were actually loaded at rather than get_duration's default.
    return librosa.get_duration(y=y, sr=sr)