from ekstep_data_pipelines.common.file_system.gcp_file_systen import GCPFileSystem
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor

from ekstep_data_pipelines.data_marker.constants import (
    CONFIG_NAME,
    FILTER_CRITERIA,
    LANDING_BASE_PATH,
    SOURCE_BASE_PATH,
)
from ekstep_data_pipelines.data_marker.data_filter import DataFilter
from ekstep_data_pipelines.data_marker.data_mover import MediaFilesMover

ESTIMATED_CPU_SHARE = 0.02

Logger = get_logger("Data marker")


class DataMarker(BaseProcessor):
    """
    1. Load Configuration
    2. Filter data based on criteria
    3. Tag/Mark data in the DB
    4. Move marked data
    """
    @staticmethod
    def get_instance(data_processor_instance, gcs_instance, **kwargs):
        """Factory that builds a DataMarker from the given collaborators.

        ``data_processor_instance`` is forwarded as the constructor's first
        positional argument (named ``postgres_client`` in ``__init__``);
        ``gcs_instance`` and any keyword arguments are passed through as-is.
        """
        return DataMarker(data_processor_instance, gcs_instance, **kwargs)

    def __init__(self, postgres_client, gcs_instance, **kwargs):
        self.postgres_client = postgres_client
Example #2
0
import pandas as pd

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("DataFilter")


class DataFilter(object):
    def exclude_audio_ids(self, utterances, audio_ids):
        """Lazily drop utterances whose first field is listed in ``audio_ids``.

        Returns a ``filter`` iterator over ``utterances``; nothing is evaluated
        until the caller iterates it.
        """

        def is_not_excluded(utterance):
            # Index 0 of each utterance record is compared against the ids.
            return utterance[0] not in audio_ids

        return filter(is_not_excluded, utterances)

    def exclude_speaker_ids(self, utterances, speaker_ids):
        """Lazily drop utterances whose first field is listed in ``speaker_ids``.

        Returns a ``filter`` iterator over ``utterances``.
        """

        def is_not_excluded(utterance):
            # NOTE(review): this tests index 0, the same position that
            # exclude_audio_ids tests -- confirm index 0 really holds the
            # speaker id for the records passed here.
            return utterance[0] not in speaker_ids

        return filter(is_not_excluded, utterances)

    def by_utterance_duration(self, utterances, filters):
        """Lazily keep utterances whose field at index 2 is within bounds.

        ``filters`` must provide ``"gte"`` and ``"lte"``; both bounds are
        inclusive. Returns a ``filter`` iterator over ``utterances``.
        """

        def within_bounds(utterance):
            return filters["lte"] >= utterance[2] >= filters["gte"]

        return filter(within_bounds, utterances)

    def by_snr(self, utterances, filters):
        """Lazily keep utterances whose field at index 4 is within bounds.

        ``filters`` must provide ``"gte"`` and ``"lte"``; both bounds are
        inclusive. Returns a ``filter`` iterator over ``utterances``.
        """

        def within_bounds(utterance):
            return filters["lte"] >= utterance[4] >= filters["gte"]

        return filter(within_bounds, utterances)

    def by_duration(
        self,
        utterances,
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )

from ekstep_data_pipelines.common.utils import get_logger
import re

LOGGER = get_logger("PunjabiSanitizer")


class PunjabiSanitizer(BaseTranscriptionSanitizer):

    VALID_CHARS = "[  ਼ ਂ ੍ੑ ਾ ਿ ੀ ੁ ੂ ੇ ੈ ੋੰੱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲਲ਼ਵਸ਼ਸਹਖ਼-ੜਫ਼]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new PunjabiSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return PunjabiSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:" + transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()
Example #4
0
from ekstep_data_pipelines.audio_processing.generate_hash import (
    get_hash_code_of_audio_file, )
from ekstep_data_pipelines.audio_processing.constants import (
    CONFIG_NAME,
    REMOTE_RAW_FILE,
    CHUNKING_CONFIG,
    SNR_CONFIG,
    REMOTE_PROCESSED_FILE_PATH,
    MASTER_META_DATA_FILE_PATH,
    SNR_DONE_FOLDER_PATH,
    DUPLICATE_AUDIO_FOLDER_PATH,
)
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor

Logger = get_logger("Audio Processor")


class AudioProcessor(BaseProcessor):
    """
    Class for breaking a downloaded file into smaller chunks of
    audio files as well as filtering out files with more than an acceptable level
    of Signal to Noise Ratio (or SNR)
    """

    DEFAULT_DOWNLOAD_PATH = "/tmp/audio_processing_raw"

    @staticmethod
    def get_instance(data_processor, gcs_instance, audio_commons,
                     catalogue_dao, **kwargs):
        return AudioProcessor(data_processor, gcs_instance, audio_commons,
import os

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums
from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import (
    GoogleTranscriptionClientError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("GoogleTranscriptionClient")


class GoogleTranscriptionClient(object):
    """Configuration holder for transcribing audio with Google Cloud Speech."""

    @staticmethod
    def get_instance(config_dict):
        """Build a client from ``config_dict["common"]["google_transcription_client"]``.

        Either level may be missing; an empty mapping is used instead, so the
        defaults applied in ``__init__`` take effect.
        """
        google_config_dict = config_dict.get("common", {}).get(
            "google_transcription_client", {})
        return GoogleTranscriptionClient(**google_config_dict)

    def __init__(self, **config_dict):
        """Read client settings from keyword arguments.

        Recognised keys: ``language`` (default ``"hi-IN"``), ``sample_rate``
        (default ``16000``), ``audio_channel_count`` (default ``1``) and
        ``bucket`` (default ``None``).
        """
        self.language = config_dict.get("language", "hi-IN")
        self.sample_rate = config_dict.get("sample_rate", 16000)
        self.channels = config_dict.get("audio_channel_count", 1)
        self.bucket = config_dict.get("bucket")
        # Starts unset; nothing in the visible code assigns it.
        self._client = None

    def make_directories(self, path):
        """Create ``path`` (including parents) when it does not exist yet.

        Nothing is done, and nothing is logged, when ``path`` already exists.
        """
        if not os.path.exists(path):
            # The logger object itself is not callable; go through .info().
            LOGGER.info("Directory %s does not already exist", path)
            # exist_ok guards against another worker creating the directory
            # between the check above and this call.
            os.makedirs(path, exist_ok=True)
            LOGGER.info("Directory %s created successfully", path)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )

from ekstep_data_pipelines.common.utils import get_logger
import re

LOGGER = get_logger("KannadaTranscriptionSanitizer")


class KannadaSanitizer(BaseTranscriptionSanitizer):

    VALID_CHARS = "[ ಂ-ಃಅ-ಋಎ-ಐಒ-ನಪ-ರಲ-ಳವ-ಹಾ-ೄೆ-ೈೊ-್ೲ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new KannadaSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return KannadaSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:" + transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.audio_analysis.speaker_analysis.speaker_clustering import (
    create_speaker_clusters, )
from ekstep_data_pipelines.audio_analysis.speaker_analysis.file_cluster_mapping import (
    speaker_to_file_name_map, )

Logger = get_logger("AudioSpeakerClusteringProcessor")


def create_embeddings(
    local_audio_download_path,
    dir_pattern,
    embed_file_path,
    fs_interface,
    npz_bucket_destination_path,
    source_name,
):
    """Upload the embedding file, cluster speakers and log the speaker count.

    The file at ``embed_file_path`` is uploaded through ``fs_interface`` to
    ``npz_bucket_destination_path`` and the outcome is logged. Clustering is
    then run on the same local file regardless of whether the upload worked.

    ``local_audio_download_path`` and ``dir_pattern`` are not used in this
    body, and nothing is returned.
    """
    upload_succeeded = fs_interface.upload_to_location(
        embed_file_path, npz_bucket_destination_path)
    if not upload_succeeded:
        Logger.info("npz file could not be uploaded to :" +
                    npz_bucket_destination_path)
    else:
        Logger.info("npz file uploaded to :" + npz_bucket_destination_path)

    # The second returned mapping (noise files) is not consumed here.
    file_map_dict, _noise_file_map_dict = create_speaker_clusters(
        embed_file_path, source_name)
    speaker_count = len(speaker_to_file_name_map(file_map_dict))
    Logger.info("total speakers:" + str(speaker_count))
Example #8
0
import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("MalayalamTranscriptionSanitizer")


class MalayalamSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ം-ഃഅ-ഋഎ-ഐഒ-നപ-ഺാ-ൃെ-ൈൊ-്ൺ-ൾ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new MalayalamSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return MalayalamSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()

        if len(transcription) == 0:
Example #9
0
# import signal
import sys
import multiprocessing
import os

from concurrent.futures import ThreadPoolExecutor

from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor
from ekstep_data_pipelines.audio_embedding.create_embeddings import (
    encode_each_batch)

LOGGER = get_logger("AudioEmbeddingProcessor")

ESTIMATED_CPU_SHARE = 0.1


class AudioEmbedding(BaseProcessor):
    """
    Class to identify speaker for each utterance in a source
    """

    local_txt_path = "./audio_speaker_cluster/file_path/"
    local_audio_path = "./audio_speaker_cluster/audio_files/"
    embed_file_path = "./audio_speaker_cluster/embed_file_path/"

    @staticmethod
    def get_instance(data_processor, **kwargs):
        """Factory that builds an AudioEmbedding.

        ``data_processor`` and all keyword arguments are passed straight to
        the constructor.
        """
        return AudioEmbedding(data_processor, **kwargs)

    def __init__(self, data_processor, **kwargs):
Example #10
0
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )

from ekstep_data_pipelines.common.utils import get_logger
import re

LOGGER = get_logger("IndianEnglishSanitizer")


class IndianEnglishSanitizer(BaseTranscriptionSanitizer):

    # Character class of accepted characters: space, ASCII letters, digits
    # and the apostrophe.
    VALID_CHARS = "[ a-zA-Z0-9']"
    # Punctuation characters. The final character is the danda "।"; it had
    # been stored as the three Cyrillic letters produced when its UTF-8 bytes
    # are mis-decoded. The apostrophe is deliberately absent because it is
    # listed in VALID_CHARS.
    PUNCTUATION = '!"#%&()*+,./;<=>?@[\\]^_`{|}~।'

    @staticmethod
    def get_instance(**kwargs):
        """Return a new IndianEnglishSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return IndianEnglishSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:" + transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()
Example #11
0
import pandas as pd

from ekstep_data_pipelines.common.utils import get_logger

from sqlalchemy import text

from ekstep_data_pipelines.common.dao.constants import (
    GET_UNIQUE_ID,
    IS_EXIST,
    COMMAND_WITH_LICENSE,
    COMMAND_WITHOUT_LICENSE,
    LICENSE,
)

LOGGER = get_logger("CatalogueDao")


class CatalogueDao:
    def __init__(self, postgres_client):
        """Keep the client used by this DAO to run its queries."""
        self.postgres_client = postgres_client

    def get_utterances(self, audio_id):
        parm_dict = {"audio_id": audio_id}
        utterances = self.postgres_client.execute_query(
            "select utterances_files_list from media_metadata_staging where audio_id = :audio_id",
            **parm_dict,
        )
        return json.loads(utterances[0][0]) if len(utterances) > 0 else []

    def get_utterances_by_source(self, source, status):
Example #12
0
import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("UrduTranscriptionSanitizer")


class UrduSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ء-آؤئ-بت-غف-قل-نؤٹپچڈڑژکگںھہیے-ۓ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new UrduSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return UrduSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()

        if len(transcription) == 0:
Example #13
0
import json
import os
import shutil
import subprocess

import pandas as pd
from ekstep_data_pipelines.audio_language_identification.audio_language_inference import (
    infer_language,
)
from ekstep_data_pipelines.audio_processing.audio_duration import calculate_duration
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("Snr")


class SNR:
    """
    Util object for performing SNR analysis over different
    """

    MAX_DURATION = 15

    @staticmethod
    def get_instance(initialization_dict):
        feat_language_identification = initialization_dict.get(
            "audio_processor_config", {}
        ).get("feat_language_identification", False)
        LOGGER.info(
            "Running with feat_language_identification=%s",
            str(feat_language_identification),
        )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("GujratiTranscriptionSanitizer")


class GujratiSanitizer(BaseTranscriptionSanitizer):
    """Transcription sanitizer entry for Gujarati.

    NOTE(review): ``sanitize`` has an empty body, so it returns ``None`` for
    every input -- confirm whether this is an intentional placeholder.
    """

    @staticmethod
    def get_instance(**kwargs):
        """Return a new GujratiSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return GujratiSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        """Do nothing with ``transcription``; implicitly returns ``None``."""
        pass
from concurrent.futures import ThreadPoolExecutor

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("MediaFilesMover")


class MediaFilesMover(object):
    """Moves media files to a landing location using a pool of worker threads."""

    def __init__(self, file_system, concurrency):
        """Store the file system used for moves and the worker-thread count.

        ``file_system`` must expose ``mv_file(file, target_dir)``;
        ``concurrency`` is passed to the thread pool as ``max_workers``.
        """
        self.file_system = file_system
        self.concurrency = concurrency

    def move_media_files(self, files, landing_path_with_source):
        """Move every path in ``files`` under ``landing_path_with_source``.

        For each file, the two path segments immediately before the file name
        are appended to ``landing_path_with_source`` to form the target
        directory. The call returns once all moves have finished. A move that
        raises is logged and does not stop the remaining moves.
        """
        Logger.info("using concurrency:%s", str(self.concurrency))
        submitted = []
        # The context manager waits for all workers and releases the pool even
        # if building a path or submitting a task raises part-way through.
        with ThreadPoolExecutor(max_workers=self.concurrency) as worker_pool:
            for file in files:
                relative_audio_id_clean_path = "/".join(file.split("/")[-3:-1])
                landing_path = f"{landing_path_with_source}/{relative_audio_id_clean_path}"
                future = worker_pool.submit(
                    self.file_system.mv_file, file, landing_path)
                submitted.append((file, future))

        # Exceptions raised inside workers are otherwise lost; surface them in
        # the log while keeping the best-effort behaviour.
        for file, future in submitted:
            error = future.exception()
            if error is not None:
                Logger.error("Failed to move file %s: %s", file, error)
Example #16
0
import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("BengaliTranscriptionSanitizer")


class BengaliSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ঁ-ঃঅ-ঋএ-ঐও-নপ-রলশ-হ়া-্ে-ৈো-ৎয়]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new BengaliSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return BengaliSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()

        if len(transcription) == 0:
Example #17
0
import glob
import os
import subprocess
import collections
import contextlib
import sys
import wave
import webrtcvad
import sox

from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("Chunking Util")


class ChunkingConversionUtil:

    re_chunking_aggressiveness = 3

    @staticmethod
    def get_instance():
        """Return a new ChunkingConversionUtil built with no arguments."""
        return ChunkingConversionUtil()

    def convert_to_wav(self, input_dir, output_dir=None, ext="mp4"):

        Logger.info(f"Convert all the files in {input_dir} to wav")
        audio_paths = glob.glob(input_dir + "/*." + ext)

        Logger.info(f"Files to be completed: {audio_paths}")

        if len(audio_paths) < 1:
Example #18
0

class ACTIONS:
    """Namespace of string identifiers for the pipeline actions."""

    DATA_MARKING = "data_marking"
    AUDIO_PROCESSING = "audio_processing"
    AUDIO_TRANSCRIPTION = "audio_transcription"
    AUDIO_ANALYSIS = "audio_analysis"
    AUDIO_CATALOGUER = "audio_cataloguer"


class FILE_SYSTEMS:
    """Namespace of string identifiers for the file-system options."""

    GOOGLE = "google"
    LOCAL = "local"


LOGGER = get_logger("EKSTEP_PROCESSOR")
ACTIONS_LIST = [
    ACTIONS.DATA_MARKING,
    ACTIONS.AUDIO_PROCESSING,
    ACTIONS.AUDIO_TRANSCRIPTION,
    ACTIONS.AUDIO_ANALYSIS,
    ACTIONS.AUDIO_CATALOGUER,
]
FILES_SYSTEMS_LIST = [FILE_SYSTEMS.GOOGLE, FILE_SYSTEMS.LOCAL]
# config_bucket = 'ekstepspeechrecognition-dev'

parser = argparse.ArgumentParser(
    description="Util for data processing for EkStep")

parser.add_argument(
    "-b",
Example #19
0
from azure.cognitiveservices import speech
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import (
    AzureTranscriptionClientError, )

LOGGER = get_logger("AzureTranscriptionClient")


class AzureTranscriptionClient(object):
    @staticmethod
    def get_instance(config_dict):
        """Build a client from ``config_dict["common"]["azure_transcription_client"]``.

        Either level may be missing; an empty mapping is used in its place.
        """
        common_section = config_dict.get("common", {})
        client_settings = common_section.get("azure_transcription_client", {})
        return AzureTranscriptionClient(**client_settings)

    def __init__(self, **kwargs):
        """Read credentials and language, then build the speech configuration.

        Recognised keys: ``speech_key``, ``service_region`` and ``language``
        (default ``"hi-IN"``).
        """
        subscription_key = kwargs.get("speech_key")
        region = kwargs.get("service_region")

        self.speech_key = subscription_key
        self.service_region = region
        self.language = kwargs.get("language", "hi-IN")
        self.speech_config = speech.SpeechConfig(
            subscription=subscription_key, region=region)

    def generate_transcription(self, language, source_file_path):
        """Transcribe ``source_file_path`` and return the recognised text.

        ``language`` is accepted but not used in this method.

        Raises:
            AzureTranscriptionClientError: when ``speech_to_text`` raises a
                ``RuntimeError``; the original error is attached as the cause.
        """
        try:
            result = self.speech_to_text(source_file_path)
        except RuntimeError as error:
            # Chain explicitly so the traceback marks the RuntimeError as the
            # direct cause rather than an error raised while handling another.
            raise AzureTranscriptionClientError(error) from error
        return result.text

    def speech_to_text(self, audio_file_path):
        audio_input = speech.audio.AudioConfig(filename=audio_file_path)
Example #20
0
import os
from os import listdir
from os.path import isfile, join
from google.cloud import storage
from ekstep_data_pipelines.common.infra_commons.storage import BaseStorageInterface
from concurrent.futures import ThreadPoolExecutor
from ekstep_data_pipelines.common.infra_commons.storage.exceptions import (
    FileNotFoundException,
    PathDoesNotExist,
)
from ekstep_data_pipelines.common.utils import get_logger
from tqdm import tqdm

Logger = get_logger("GoogleStorage")


class GoogleStorage(BaseStorageInterface):
    def __init__(self, **kwargs):
        """Start with no client; keyword arguments are accepted but unused here."""
        self._client = None

    def get_bucket_from_path(self, path) -> str:
        """Return the first non-empty ``/``-separated segment of ``path``.

        ``None`` is returned when ``path`` is falsy or consists only of
        slashes.
        """
        if not path:
            return None

        # Empty segments come from leading, trailing or doubled slashes.
        segments = [segment for segment in path.split("/") if segment]
        return segments[0] if segments else None
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("GCPFileSystem")


class GCPFileSystem:
    """File-system style helpers (list, move) on top of a storage-operations object."""

    def __init__(self, gcp_operations):
        """Keep the operations object that performs the actual storage calls."""
        self.gcp_operations = gcp_operations

    def ls(self, dir_path):
        """Return the ``name`` of every blob listed under ``dir_path``."""
        paths = self.gcp_operations.list_blobs_in_a_path(dir_path)
        return [path.name for path in paths]

    def mv(self, source_dir, target_dir, is_dir=True):
        """Move every file listed under ``source_dir`` into ``target_dir``.

        When ``is_dir`` is true and the existence check fails, this logs and
        returns without moving anything.
        """
        # NOTE(review): ``self`` is passed explicitly to what is already a
        # bound method, so the callee receives this GCPFileSystem as its first
        # argument -- confirm against check_path_exists' signature.
        if is_dir and not self.gcp_operations.check_path_exists(
                self, source_dir):
            Logger.info("source dir does not exist:%s", source_dir)
            return

        files = self.ls(source_dir)
        for file in files:
            self.mv_file(file, target_dir)

    def mv_file(self, file, target_dir):
        """Move ``file`` so that it keeps its name but lives under ``target_dir``."""
        # Take only the last path segment. Substituting the directory text
        # with str.replace also rewrote matches inside the file name, and for
        # a path without "/" inserted the target between every character.
        file_name = file.rpartition("/")[2]
        destination_blob_name = f"{target_dir}/{file_name}"
        Logger.info("Moving file %s --> %s", file, destination_blob_name)
        self.gcp_operations.move_blob(file, destination_blob_name)
import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer,
)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError,
)
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("TeluguTranscriptionSanitizer")


class TeluguSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ం-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హా-ౄె-ైొ-్ౠ]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new TeluguSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return TeluguSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()
Example #23
0
import multiprocessing
import os
import yaml
import shutil, glob
from os import listdir
from os.path import isfile, join
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor
import datetime
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("GCS Operations")


class CloudStorageOperations:
    @staticmethod
    def get_instance(config_dict, **kwargs):
        gcs_instance = CloudStorageOperations(config_dict, **kwargs)
        return gcs_instance

    def __init__(self, config_dict, **kwargs):
        self.config_dict = config_dict
        self._bucket = None
        self._client = None

    @property
    def client(self):
        if self._client:
            return self._client

        self._client = storage.Client()
Example #24
0
import re

from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer,
)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError,
)
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("AssameseTranscriptionSanitizer")


class AssameseSanitizer(BaseTranscriptionSanitizer):
    VALID_CHARS = "[ ঁ-ঃঅ-ঋএ-ঐও-চচ-নপ-যলশ-হা-ৃে-ৈো-ৎৗড়-ঢ়য়-ৠৰ-ৱ৺]+"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new AssameseSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return AssameseSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription):
        LOGGER.info("Sanitizing transcription:%s", transcription)
        transcription = transcription.strip()

        transcription = self.replace_bad_char(transcription)

        transcription = transcription.strip()
Example #25
0
    AUDIO_LANGUAGE,
)
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    get_transcription_sanitizers, )
from ekstep_data_pipelines.common.audio_commons.transcription_clients.transcription_client_errors import (
    AzureTranscriptionClientError,
    GoogleTranscriptionClientError,
)
from ekstep_data_pipelines.common.file_utils import get_file_name
from ekstep_data_pipelines.common.utils import get_logger
from ekstep_data_pipelines.common import BaseProcessor
import os

LOGGER = get_logger("audio_transcription")


class AudioTranscription(BaseProcessor):
    LOCAL_PATH = None

    @staticmethod
    def get_instance(data_processor, gcs_instance, audio_commons,
                     catalogue_dao, **kwargs):
        """Factory that builds an AudioTranscription.

        All positional and keyword arguments are passed to the constructor
        unchanged and in the same order.
        """
        return AudioTranscription(data_processor, gcs_instance, audio_commons,
                                  catalogue_dao, **kwargs)

    def __init__(self, data_processor, gcs_instance, audio_commons,
                 catalogue_dao, **kwargs):
        self.data_processor = data_processor
        self.gcs_instance = gcs_instance
Example #26
0
    MAX_LOAD_DATE_FOR_MEDIA_QUERY,
    INSERT_INTO_MEDIA_TABLE_QUERY,
    INSERT_UNIQUE_SPEAKER_QUERY,
    GET_AUDIO_ID_QUERY,
    DEFULT_QUERY_FOR_INSERT_INTO_MAPPING_TABLE,
    GET_SPEAKER_ID_QUERY,
    FETCH_QUERY_WHERE_SPEAKER_IS_NULL,
    DEFAULT_INSERT_QUERY,
    DEFAULT_UPDATE_QUERY_FOR_NORMALIZED_FLAG,
    GET_LOAD_TIME_FOR_AUDIO_QUERY,
    GET_UTTERANCES_LIST_OF_AUDIO_ID,
)
from ekstep_data_pipelines.common import BaseProcessor
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("Audio_cataloguer")


class AudioCataloguer(BaseProcessor):
    """
    docstring cataloguer
    """
    @staticmethod
    def get_instance(data_processor):
        """Factory that builds an AudioCataloguer around ``data_processor``."""
        return AudioCataloguer(data_processor)

    def __init__(self, data_processor):
        """Keep ``data_processor`` on the instance for later use."""

        self.data_processor = data_processor

    def process(self, **kwargs):
import hashlib

from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("hash_code")


def get_hash_code_of_audio_file(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in binary mode in fixed-size chunks, so large files are
    not loaded into memory at once, and the handle is closed even if reading
    fails. The path and digest are logged before returning.
    """
    md5_hash = hashlib.md5()
    with open(file_path, "rb") as audio_file:
        # iter() with a sentinel stops at the empty bytes read() gives at EOF.
        for chunk in iter(lambda: audio_file.read(1024 * 1024), b""):
            md5_hash.update(chunk)
    digest = md5_hash.hexdigest()
    LOGGER.info("Given file is %s and hash is %s", file_path, digest)
    return digest
import re
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers import (
    BaseTranscriptionSanitizer, )
from ekstep_data_pipelines.audio_transcription.transcription_sanitizers.audio_transcription_errors import (
    TranscriptionSanitizationError, )
from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("HindiTranscriptionSanitizer")


class HindiSanitizer(BaseTranscriptionSanitizer):

    VALID_CHARS = "[ ँ-ःअ-ऋए-ऑओ-नप-रलव-ह़ा-ृे-ॉो-्0-9क़-य़ ॅ]"
    PUNCTUATION = "!\"#%&'()*+,./;<=>?@[\\]^_`{|}~।"

    @staticmethod
    def get_instance(**kwargs):
        """Return a new HindiSanitizer.

        Keyword arguments are accepted but not forwarded to the constructor.
        """
        return HindiSanitizer()

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments; no instance state is set up."""
        pass

    def sanitize(self, transcription: str):
        LOGGER.info("Sanitizing transcription:" + transcription)
        transcription = (
            transcription.strip()
        )  # removes spaces from the starting and ending of transcription

        if ":" in transcription:
            raise TranscriptionSanitizationError("transcription has :")
Example #29
0
from ekstep_data_pipelines.audio_analysis.audio_embeddings.gender_inference import (
    load_model,
    get_prediction_from_npz_file,
)
from ekstep_data_pipelines.common.utils import get_logger

Logger = get_logger("analyse_speakers")


def analyse_gender(embed_file_path):
    """Run the gender classifier over the embeddings in ``embed_file_path``.

    Loads the model from the fixed path
    ``ekstep_data_pipelines/audio_analysis/models/clf_svc.sav`` and returns
    whatever ``get_prediction_from_npz_file`` produces for that model and file.
    """
    Logger.info("Start analyse gender")
    model_path = "ekstep_data_pipelines/audio_analysis/models/clf_svc.sav"
    classifier = load_model(model_path)
    return get_prediction_from_npz_file(classifier, embed_file_path)
import librosa
import sox

from ekstep_data_pipelines.common.utils import get_logger

LOGGER = get_logger("audio_duration")


def calculate_duration(input_filepath):
    """Return the duration ``sox`` reports for ``input_filepath`` and log it."""
    file_duration = sox.file_info.duration(input_filepath)
    LOGGER.info(
        "Duration for input_filepath:%s : %s",
        input_filepath,
        str(file_duration),
    )
    return file_duration


def calculate_duration_librosa(input_filepath):
    """Return the duration of ``input_filepath`` as computed by librosa.

    The audio is loaded with librosa's default settings and the duration is
    derived from the loaded samples and the sample rate returned with them.
    """
    y, sr = librosa.load(input_filepath)
    # Pass both by keyword: librosa releases with keyword-only arguments
    # reject a positional ``y``, and the explicit ``sr`` ties the computation
    # to the rate the samples were actually loaded at.
    return librosa.get_duration(y=y, sr=sr)