def test_save_as(input_file, output_format, location):
    """
    Test saving of supported formats to locations

    GIVEN locations of current or specific folder
    WHEN writing transcript in any supported format
    THEN check output exists
    """

    logging.info("test_save_as")

    # Ensure the target folder for the "specific folder" case exists
    if not Path("output").is_dir():
        os.mkdir("output")

    # GIVEN locations of current or specific folder
    # Keep only the input's basename, then swap its suffix for the target format
    destination = Path(location) / Path(Path(input_file).parts[-1]).with_suffix(
        f".{output_format}"
    )

    # WHEN writing transcript in any supported format
    tscribe.write(input_file, format=output_format, save_as=destination)

    # THEN check output exists
    assert destination.is_file()

    # Teardown
    os.remove(destination)
def export_files(self):
    """
    Export all the resulted JSON file(s) as Word docx using Tscribe and
    archive the source files in 'Archive' folder.

    Reads every top-level object in the output bucket; for each `.json`
    object it downloads the file locally, converts non-empty transcripts
    to docx via tscribe, and archives the source object.

    Raises:
        botocore.exceptions.ClientError: re-raised after logging when any
            S3 operation fails.
    """
    try:
        # NOTE(review): basicConfig inside a method reconfigures logging on
        # every call (and is a no-op if logging was already configured) —
        # confirm this is intended rather than one-time setup.
        logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
        bucket = self.s3_resource.Bucket(self.output_bucket_name)
        archive_path = config['file_paths']['archive_path']
        # Delimiter='/' limits iteration to top-level keys (no "subfolder" objects).
        for obj in bucket.objects.filter(Delimiter='/'):
            obj_name, obj_extn = os.path.splitext(obj.key)
            if obj_extn == '.json':
                # Download to the local output path, then also read the body
                # directly to inspect the transcript content.
                self.s3_resource.meta.client.download_file(self.output_bucket_name, obj.key, os.path.join(self.output_path, obj.key))
                file_content = obj.get()['Body'].read().decode('utf-8')
                json_content = json.loads(file_content)
                # Only non-empty transcripts are worth rendering to docx.
                if json_content['results']['transcripts'][0]['transcript'] != "" :
                    json_file_path = os.path.join(self.output_path, obj.key)
                    save_as_path = os.path.join(self.output_path, obj_name +'.docx')
                    tscribe.write(json_file_path, format="docx", save_as= save_as_path)
                # NOTE(review): archiving runs for every .json object, even when
                # the transcript was empty — confirm that is the intended flow.
                self.archive_object(archive_path, '', '', obj.key)
    except ClientError:
        logger.exception("Failed to export files.")
        raise
def test_write_to_sqlite(input_file):
    """
    Test production of sqlite output

    GIVEN an input file
    WHEN writing to sqlite
    THEN check output exists and contains content
    """

    logging.info("test_write_to_sqlite")

    # GIVEN an input file
    # WHEN writing to sqlite
    db_path = Path(f"{uuid4().hex}.db")
    tscribe.write(input_file, save_as=db_path, format="sqlite")

    # THEN check output exists and contains content
    assert db_path.is_file(), "Output file should exist"

    connection = sqlite3.connect(str(db_path))
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM transcript")
    rows = cursor.fetchall()

    # Row count in the database must match the decoded transcript dataframe
    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert len(rows) == len(frame), "Database table should be length of dataframe"

    # Teardown
    os.remove(db_path)
def test_write_to_csv(input_file):
    """
    Test production of csv output

    GIVEN an input file
    WHEN writing to csv
    THEN check output exists and contains content
    """

    logging.info("test_write_to_csv")

    # GIVEN an input file
    # WHEN writing to csv
    csv_path = Path(f"{uuid4().hex}.csv")
    tscribe.write(input_file, save_as=csv_path, format="csv")

    # THEN check output exists and contains content
    assert csv_path.is_file(), "Output file should exist"

    with open(csv_path, "r") as handle:
        rows = handle.readlines()

    # One header row plus one row per transcript segment
    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert len(rows) == len(frame) + 1, "CSV should be length of dataframe + headers"

    # Teardown
    os.remove(csv_path)
def test_write_to_docx(input_file):
    """
    Test production of docx output

    GIVEN an input file
    WHEN writing to docx
    THEN check output exists and contains content
    """

    logging.info("test_write_to_docx")

    # GIVEN an input file
    # WHEN writing to docx
    docx_path = Path(f"{uuid4().hex}.docx")
    tscribe.write(input_file, save_as=docx_path, format="docx")

    # THEN check output exists and contains content
    assert docx_path.is_file(), "Output file should exist"

    doc = Document(docx_path)
    assert (
        len(doc.tables) == 2
    ), "Document should contain two tables, stats and transcript"

    # First table carries the confidence statistics
    stats_headers = tuple(doc.tables[0].cell(0, col).text for col in range(3))
    assert stats_headers == (
        "Confidence",
        "Count",
        "Percentage",
    ), "First table should be stats headers"
    assert len(doc.tables[0].rows) == 12, "Stats table should hold 12 rows"

    # Second table carries the transcript itself
    transcript_headers = tuple(doc.tables[1].cell(0, col).text for col in range(3))
    assert transcript_headers == (
        "Time",
        "Speaker",
        "Content",
    ), "Second table should be transcript headers"

    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert (
        len(doc.tables[1].rows) == len(frame) + 1
    ), "Second table should be length of dataframe + headers"

    # The confidence chart image is embedded in a fixed paragraph position
    assert (
        "chart.png" in doc.paragraphs[6]._p.xml
    ), "Chart should be in paragraph six"

    # Teardown
    os.remove(docx_path)
def handler(event, context):
    """
    Lambda entry point: start an Amazon Transcribe job for each uploaded
    S3 media object, wait for completion, then upload both the JSON
    transcript and a docx rendering back to S3.

    Args:
        event: S3 event notification containing one or more Records.
        context: Lambda context object (unused).

    Returns:
        "Complete" on success, False if an S3 upload/download raises ClientError.
    """
    transcribe = boto3.client('transcribe', region_name='eu-west-2')
    for record in event['Records']:
        x = datetime.datetime.now()
        s3_bucket = record['s3']['bucket']['name']
        s3_filekey = unquote_plus(record['s3']['object']['key'])
        # Timestamp keeps the Transcribe job name unique per invocation.
        job_name = "Transcribe-Video-" + x.strftime("%Y-%m-%d-%H-%M-%S")
        job_uri = "s3://" + s3_bucket + "/" + s3_filekey
        print(job_uri)
        transcribe.start_transcription_job(TranscriptionJobName=job_name,
                                           Media={'MediaFileUri': job_uri},
                                           MediaFormat='mp4',
                                           LanguageCode='en-US')
        # Poll every 5 seconds until the job reaches a terminal state.
        while True:
            status = transcribe.get_transcription_job(
                TranscriptionJobName=job_name)
            if status['TranscriptionJob']['TranscriptionJobStatus'] in [
                    'COMPLETED', 'FAILED'
            ]:
                break
            print("Transcribing...")
            time.sleep(5)
        s3_client = boto3.client('s3')
        try:
            # NOTE(review): a FAILED job still falls through to urlretrieve
            # below — the TranscriptFileUri may then be absent; confirm.
            local_json_file = '/tmp/local_saved_file'
            urllib.request.urlretrieve(
                status['TranscriptionJob']['Transcript']['TranscriptFileUri'],
                local_json_file)
            bucket = os.getenv('ENV_S3BUCKET')
            object_json_name = s3_filekey + '-transcript.json'
            response = s3_client.upload_file(local_json_file, bucket,
                                             object_json_name)
            # convert json to docx
            object_docx_name = s3_filekey + "-transcript.docx"
            local_docx_file = '/tmp/' + object_docx_name
            tmp_dir = '/tmp/'
            tscribe.write(local_json_file, save_as=local_docx_file,
                          tmp_dir=tmp_dir)
            #upload docx to s3
            response = s3_client.upload_file(local_docx_file, bucket,
                                             object_docx_name)
        except ClientError as e:
            logging.error(e)
            return False
    # NOTE(review): placement after the loop assumed from the collapsed
    # source — confirm "Complete" is meant to be returned once, at the end.
    return "Complete"
def test_depricated_tmp_dir(input_file):
    """
    Test that using tmp_dir fails

    GIVEN an input file
    WHEN calling tscribe with tmp_dir
    THEN receive warning and fail
    """

    logging.info("test_deprecated_tmp_dir")

    # GIVEN an input file
    # WHEN calling tscribe with tmp_dir
    # THEN receive warning and fail
    tscribe.write(input_file, tmp_dir=".")
def test_unrecognised_output_format(input_file):
    """
    Test for exception when given unrecognised output format

    GIVEN an input file and an unrecognised output format
    WHEN calling tscribe.write(...)
    THEN xfail
    """

    # GIVEN an input file and an unrecognised output format
    bad_format = "unrecognised"

    # WHEN calling tscribe.write(...)
    # THEN xfail
    tscribe.write(input_file, format=bad_format)
def test_write_to_default(input_file):
    """
    Test production of default output

    GIVEN an input file
    WHEN not specifying output
    THEN check output is the default format
    """

    # GIVEN an input file
    # WHEN not specifying output
    tscribe.write(input_file)

    # Default behaviour writes a .docx alongside the input
    produced = Path(input_file.replace(".json", ".docx"))

    # THEN check output exists and contains content
    assert produced.is_file(), "Output file should exist"

    # Teardown
    os.remove(produced)
def replaceName(Speaker_IDs, speakers):
    """
    Replace Transcribe speaker labels with real names in the newest
    transcript JSON, then export the result as CSV.

    Args:
        Speaker_IDs: speaker label strings to replace (e.g. "spk_0").
        speakers: replacement names, positionally paired with Speaker_IDs.

    Side effects:
        Writes a renamed JSON copy under data/report/, converts it to CSV
        via tscribe, then removes the intermediate JSON.
    """
    # Obtain the latest json file by creation time
    json_files = glob.glob('data/transcribe/*.json')
    latest_json = max(json_files, key=os.path.getctime)

    # Replace speakers with Speaker_IDs in the json file
    with open(latest_json, 'r', encoding='utf-8') as f:
        json_file = json.load(f)

    txt = json.dumps(json_file)
    # zip pairs labels with names; extra unpaired entries are ignored
    for speaker_id, name in zip(Speaker_IDs, speakers):
        txt = txt.replace(speaker_id, name)
    print(txt)

    new_path = latest_json.replace('transcribe', 'report')
    # encoding matches the utf-8 read above; default platform encoding
    # could corrupt non-ASCII names on Windows
    with open(new_path, 'w', encoding='utf-8') as f:
        f.write(txt)

    tscribe.write(new_path, format="csv", save_as=new_path.replace('.json', '.csv'))
    # The renamed JSON is only an intermediate artifact
    os.remove(new_path)
def test_write_to_vtt(input_file):
    """
    Test production of vtt format

    GIVEN an input file
    WHEN writing to vtt
    THEN check output exists and contains content
    """

    logging.info("test_write_to_vtt")

    # GIVEN an input file
    # WHEN writing to vtt
    vtt_path = Path(f"{uuid4().hex}.vtt")
    tscribe.write(input_file, save_as=vtt_path, format="vtt")

    # THEN check output exists and contains content
    vtt = webvtt.read(vtt_path)
    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert len(vtt.captions) == len(
        frame
    ), "vtt file should have equal captions to df rows"

    for caption in vtt.captions:
        assert hasattr(caption, "start"), "each caption should have a start_time"
        assert hasattr(caption, "end"), "each caption should have a end_time"
        assert hasattr(caption, "text"), "each caption should have text"
        # Long captions get wrapped, so line count tracks text length / 80
        assert (
            len(caption.lines) >= len(caption.text) / 80
        ), "text should be split into max 80 long lines"
        # Single-speaker sample carries no speaker identifiers
        if input_file != "sample_single.json":
            assert hasattr(
                caption, "identifier"
            ), "each caption should have an identifier"

    # Teardown
    os.remove(vtt_path)
def test_single_speaker():
    """
    Test output exists with single speaker input

    # GIVEN a sample file containing single speaker
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx without errors
    """

    # Setup
    source = "sample_single.json"
    produced = "sample_single.docx"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function
    tscribe.write(source)
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown
    os.remove(produced)
    os.remove("chart.png")
def test_multiple_speakers():
    """
    Test output exists with multiple speaker input

    # GIVEN a sample file containing multiple speakers
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx without errors
    """

    # Setup
    source = "sample_material/03-speaker-identification.json"
    produced = "sample_material/03-speaker-identification.docx"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function
    tscribe.write(source)
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown — chart is produced next to the input file
    os.remove(produced)
    os.remove("sample_material/chart.png")
def test_multiple_speakers_with_save_as():
    """
    Test output exists with multiple speaker input, and save_as defined

    # GIVEN a sample file containing multiple speakers, and an output filename
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx, named correctly, without errors
    """

    # Setup
    source = "sample_multiple.json"
    produced = "test_sample.docx"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function
    tscribe.write(source, save_as=produced)
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown
    os.remove(produced)
    os.remove("chart.png")
def test_multiple_speakers_with_save_as_with_tmp_dir():
    """
    Test output exists with multiple speaker input, and save_as defined, and tmp_dir defined

    # GIVEN a sample file containing multiple speakers, and an output filename, and a writable tmp directory
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx, with a chart, named correctly, without errors
    """

    # Setup
    source = "sample_multiple.json"
    produced = "test_sample.docx"
    tmp_dir = "/tmp/"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function — chart should land in tmp_dir rather than the cwd
    tscribe.write(source, save_as=produced, tmp_dir=tmp_dir)
    assert os.access(tmp_dir + "chart.png", os.F_OK), "Chart file not found"
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown
    os.remove(produced)
    os.remove(tmp_dir + "chart.png")
def test_save_as(input_file, output_format, location):
    """
    Test saving of supported formats to locations

    GIVEN locations of current or specific folder
    WHEN writing transcript in any supported format
    THEN check output exists
    """

    # Ensure the "specific folder" target exists before writing
    if not Path("output").is_dir():
        os.mkdir("output")

    # GIVEN locations of current or specific folder
    destination = Path(location) / Path(
        input_file.replace(".json", f".{output_format}")
    )

    # WHEN writing transcript in any supported format
    tscribe.write(input_file, format=output_format, save_as=destination)

    # THEN check output exists
    assert destination.is_file()

    # Teardown
    os.remove(destination)
# Convert the transcript JSON named on the command line to "1.csv".
import sys

import tscribe

source_json = sys.argv[1]
tscribe.write(source_json, format="csv", save_as="1.csv")
# Walk a directory tree and convert every JSON transcript it contains to docx.
import os
import sys  # BUG FIX: sys.argv was used below but sys was never imported

import tscribe

numargs = len(sys.argv)
print("Number of arguments:" + str(numargs))
print(" arguments " + str(sys.argv))
if numargs > 1:
    # NOTE(review): usage_demo is not defined in this file — confirm it is
    # provided elsewhere at runtime.
    usage_demo(numargs, sys.argv)
else:
    print("must pass directory path for json files to walk")
    # sys.exit is the explicit form; bare exit() is a site-module convenience
    sys.exit()

json_base_directory = sys.argv[1]

# Convert each *.json transcript found anywhere under the base directory
for dirpath, dirnames, filenames in os.walk(json_base_directory):
    for filename in filenames:
        if "json" in filename:
            print("filename is " + filename)
            infile = dirpath + "/" + filename
            outfile = infile + ".docx"
            print("infile is " + infile)
            print("outfile is " + outfile)
            ret = tscribe.write(infile, save_as=outfile)
def lambda_handler(event, context):
    """
    Lambda entry point: fetch an uploaded transcript JSON from S3, convert
    it to per-row records via tscribe/pandas, count rows per speaker, prune
    minor speakers, and upload the resulting speaker-count JSON to the
    'surfboard-transcribe' bucket under overlap-transcript/.

    Args:
        event: S3 event notification (first Record is used).
        context: Lambda context object (unused).

    Returns:
        dict with statusCode 200 and a placeholder body.
    """
    s3 = boto3.client('s3')
    bucket = event['Records'][0]['s3']['bucket']['name']
    # NOTE(review): encoding='utf=8' looks like a typo for 'utf-8'; it only
    # bites when the key contains percent-escapes — confirm and fix upstream.
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf=8')
    response = s3.get_object(Bucket=bucket, Key=key)
    # Read Data of file directly from S3
    body = response['Body'].read()
    res_dict = json.loads(body.decode('utf-8'))
    jsonData = json.dumps(res_dict)
    localkey = os.path.basename(os.path.splitext(key)[0])
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        f.write(jsonData)
    # tscribe reads the local JSON and writes a CSV of transcript rows
    jsonFile = "/tmp/{}.json".format(localkey)
    csvFile = "/tmp/{}.csv".format(localkey)
    tscribe.write(jsonFile, format='csv', save_as=csvFile)
    df = pd.read_csv(csvFile)
    # Round-trip the dataframe through JSON records (overwrites jsonFile)
    jsonFile = "/tmp/{}.json".format(localkey)
    result = df.to_json(orient="records")
    parsed = json.loads(result)
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(parsed, f, indent=4)
    speakers = {}
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'r') as f:
        data = json.load(f)
    # Tally the number of rows attributed to each speaker label
    for i in range(len(data)):
        if data[i]["speaker"] not in speakers:
            speakers[data[i]["speaker"]] = 1
        else:
            speakers[data[i]["speaker"]] += 1
    maxCountSpeaker = max(speakers, key=speakers.get)
    remove_speakers = []
    # Speakers other than the dominant one: drop those with < 3 rows and
    # scale the remainder down by 3.
    for speaker, count in speakers.items():
        if speaker != maxCountSpeaker:
            if speakers[speaker] < 3:
                print(speaker, count)
                remove_speakers.append(speaker)
            # NOTE(review): nesting assumed from the collapsed source —
            # confirm this scaling applies to all non-dominant speakers.
            speakers[speaker] = int(count / 3)
    for speaker in remove_speakers:
        if speaker in speakers:
            del speakers[speaker]
    data = speakers
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(data, f, indent=4)
    bucket_transcribe = 'surfboard-transcribe'
    json_upload_file = 'overlap-transcript/{}.json'.format(localkey)
    s3.upload_file(jsonFile, bucket_transcribe, json_upload_file)
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
'does', 'until', 'her', 'if', "haven't", "shan't", 'she', 'myself', 'such', 'i', 'now', 'how', "it's", 'very', 'off', "that'll", 'too', 'through', "you'd", 'up', 'between', "isn't", 'be', 'again', 'where', 'shan', 'have', 'about', 'when', 'our', 'it', "don't", "aren't", 'didn', 'under', 'mustn', 'me', 'he', 'down', "wouldn't", 'couldn', "hasn't", 'not', 'ours', 'wasn', 'doesn', 'in', 'from', 'yours', "couldn't", 'ma', 'an', 'as', 'why', 'can', 'do', 'what', "you'll", 'themselves', 'during', 'only', "didn't", 're', 'haven', 'are', 'these', 'has', 'my', 'their', 'yourself', 'to', "won't", 'there', 'needn', 'that', 'at', 'were', "hadn't", 'so', 'them', 'no', "needn't", 'being', 'we', 'did', 'should', 'and', 't', 'shouldn', 'm', 'isn', 'had', 'theirs', 'you', 'after', 'once', 'hasn', 'while', "weren't", 'further'] found_stopwords = [] all_fillerwords = ['Uh','Um', 'er', 'ah', 'like', 'okay', 'right,', 'you know','Um.','So,','so,', 'Right?','Uh,','uh,','uh','um,','Um,','um','okay,'] found_fillerwords = [] tscribe.write(file_name,format='csv',save_as='1.csv') df = pd.read_csv('1.csv') result = df.to_json(orient="records") parsed = json.loads(result) with open('temp.json','w') as f: json.dump(parsed,f,indent=4) # Counting stopwords for every row with open('temp.json', 'r') as jsonFile: data = json.load(jsonFile) new_data = {} for i in range(len(data)): word_tokens = word_tokenize(data[i]["comment"])
def lambda_handler(event, context):
    """
    Lambda entry point: fetch an uploaded transcript JSON from S3, convert
    it to per-row records via tscribe/pandas, annotate each row with
    stopword/fillerword counts, aggregate per-speaker word frequencies, and
    upload the combined result to the 'surfboard-transcribe' bucket under
    transcript/.

    Relies on module-level names: s3 client, nltk_stopwords, all_fillerwords,
    found_stopwords, found_fillerwords.

    Args:
        event: S3 event notification (first Record is used).
        context: Lambda context object (unused).

    Returns:
        dict with statusCode 200 and a placeholder body.
    """
    bucket = event['Records'][0]['s3']['bucket']['name']
    # NOTE(review): encoding='utf=8' looks like a typo for 'utf-8'; it only
    # bites when the key contains percent-escapes — confirm and fix upstream.
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf=8')
    response = s3.get_object(Bucket=bucket, Key=key)
    # Read Data of file directly from S3
    body = response['Body'].read()
    res_dict = json.loads(body.decode('utf-8'))
    jsonData = json.dumps(res_dict)
    localkey = os.path.basename(os.path.splitext(key)[0])
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        f.write(jsonData)
    # Convert the transcript JSON to row-per-segment CSV via tscribe
    csvFile = "/tmp/{}.csv".format(localkey)
    tscribe.write(jsonFile, format='csv', save_as=csvFile)
    df = pd.read_csv(csvFile)
    # Round-trip the dataframe through JSON records (overwrites jsonFile)
    jsonFile = "/tmp/{}.json".format(localkey)
    result = df.to_json(orient="records")
    parsed = json.loads(result)
    with open(jsonFile, 'w') as f:
        json.dump(parsed, f, indent=4)
    jsonFile = "/tmp/{}.json".format(localkey)
    # Counting stopwords for every row
    with open(jsonFile, 'r') as f:
        data = json.load(f)
    new_data = {}
    for i in range(len(data)):
        # Per-row word lists matched case-insensitively against the globals
        stopwords = [
            word for word in data[i]["comment"].split()
            if word.lower() in nltk_stopwords
        ]
        fillerwords = [
            word for word in data[i]["comment"].split()
            if word.lower() in all_fillerwords
        ]
        found_stopwords.append(len(stopwords))
        found_fillerwords.append(len(fillerwords))
        data[i]['stopwords'] = len(stopwords)
        data[i]['fillerwords'] = len(fillerwords)
        # Aggregate per-speaker stopword frequencies; first sighting of a
        # speaker initialises both sub-dicts.
        if len(stopwords) > 0:
            if data[i]["speaker"] not in new_data:
                new_data[data[i]["speaker"]] = {}
                new_data[data[i]["speaker"]]["stopwords"] = {}
                new_data[data[i]["speaker"]]["fillerwords"] = {}
                for stopword in stopwords:
                    if stopword not in new_data[data[i]["speaker"]]["stopwords"]:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] = 1
                    else:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] += 1
            else:
                # Same counting loop for a speaker already present
                for stopword in stopwords:
                    if stopword not in new_data[data[i]["speaker"]]["stopwords"]:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] = 1
                    else:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] += 1
        # Mirror of the above for fillerwords
        if len(fillerwords) > 0:
            if data[i]["speaker"] not in new_data:
                new_data[data[i]["speaker"]] = {}
                new_data[data[i]["speaker"]]["stopwords"] = {}
                new_data[data[i]["speaker"]]["fillerwords"] = {}
                for fillerword in fillerwords:
                    if fillerword not in new_data[data[i]["speaker"]]["fillerwords"]:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] = 1
                    else:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] += 1
            else:
                for fillerword in fillerwords:
                    if fillerword not in new_data[data[i]["speaker"]]["fillerwords"]:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] = 1
                    else:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] += 1
    # Output is [annotated rows, per-speaker aggregates]
    data = [data, new_data]
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(data, f, indent=4)
    bucket_transcribe = 'surfboard-transcribe'
    json_upload_file = 'transcript/{}.json'.format(localkey)
    s3.upload_file(jsonFile, bucket_transcribe, json_upload_file)
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
def test_process():
    """End-to-end smoke check: sample JSON in, docx out, then clean up."""
    # Input fixture must be present before running
    assert os.access("test_sample.json", os.F_OK), "Input file not found"

    tscribe.write("test_sample.json", save_as="test_sample.docx")

    # Conversion must have produced the requested docx
    assert os.access("test_sample.docx", os.F_OK), "Output file not found"

    # Teardown
    os.remove("test_sample.docx")