Example 1

def test_save_as(input_file, output_format, location):
    """
    Test saving of supported formats to locations

    GIVEN locations of current or specific folder
    WHEN writing transcript in any supported format
    THEN check output exists
    """

    logging.info("test_save_as")

    if not Path("output").is_dir():
        os.mkdir("output")

    # GIVEN locations of current or specific folder
    output_filename = Path(location) / Path(Path(input_file).parts[-1]).with_suffix(
        f".{output_format}"
    )

    # WHEN writing transcript in any supported format
    tscribe.write(input_file, format=output_format, save_as=output_filename)

    # THEN check output exists
    assert output_filename.is_file()

    os.remove(output_filename)
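
The test above takes three pytest fixtures (input_file, output_format, location) that are defined elsewhere. A minimal conftest.py sketch of how they might be parametrized, assuming the sample files and formats seen in the other examples (all concrete values here are assumptions, not taken from the real tscribe test suite):

import pytest

# Hypothetical parametrized fixtures; the values are placeholders.
@pytest.fixture(params=["sample_single.json", "sample_multiple.json"])
def input_file(request):
    return request.param

@pytest.fixture(params=["docx", "csv", "sqlite", "vtt"])
def output_format(request):
    return request.param

@pytest.fixture(params=[".", "output"])
def location(request):
    return request.param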
Example 2

    def export_files(self):
        """
        Export all resulting JSON file(s) as Word .docx via tscribe, then archive the source files in the 'Archive' folder.
        """
        try:
            logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

            bucket = self.s3_resource.Bucket(self.output_bucket_name)
            archive_path = config['file_paths']['archive_path']

            for obj in bucket.objects.filter(Delimiter='/'):
                obj_name, obj_extn = os.path.splitext(obj.key)
                if obj_extn == '.json':
                    self.s3_resource.meta.client.download_file(self.output_bucket_name, obj.key, os.path.join(self.output_path, obj.key))
                    file_content = obj.get()['Body'].read().decode('utf-8')
                    json_content = json.loads(file_content)
                    if json_content['results']['transcripts'][0]['transcript'] != "":
                        json_file_path = os.path.join(self.output_path, obj.key)
                        save_as_path = os.path.join(self.output_path, obj_name +'.docx')
                        tscribe.write(json_file_path, format="docx", save_as=save_as_path)
                        self.archive_object(archive_path, '', '', obj.key)
                        
        except ClientError:
            logger.exception("Failed to export files.")
            raise
Example 3

def test_write_to_sqlite(input_file):
    """
    Test production of sqlite output

    GIVEN an input file
    WHEN writing to sqlite
    THEN check output exists and contains content
    """

    logging.info("test_write_to_sqlite")

    # GIVEN an input file
    # WHEN writing to sqlite
    output_filename = Path(f"{uuid4().hex}.db")
    tscribe.write(input_file, save_as=output_filename, format="sqlite")

    # THEN check output exists and contains content
    assert output_filename.is_file(), "Output file should exist"

    conn = sqlite3.connect(str(output_filename))
    c = conn.cursor()
    c.execute("SELECT * FROM transcript")
    query = c.fetchall()

    data = tscribe.load_json_as_dict(input_file)
    df = tscribe.decode_transcript_to_dataframe(data)

    assert len(query) == len(df), "Database table should be length of dataframe"

    # Teardown
    conn.close()
    os.remove(output_filename)
Example 4

def test_write_to_csv(input_file):
    """
    Test production of csv output

    GIVEN an input file
    WHEN writing to csv
    THEN check output exists and contains content
    """

    logging.info("test_write_to_csv")

    # GIVEN an input file
    # WHEN writing to csv
    output_filename = Path(f"{uuid4().hex}.csv")
    tscribe.write(input_file, save_as=output_filename, format="csv")

    # THEN check output exists and contains content
    assert output_filename.is_file(), "Output file should exist"

    with open(output_filename, "r") as file:
        lines = file.readlines()

    data = tscribe.load_json_as_dict(input_file)
    df = tscribe.decode_transcript_to_dataframe(data)

    assert len(lines) == len(df) + 1, "CSV should be length of dataframe + headers"

    # Teardown
    os.remove(output_filename)
Example 5

def test_write_to_docx(input_file):
    """
    Test production of docx output

    GIVEN an input file
    WHEN writing to docx
    THEN check output exists and contains content
    """

    logging.info("test_write_to_docx")

    # GIVEN an input file
    # WHEN writing to docx
    output_filename = Path(f"{uuid4().hex}.docx")
    tscribe.write(input_file, save_as=output_filename, format="docx")

    # THEN check output exists and contains content
    assert output_filename.is_file(), "Output file should exist"

    document = Document(output_filename)

    assert (
        len(document.tables) == 2
    ), "Document should contain two tables, stats and transcript"

    t_conf = document.tables[0].cell(0, 0).text
    t_count = document.tables[0].cell(0, 1).text
    t_perc = document.tables[0].cell(0, 2).text
    assert (t_conf, t_count, t_perc) == (
        "Confidence",
        "Count",
        "Percentage",
    ), "First table should be stats headers"
    assert len(document.tables[0].rows) == 12, "Stats table should hold 12 rows"

    t_time = document.tables[1].cell(0, 0).text
    t_speaker = document.tables[1].cell(0, 1).text
    t_content = document.tables[1].cell(0, 2).text
    assert (t_time, t_speaker, t_content) == (
        "Time",
        "Speaker",
        "Content",
    ), "Second table should be transcript headers"
    data = tscribe.load_json_as_dict(input_file)
    df = tscribe.decode_transcript_to_dataframe(data)
    assert (
        len(document.tables[1].rows) == len(df) + 1
    ), "Second table should be length of dataframe + headers"

    assert (
        "chart.png" in document.paragraphs[6]._p.xml
    ), "Chart should be in paragraph six"

    # Teardown
    os.remove(output_filename)
Example 6

def handler(event, context):
    transcribe = boto3.client('transcribe', region_name='eu-west-2')
    for record in event['Records']:
        x = datetime.datetime.now()

        s3_bucket = record['s3']['bucket']['name']
        s3_filekey = unquote_plus(record['s3']['object']['key'])
        job_name = "Transcribe-Video-" + x.strftime("%Y-%m-%d-%H-%M-%S")
        job_uri = "s3://" + s3_bucket + "/" + s3_filekey
        print(job_uri)

        transcribe.start_transcription_job(TranscriptionJobName=job_name,
                                           Media={'MediaFileUri': job_uri},
                                           MediaFormat='mp4',
                                           LanguageCode='en-US')

        # Poll until the transcription job reaches a terminal state
        while True:
            status = transcribe.get_transcription_job(
                TranscriptionJobName=job_name)
            if status['TranscriptionJob']['TranscriptionJobStatus'] in [
                    'COMPLETED', 'FAILED'
            ]:
                break
            print("Transcribing...")
            time.sleep(5)

        s3_client = boto3.client('s3')
        try:
            local_json_file = '/tmp/local_saved_file'
            urllib.request.urlretrieve(
                status['TranscriptionJob']['Transcript']['TranscriptFileUri'],
                local_json_file)
            bucket = os.getenv('ENV_S3BUCKET')
            object_json_name = s3_filekey + '-transcript.json'
            response = s3_client.upload_file(local_json_file, bucket,
                                             object_json_name)

            # convert json to docx
            object_docx_name = s3_filekey + "-transcript.docx"
            local_docx_file = '/tmp/' + object_docx_name
            tmp_dir = '/tmp/'
            tscribe.write(local_json_file,
                          save_as=local_docx_file,
                          tmp_dir=tmp_dir)

            # upload docx to s3
            response = s3_client.upload_file(local_docx_file, bucket,
                                             object_docx_name)

        except ClientError as e:
            logging.error(e)
            return False

    return "Complete"
Example 7

def test_deprecated_tmp_dir(input_file):
    """
    Test that using tmp_dir fails

    GIVEN an input file
    WHEN calling tscribe with tmp_dir
    THEN receive warning and fail
    """

    logging.info("test_deprecated_tmp_dir")

    # GIVEN an input file
    # WHEN calling tscribe with tmp_dir
    # THEN receive warning and fail
    tscribe.write(input_file, tmp_dir=".")
Example 8

def test_unrecognised_output_format(input_file):
    """
    Test for exception when given unrecognised output format

    GIVEN an input file and an unrecognised output format
    WHEN calling tscribe.write(...)
    THEN xfail
    """

    # GIVEN an input file and an unrecognised output format
    unrecognised_format = "unrecognised"

    # WHEN calling tscribe.write(...)
    # THEN xfail
    tscribe.write(input_file, format=unrecognised_format)
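
Neither of the two tests above asserts the failure it expects, so each presumably relies on a pytest marker or context manager outside the snippet. A sketch of the two usual idioms, assuming tscribe raises on bad input (the exact exception type is an assumption):

import pytest

@pytest.mark.xfail(strict=True)  # passes only if tscribe.write raises
def test_unrecognised_output_format_marked(input_file):
    tscribe.write(input_file, format="unrecognised")

def test_deprecated_tmp_dir_explicit(input_file):
    # Exception type is an assumption; substitute whatever tscribe raises.
    with pytest.raises(Exception):
        tscribe.write(input_file, tmp_dir=".")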
Example 9

def test_write_to_default(input_file):
    """
    Test production of default output
    
    GIVEN an input file
    WHEN not specifying output
    THEN check output is the default format
    """

    # GIVEN an input file
    # WHEN not specifying output
    tscribe.write(input_file)
    expected_filename = input_file.replace(".json", ".docx")
    output_filename = Path(expected_filename)

    # THEN check output is the default format
    assert output_filename.is_file(), "Output file should exist"

    # Teardown
    os.remove(output_filename)
Example 10

def replaceName(Speaker_IDs, speakers):
    # Obtain the latest json file
    json_files = glob.glob('data/transcribe/*.json')
    latest_json = max(json_files, key=os.path.getctime)

    # Replace Speaker_IDs with speaker names in the json file
    with open(latest_json, 'r', encoding='utf-8') as f:
        json_file = json.load(f)
        txt = json.dumps(json_file)
    for i in range(len(speakers)):
        txt = txt.replace(Speaker_IDs[i], speakers[i])
    print(txt)

    new_path = latest_json.replace('transcribe', 'report')
    with open(new_path, 'w') as f:
        f.write(txt)

    tscribe.write(new_path,
                  format="csv",
                  save_as=new_path.replace('.json', '.csv'))
    os.remove(new_path)
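
A hypothetical call, to make the argument order concrete (the IDs and names below are placeholders):

# Replace Amazon Transcribe's generic labels with real names in the newest
# JSON under data/transcribe/, then write the CSV report.
replaceName(["spk_0", "spk_1"], ["Alice", "Bob"])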
Example 11

def test_write_to_vtt(input_file):
    """
    Test production of vtt format

    GIVEN an input file
    WHEN writing to vtt
    THEN check output exists and contains content
    """

    logging.info("test_write_to_vtt")

    # GIVEN an input file
    # WHEN writing to vtt
    output_filename = Path(f"{uuid4().hex}.vtt")
    tscribe.write(input_file, save_as=output_filename, format="vtt")

    # THEN check output exists and contains content
    vtt = webvtt.read(output_filename)

    data = tscribe.load_json_as_dict(input_file)
    df = tscribe.decode_transcript_to_dataframe(data)
    assert len(vtt.captions) == len(
        df
    ), "vtt file should have equal captions to df rows"

    for caption in vtt.captions:

        assert hasattr(caption, "start"), "each caption should have a start_time"
        assert hasattr(caption, "end"), "each caption should have a end_time"
        assert hasattr(caption, "text"), "each caption should have text"
        assert (
            len(caption.lines) >= len(caption.text) / 80
        ), "text should be split into max 80 long lines"
        if input_file != "sample_single.json":
            assert hasattr(
                caption, "identifier"
            ), "each caption should have an identifier"

    # Teardown
    os.remove(output_filename)
Example 12

def test_single_speaker():
    """
    Test output exists with single speaker input

    # GIVEN a sample file containing single speaker
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx without errors

    """

    # Setup
    input_file = "sample_single.json"
    output_file = "sample_single.docx"
    assert os.access(input_file, os.F_OK), "Input file not found"

    # Function
    tscribe.write(input_file)
    assert os.access(output_file, os.F_OK), "Output file not found"

    # Teardown
    os.remove(output_file)
    os.remove("chart.png")
Example 13

def test_multiple_speakers():
    """
    Test output exists with multiple speaker input

    # GIVEN a sample file containing multiple speakers
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx without errors

    """

    # Setup
    input_file = "sample_material/03-speaker-identification.json"
    output_file = "sample_material/03-speaker-identification.docx"
    assert os.access(input_file, os.F_OK), "Input file not found"

    # Function
    tscribe.write(input_file)
    assert os.access(output_file, os.F_OK), "Output file not found"

    # Teardown
    os.remove(output_file)
    os.remove("sample_material/chart.png")
Example 14

def test_multiple_speakers_with_save_as():
    """
    Test output exists with multiple speaker input, and save_as defined

    # GIVEN a sample file containing multiple speakers, and an output filename
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx, named correctly, without errors

    """

    # Setup
    input_file = "sample_multiple.json"
    output_file = "test_sample.docx"
    assert os.access(input_file, os.F_OK), "Input file not found"

    # Function
    tscribe.write(input_file, save_as=output_file)
    assert os.access(output_file, os.F_OK), "Output file not found"

    # Teardown
    os.remove(output_file)
    os.remove("chart.png")
Example 15

def test_multiple_speakers_with_save_as_with_tmp_dir():
    """
    Test output exists with multiple speaker input, and save_as defined, and tmp_dir defined

    # GIVEN a sample file containing multiple speakers, and an output filename, and a writable tmp directory
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx, with a chart, named correctly, without errors

    """

    # Setup
    input_file = "sample_multiple.json"
    output_file = "test_sample.docx"
    tmp_dir = "/tmp/"
    assert os.access(input_file, os.F_OK), "Input file not found"

    # Function
    tscribe.write(input_file, save_as=output_file, tmp_dir=tmp_dir)
    assert os.access(tmp_dir + "chart.png", os.F_OK), "Chart file not found"
    assert os.access(output_file, os.F_OK), "Output file not found"

    # Teardown
    os.remove(output_file)
    os.remove(tmp_dir + "chart.png")
Example 16

def test_save_as(input_file, output_format, location):
    """
    Test saving of supported formats to locations
    
    GIVEN locations of current or specific folder
    WHEN writing transcript in any supported format
    THEN check output exists
    """

    if not Path("output").is_dir():
        os.mkdir("output")

    # GIVEN locations of current or specific folder
    output_filename = Path(location) / Path(
        input_file.replace(".json", f".{output_format}")
    )

    # WHEN writing transcript in any supported format
    tscribe.write(input_file, format=output_format, save_as=output_filename)

    # THEN check output exists
    assert output_filename.is_file()

    os.remove(output_filename)
Example 17

import tscribe
import sys

file_name = sys.argv[1]
tscribe.write(file_name, format="csv", save_as="1.csv")
Example 18

import tscribe
import os
import sys

numargs = len(sys.argv)

print("Number of arguments:" + str(numargs))
print(" arguments " + str(sys.argv))
if numargs > 1:
    usage_demo(numargs, sys.argv)
else:
    print("must pass directory path for json files to walk")
    exit()

json_base_directory = sys.argv[1]

for (dirpath, dirnames, filenames) in os.walk(json_base_directory):
    for filename in filenames:
        if "json" in filename:
            print("filename is " + filename)
            infile = dirpath + "/" + filename
            outfile = infile + ".docx"
            print("infile is " + infile)
            print("outfile is " + outfile)
            ret = tscribe.write(infile, save_as=outfile)
Example 19

def lambda_handler(event, context):

    s3 = boto3.client('s3')
    bucket = event['Records'][0]['s3']['bucket']['name']

    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'],
                                    encoding='utf-8')

    response = s3.get_object(Bucket=bucket, Key=key)

    # Read Data of file directly from S3
    body = response['Body'].read()

    res_dict = json.loads(body.decode('utf-8'))
    jsonData = json.dumps(res_dict)

    localkey = os.path.basename(os.path.splitext(key)[0])
    jsonFile = "/tmp/{}.json".format(localkey)

    with open(jsonFile, 'w') as f:
        f.write(jsonData)

    jsonFile = "/tmp/{}.json".format(localkey)
    csvFile = "/tmp/{}.csv".format(localkey)
    tscribe.write(jsonFile, format='csv', save_as=csvFile)

    df = pd.read_csv(csvFile)

    jsonFile = "/tmp/{}.json".format(localkey)
    result = df.to_json(orient="records")
    parsed = json.loads(result)

    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(parsed, f, indent=4)

    speakers = {}
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'r') as f:
        data = json.load(f)

        for i in range(len(data)):
            if data[i]["speaker"] not in speakers:
                speakers[data[i]["speaker"]] = 1
            else:
                speakers[data[i]["speaker"]] += 1

    maxCountSpeaker = max(speakers, key=speakers.get)

    remove_speakers = []
    for speaker, count in speakers.items():
        if speaker != maxCountSpeaker:
            if speakers[speaker] < 3:
                print(speaker, count)
                remove_speakers.append(speaker)
            speakers[speaker] = int(count / 3)

    for speaker in remove_speakers:
        if speaker in speakers:
            del speakers[speaker]

    data = speakers

    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(data, f, indent=4)

    bucket_transcribe = 'surfboard-transcribe'
    json_upload_file = 'overlap-transcript/{}.json'.format(localkey)

    s3.upload_file(jsonFile, bucket_transcribe, json_upload_file)

    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
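
To make the speaker-pruning rule above concrete, a tiny worked illustration with made-up counts:

speakers = {"spk_0": 40, "spk_1": 2, "spk_2": 9}
# spk_0 has the most segments, so it is kept with its count untouched.
# spk_1 is not the top speaker and has fewer than 3 segments: removed.
# spk_2 survives, but its count is scaled down to int(9 / 3) == 3.
# Final result: {"spk_0": 40, "spk_2": 3}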
Example 20

nltk_stopwords = [
    'does', 'until', 'her', 'if', "haven't", "shan't", 'she', 'myself', 'such', 'i', 'now', 'how', "it's",
    'very', 'off', "that'll", 'too', 'through', "you'd", 'up', 'between', "isn't", 'be', 'again', 
    'where', 'shan', 'have', 'about', 'when', 'our', 'it', "don't", "aren't", 'didn', 
    'under', 'mustn', 'me', 'he', 'down', "wouldn't", 'couldn', "hasn't", 'not', 'ours', 'wasn', 
    'doesn', 'in', 'from', 'yours', "couldn't", 'ma', 'an', 'as', 'why', 'can', 
    'do', 'what', "you'll", 'themselves', 'during', 'only', "didn't", 're', 'haven', 'are', 'these', 
    'has', 'my', 'their', 'yourself', 'to', "won't", 'there', 'needn', 'that', 'at', 'were', "hadn't", 'so', 'them', 
    'no', "needn't", 'being', 'we', 'did', 'should', 'and', 't', 'shouldn', 'm', 'isn', 'had', 'theirs', 'you', 
    'after', 'once', 'hasn', 'while', "weren't", 'further']
found_stopwords = []

all_fillerwords = ['Uh','Um', 'er', 'ah', 'like', 'okay', 'right,', 'you know','Um.','So,','so,',
                    'Right?','Uh,','uh,','uh','um,','Um,','um','okay,']
found_fillerwords = []

tscribe.write(file_name, format='csv', save_as='1.csv')

df = pd.read_csv('1.csv')

result = df.to_json(orient="records")
parsed = json.loads(result)
with open('temp.json', 'w') as f:
    json.dump(parsed, f, indent=4)

# Counting stopwords for every row
with open('temp.json', 'r') as jsonFile:
    data = json.load(jsonFile)
    new_data = {}
    for i in range(len(data)):
        word_tokens = word_tokenize(data[i]["comment"])
Example 21

def lambda_handler(event, context):

    bucket = event['Records'][0]['s3']['bucket']['name']

    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'],
                                    encoding='utf-8')

    response = s3.get_object(Bucket=bucket, Key=key)
    # Read Data of file directly from S3
    body = response['Body'].read()

    res_dict = json.loads(body.decode('utf-8'))
    jsonData = json.dumps(res_dict)

    localkey = os.path.basename(os.path.splitext(key)[0])
    jsonFile = "/tmp/{}.json".format(localkey)

    with open(jsonFile, 'w') as f:
        f.write(jsonData)

    csvFile = "/tmp/{}.csv".format(localkey)
    tscribe.write(jsonFile, format='csv', save_as=csvFile)
    df = pd.read_csv(csvFile)

    jsonFile = "/tmp/{}.json".format(localkey)
    result = df.to_json(orient="records")
    parsed = json.loads(result)

    with open(jsonFile, 'w') as f:
        json.dump(parsed, f, indent=4)

    jsonFile = "/tmp/{}.json".format(localkey)
    # Counting stopwords for every row
    with open(jsonFile, 'r') as f:
        data = json.load(f)
        new_data = {}
        for i in range(len(data)):

            stopwords = [
                word for word in data[i]["comment"].split()
                if word.lower() in nltk_stopwords
            ]

            fillerwords = [
                word for word in data[i]["comment"].split()
                if word.lower() in all_fillerwords
            ]

            found_stopwords.append(len(stopwords))
            found_fillerwords.append(len(fillerwords))

            data[i]['stopwords'] = len(stopwords)
            data[i]['fillerwords'] = len(fillerwords)

            # Tally per-speaker stopword counts
            if len(stopwords) > 0:
                speaker = data[i]["speaker"]
                if speaker not in new_data:
                    new_data[speaker] = {"stopwords": {}, "fillerwords": {}}
                counts = new_data[speaker]["stopwords"]
                for stopword in stopwords:
                    counts[stopword] = counts.get(stopword, 0) + 1

            # Tally per-speaker filler-word counts
            if len(fillerwords) > 0:
                speaker = data[i]["speaker"]
                if speaker not in new_data:
                    new_data[speaker] = {"stopwords": {}, "fillerwords": {}}
                counts = new_data[speaker]["fillerwords"]
                for fillerword in fillerwords:
                    counts[fillerword] = counts.get(fillerword, 0) + 1

        data = [data, new_data]

    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(data, f, indent=4)

    bucket_transcribe = 'surfboard-transcribe'
    json_upload_file = 'transcript/{}.json'.format(localkey)

    s3.upload_file(jsonFile, bucket_transcribe, json_upload_file)

    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
Example 22

def test_process():
    assert os.access("test_sample.json", os.F_OK), "Input file not found"
    tscribe.write("test_sample.json", save_as="test_sample.docx")
    assert os.access("test_sample.docx", os.F_OK), "Output file not found"
    os.remove("test_sample.docx")