def test_save_as(input_file, output_format, location):
    """
    Test saving of supported formats to locations

    GIVEN locations of current or specific folder
    WHEN writing transcript in any supported format
    THEN check output exists
    """

    logging.info("test_save_as")

    # Ensure the target folder for the "specific folder" case exists
    if not Path("output").is_dir():
        os.mkdir("output")

    # GIVEN locations of current or specific folder
    # Keep only the input's basename, then swap its suffix for the target format
    destination = Path(location) / Path(Path(input_file).parts[-1]).with_suffix(
        f".{output_format}"
    )

    # WHEN writing transcript in any supported format
    tscribe.write(input_file, format=output_format, save_as=destination)

    # THEN check output exists
    assert destination.is_file()

    # Teardown
    os.remove(destination)
def export_files(self):
    """
    Export all the resulted JSON file(s) as Word docx using Tscribe and
    archive the source files in 'Archive' folder.

    Reads every top-level object in the output bucket; for each `.json`
    object it downloads the file locally, converts non-empty transcripts
    to docx via tscribe, and archives the source object.

    Raises:
        botocore.exceptions.ClientError: re-raised after logging when any
            S3 operation fails.
    """
    try:
        # NOTE(review): basicConfig inside a method reconfigures logging on
        # every call (and is a no-op if logging was already configured) —
        # confirm this is intended rather than one-time setup.
        logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
        bucket = self.s3_resource.Bucket(self.output_bucket_name)
        archive_path = config['file_paths']['archive_path']
        # Delimiter='/' limits iteration to top-level keys (no "subfolder" objects).
        for obj in bucket.objects.filter(Delimiter='/'):
            obj_name, obj_extn = os.path.splitext(obj.key)
            if obj_extn == '.json':
                # Download to the local output path, then also read the body
                # directly to inspect the transcript content.
                self.s3_resource.meta.client.download_file(self.output_bucket_name, obj.key, os.path.join(self.output_path, obj.key))
                file_content = obj.get()['Body'].read().decode('utf-8')
                json_content = json.loads(file_content)
                # Only non-empty transcripts are worth rendering to docx.
                if json_content['results']['transcripts'][0]['transcript'] != "" :
                    json_file_path = os.path.join(self.output_path, obj.key)
                    save_as_path = os.path.join(self.output_path, obj_name +'.docx')
                    tscribe.write(json_file_path, format="docx", save_as= save_as_path)
                # NOTE(review): archiving runs for every .json object, even when
                # the transcript was empty — confirm that is the intended flow.
                self.archive_object(archive_path, '', '', obj.key)
    except ClientError:
        logger.exception("Failed to export files.")
        raise
def test_write_to_sqlite(input_file):
    """
    Test production of sqlite output

    GIVEN an input file
    WHEN writing to sqlite
    THEN check output exists and contains content
    """

    logging.info("test_write_to_sqlite")

    # GIVEN an input file
    # WHEN writing to sqlite
    db_path = Path(f"{uuid4().hex}.db")
    tscribe.write(input_file, save_as=db_path, format="sqlite")

    # THEN check output exists and contains content
    assert db_path.is_file(), "Output file should exist"

    connection = sqlite3.connect(str(db_path))
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM transcript")
    rows = cursor.fetchall()

    # Row count in the database must match the decoded transcript dataframe
    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert len(rows) == len(frame), "Database table should be length of dataframe"

    # Teardown
    os.remove(db_path)
def test_write_to_csv(input_file):
    """
    Test production of csv output

    GIVEN an input file
    WHEN writing to csv
    THEN check output exists and contains content
    """

    logging.info("test_write_to_csv")

    # GIVEN an input file
    # WHEN writing to csv
    csv_path = Path(f"{uuid4().hex}.csv")
    tscribe.write(input_file, save_as=csv_path, format="csv")

    # THEN check output exists and contains content
    assert csv_path.is_file(), "Output file should exist"

    with open(csv_path, "r") as handle:
        rows = handle.readlines()

    # One header row plus one row per transcript segment
    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert len(rows) == len(frame) + 1, "CSV should be length of dataframe + headers"

    # Teardown
    os.remove(csv_path)
def test_write_to_docx(input_file):
    """
    Test production of docx output

    GIVEN an input file
    WHEN writing to docx
    THEN check output exists and contains content
    """

    logging.info("test_write_to_docx")

    # GIVEN an input file
    # WHEN writing to docx
    docx_path = Path(f"{uuid4().hex}.docx")
    tscribe.write(input_file, save_as=docx_path, format="docx")

    # THEN check output exists and contains content
    assert docx_path.is_file(), "Output file should exist"

    doc = Document(docx_path)
    assert (
        len(doc.tables) == 2
    ), "Document should contain two tables, stats and transcript"

    # First table carries the confidence statistics
    stats_headers = tuple(doc.tables[0].cell(0, col).text for col in range(3))
    assert stats_headers == (
        "Confidence",
        "Count",
        "Percentage",
    ), "First table should be stats headers"
    assert len(doc.tables[0].rows) == 12, "Stats table should hold 12 rows"

    # Second table carries the transcript itself
    transcript_headers = tuple(doc.tables[1].cell(0, col).text for col in range(3))
    assert transcript_headers == (
        "Time",
        "Speaker",
        "Content",
    ), "Second table should be transcript headers"

    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert (
        len(doc.tables[1].rows) == len(frame) + 1
    ), "Second table should be length of dataframe + headers"

    # The confidence chart image is embedded in a fixed paragraph position
    assert (
        "chart.png" in doc.paragraphs[6]._p.xml
    ), "Chart should be in paragraph six"

    # Teardown
    os.remove(docx_path)
def handler(event, context):
    """
    Lambda entry point: start an Amazon Transcribe job for each uploaded
    S3 media object, wait for completion, then upload both the JSON
    transcript and a docx rendering back to S3.

    Args:
        event: S3 event notification containing one or more Records.
        context: Lambda context object (unused).

    Returns:
        "Complete" on success, False if an S3 upload/download raises ClientError.
    """
    transcribe = boto3.client('transcribe', region_name='eu-west-2')
    for record in event['Records']:
        x = datetime.datetime.now()
        s3_bucket = record['s3']['bucket']['name']
        s3_filekey = unquote_plus(record['s3']['object']['key'])
        # Timestamp keeps the Transcribe job name unique per invocation.
        job_name = "Transcribe-Video-" + x.strftime("%Y-%m-%d-%H-%M-%S")
        job_uri = "s3://" + s3_bucket + "/" + s3_filekey
        print(job_uri)
        transcribe.start_transcription_job(TranscriptionJobName=job_name,
                                           Media={'MediaFileUri': job_uri},
                                           MediaFormat='mp4',
                                           LanguageCode='en-US')
        # Poll every 5 seconds until the job reaches a terminal state.
        while True:
            status = transcribe.get_transcription_job(
                TranscriptionJobName=job_name)
            if status['TranscriptionJob']['TranscriptionJobStatus'] in [
                    'COMPLETED', 'FAILED'
            ]:
                break
            print("Transcribing...")
            time.sleep(5)
        s3_client = boto3.client('s3')
        try:
            # NOTE(review): a FAILED job still falls through to urlretrieve
            # below — the TranscriptFileUri may then be absent; confirm.
            local_json_file = '/tmp/local_saved_file'
            urllib.request.urlretrieve(
                status['TranscriptionJob']['Transcript']['TranscriptFileUri'],
                local_json_file)
            bucket = os.getenv('ENV_S3BUCKET')
            object_json_name = s3_filekey + '-transcript.json'
            response = s3_client.upload_file(local_json_file, bucket,
                                             object_json_name)
            # convert json to docx
            object_docx_name = s3_filekey + "-transcript.docx"
            local_docx_file = '/tmp/' + object_docx_name
            tmp_dir = '/tmp/'
            tscribe.write(local_json_file, save_as=local_docx_file,
                          tmp_dir=tmp_dir)
            #upload docx to s3
            response = s3_client.upload_file(local_docx_file, bucket,
                                             object_docx_name)
        except ClientError as e:
            logging.error(e)
            return False
    # NOTE(review): placement after the loop assumed from the collapsed
    # source — confirm "Complete" is meant to be returned once, at the end.
    return "Complete"
def test_depricated_tmp_dir(input_file):
    """
    Test that using tmp_dir fails

    GIVEN an input file
    WHEN calling tscribe with tmp_dir
    THEN receive warning and fail
    """

    logging.info("test_deprecated_tmp_dir")

    # GIVEN an input file
    # WHEN calling tscribe with tmp_dir
    # THEN receive warning and fail
    tscribe.write(input_file, tmp_dir=".")
def test_unrecognised_output_format(input_file):
    """
    Test for exception when given unrecognised output format

    GIVEN an input file and an unrecognised output format
    WHEN calling tscribe.write(...)
    THEN xfail
    """

    # GIVEN an input file and an unrecognised output format
    bad_format = "unrecognised"

    # WHEN calling tscribe.write(...)
    # THEN xfail
    tscribe.write(input_file, format=bad_format)
def test_write_to_default(input_file):
    """
    Test production of default output

    GIVEN an input file
    WHEN not specifying output
    THEN check output is the default format
    """

    # GIVEN an input file
    # WHEN not specifying output
    tscribe.write(input_file)

    # Default behaviour writes a .docx alongside the input
    produced = Path(input_file.replace(".json", ".docx"))

    # THEN check output exists and contains content
    assert produced.is_file(), "Output file should exist"

    # Teardown
    os.remove(produced)
def replaceName(Speaker_IDs, speakers):
    """
    Replace Transcribe speaker labels with real names in the newest
    transcript JSON, then export the result as CSV.

    Args:
        Speaker_IDs: speaker label strings to replace (e.g. "spk_0").
        speakers: replacement names, positionally paired with Speaker_IDs.

    Side effects:
        Writes a renamed JSON copy under data/report/, converts it to CSV
        via tscribe, then removes the intermediate JSON.
    """
    # Obtain the latest json file by creation time
    json_files = glob.glob('data/transcribe/*.json')
    latest_json = max(json_files, key=os.path.getctime)

    # Replace speakers with Speaker_IDs in the json file
    with open(latest_json, 'r', encoding='utf-8') as f:
        json_file = json.load(f)

    txt = json.dumps(json_file)
    # zip pairs labels with names; extra unpaired entries are ignored
    for speaker_id, name in zip(Speaker_IDs, speakers):
        txt = txt.replace(speaker_id, name)
    print(txt)

    new_path = latest_json.replace('transcribe', 'report')
    # encoding matches the utf-8 read above; default platform encoding
    # could corrupt non-ASCII names on Windows
    with open(new_path, 'w', encoding='utf-8') as f:
        f.write(txt)

    tscribe.write(new_path, format="csv", save_as=new_path.replace('.json', '.csv'))
    # The renamed JSON is only an intermediate artifact
    os.remove(new_path)
def test_write_to_vtt(input_file):
    """
    Test production of vtt format

    GIVEN an input file
    WHEN writing to vtt
    THEN check output exists and contains content
    """

    logging.info("test_write_to_vtt")

    # GIVEN an input file
    # WHEN writing to vtt
    vtt_path = Path(f"{uuid4().hex}.vtt")
    tscribe.write(input_file, save_as=vtt_path, format="vtt")

    # THEN check output exists and contains content
    vtt = webvtt.read(vtt_path)
    source = tscribe.load_json_as_dict(input_file)
    frame = tscribe.decode_transcript_to_dataframe(source)
    assert len(vtt.captions) == len(
        frame
    ), "vtt file should have equal captions to df rows"

    for caption in vtt.captions:
        assert hasattr(caption, "start"), "each caption should have a start_time"
        assert hasattr(caption, "end"), "each caption should have a end_time"
        assert hasattr(caption, "text"), "each caption should have text"
        # Long captions get wrapped, so line count tracks text length / 80
        assert (
            len(caption.lines) >= len(caption.text) / 80
        ), "text should be split into max 80 long lines"
        # Single-speaker sample carries no speaker identifiers
        if input_file != "sample_single.json":
            assert hasattr(
                caption, "identifier"
            ), "each caption should have an identifier"

    # Teardown
    os.remove(vtt_path)
def test_single_speaker():
    """
    Test output exists with single speaker input

    # GIVEN a sample file containing single speaker
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx without errors
    """

    # Setup
    source = "sample_single.json"
    produced = "sample_single.docx"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function
    tscribe.write(source)
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown
    os.remove(produced)
    os.remove("chart.png")
def test_multiple_speakers():
    """
    Test output exists with multiple speaker input

    # GIVEN a sample file containing multiple speakers
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx without errors
    """

    # Setup
    source = "sample_material/03-speaker-identification.json"
    produced = "sample_material/03-speaker-identification.docx"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function
    tscribe.write(source)
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown — chart is produced next to the input file
    os.remove(produced)
    os.remove("sample_material/chart.png")
def test_multiple_speakers_with_save_as():
    """
    Test output exists with multiple speaker input, and save_as defined

    # GIVEN a sample file containing multiple speakers, and an output filename
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx, named correctly, without errors
    """

    # Setup
    source = "sample_multiple.json"
    produced = "test_sample.docx"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function
    tscribe.write(source, save_as=produced)
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown
    os.remove(produced)
    os.remove("chart.png")
def test_multiple_speakers_with_save_as_with_tmp_dir():
    """
    Test output exists with multiple speaker input, and save_as defined, and tmp_dir defined

    # GIVEN a sample file containing multiple speakers, and an output filename, and a writable tmp directory
    # WHEN calling tscribe.write(...)
    # THEN produce the .docx, with a chart, named correctly, without errors
    """

    # Setup
    source = "sample_multiple.json"
    produced = "test_sample.docx"
    tmp_dir = "/tmp/"
    assert os.access(source, os.F_OK), "Input file not found"

    # Function — chart should land in tmp_dir rather than the cwd
    tscribe.write(source, save_as=produced, tmp_dir=tmp_dir)
    assert os.access(tmp_dir + "chart.png", os.F_OK), "Chart file not found"
    assert os.access(produced, os.F_OK), "Output file not found"

    # Teardown
    os.remove(produced)
    os.remove(tmp_dir + "chart.png")
def test_save_as(input_file, output_format, location):
    """
    Test saving of supported formats to locations

    GIVEN locations of current or specific folder
    WHEN writing transcript in any supported format
    THEN check output exists
    """

    # Ensure the "specific folder" target exists before writing
    if not Path("output").is_dir():
        os.mkdir("output")

    # GIVEN locations of current or specific folder
    destination = Path(location) / Path(
        input_file.replace(".json", f".{output_format}")
    )

    # WHEN writing transcript in any supported format
    tscribe.write(input_file, format=output_format, save_as=destination)

    # THEN check output exists
    assert destination.is_file()

    # Teardown
    os.remove(destination)
# Convert the transcript JSON named on the command line to "1.csv".
import sys

import tscribe

source_json = sys.argv[1]
tscribe.write(source_json, format="csv", save_as="1.csv")
# Walk a directory tree and convert every JSON transcript it contains to docx.
import os
import sys  # BUG FIX: sys.argv was used below but sys was never imported

import tscribe

numargs = len(sys.argv)
print("Number of arguments:" + str(numargs))
print(" arguments " + str(sys.argv))
if numargs > 1:
    # NOTE(review): usage_demo is not defined in this file — confirm it is
    # provided elsewhere at runtime.
    usage_demo(numargs, sys.argv)
else:
    print("must pass directory path for json files to walk")
    # sys.exit is the explicit form; bare exit() is a site-module convenience
    sys.exit()

json_base_directory = sys.argv[1]

# Convert each *.json transcript found anywhere under the base directory
for dirpath, dirnames, filenames in os.walk(json_base_directory):
    for filename in filenames:
        if "json" in filename:
            print("filename is " + filename)
            infile = dirpath + "/" + filename
            outfile = infile + ".docx"
            print("infile is " + infile)
            print("outfile is " + outfile)
            ret = tscribe.write(infile, save_as=outfile)
def lambda_handler(event, context):
    """
    Lambda entry point: fetch an uploaded transcript JSON from S3, convert
    it to per-row records via tscribe/pandas, count rows per speaker, prune
    minor speakers, and upload the resulting speaker-count JSON to the
    'surfboard-transcribe' bucket under overlap-transcript/.

    Args:
        event: S3 event notification (first Record is used).
        context: Lambda context object (unused).

    Returns:
        dict with statusCode 200 and a placeholder body.
    """
    s3 = boto3.client('s3')
    bucket = event['Records'][0]['s3']['bucket']['name']
    # NOTE(review): encoding='utf=8' looks like a typo for 'utf-8'; it only
    # bites when the key contains percent-escapes — confirm and fix upstream.
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf=8')
    response = s3.get_object(Bucket=bucket, Key=key)
    # Read Data of file directly from S3
    body = response['Body'].read()
    res_dict = json.loads(body.decode('utf-8'))
    jsonData = json.dumps(res_dict)
    localkey = os.path.basename(os.path.splitext(key)[0])
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        f.write(jsonData)
    # tscribe reads the local JSON and writes a CSV of transcript rows
    jsonFile = "/tmp/{}.json".format(localkey)
    csvFile = "/tmp/{}.csv".format(localkey)
    tscribe.write(jsonFile, format='csv', save_as=csvFile)
    df = pd.read_csv(csvFile)
    # Round-trip the dataframe through JSON records (overwrites jsonFile)
    jsonFile = "/tmp/{}.json".format(localkey)
    result = df.to_json(orient="records")
    parsed = json.loads(result)
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(parsed, f, indent=4)
    speakers = {}
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'r') as f:
        data = json.load(f)
    # Tally the number of rows attributed to each speaker label
    for i in range(len(data)):
        if data[i]["speaker"] not in speakers:
            speakers[data[i]["speaker"]] = 1
        else:
            speakers[data[i]["speaker"]] += 1
    maxCountSpeaker = max(speakers, key=speakers.get)
    remove_speakers = []
    # Speakers other than the dominant one: drop those with < 3 rows and
    # scale the remainder down by 3.
    for speaker, count in speakers.items():
        if speaker != maxCountSpeaker:
            if speakers[speaker] < 3:
                print(speaker, count)
                remove_speakers.append(speaker)
            # NOTE(review): nesting assumed from the collapsed source —
            # confirm this scaling applies to all non-dominant speakers.
            speakers[speaker] = int(count / 3)
    for speaker in remove_speakers:
        if speaker in speakers:
            del speakers[speaker]
    data = speakers
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(data, f, indent=4)
    bucket_transcribe = 'surfboard-transcribe'
    json_upload_file = 'overlap-transcript/{}.json'.format(localkey)
    s3.upload_file(jsonFile, bucket_transcribe, json_upload_file)
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
'does', 'until', 'her', 'if', "haven't", "shan't", 'she', 'myself', 'such', 'i', 'now', 'how', "it's", 'very', 'off', "that'll", 'too', 'through', "you'd", 'up', 'between', "isn't", 'be', 'again', 'where', 'shan', 'have', 'about', 'when', 'our', 'it', "don't", "aren't", 'didn', 'under', 'mustn', 'me', 'he', 'down', "wouldn't", 'couldn', "hasn't", 'not', 'ours', 'wasn', 'doesn', 'in', 'from', 'yours', "couldn't", 'ma', 'an', 'as', 'why', 'can', 'do', 'what', "you'll", 'themselves', 'during', 'only', "didn't", 're', 'haven', 'are', 'these', 'has', 'my', 'their', 'yourself', 'to', "won't", 'there', 'needn', 'that', 'at', 'were', "hadn't", 'so', 'them', 'no', "needn't", 'being', 'we', 'did', 'should', 'and', 't', 'shouldn', 'm', 'isn', 'had', 'theirs', 'you', 'after', 'once', 'hasn', 'while', "weren't", 'further'] found_stopwords = [] all_fillerwords = ['Uh','Um', 'er', 'ah', 'like', 'okay', 'right,', 'you know','Um.','So,','so,', 'Right?','Uh,','uh,','uh','um,','Um,','um','okay,'] found_fillerwords = [] tscribe.write(file_name,format='csv',save_as='1.csv') df = pd.read_csv('1.csv') result = df.to_json(orient="records") parsed = json.loads(result) with open('temp.json','w') as f: json.dump(parsed,f,indent=4) # Counting stopwords for every row with open('temp.json', 'r') as jsonFile: data = json.load(jsonFile) new_data = {} for i in range(len(data)): word_tokens = word_tokenize(data[i]["comment"])
def lambda_handler(event, context):
    """
    Lambda entry point: fetch an uploaded transcript JSON from S3, convert
    it to per-row records via tscribe/pandas, annotate each row with
    stopword/fillerword counts, aggregate per-speaker word frequencies, and
    upload the combined result to the 'surfboard-transcribe' bucket under
    transcript/.

    Relies on module-level names: s3 client, nltk_stopwords, all_fillerwords,
    found_stopwords, found_fillerwords.

    Args:
        event: S3 event notification (first Record is used).
        context: Lambda context object (unused).

    Returns:
        dict with statusCode 200 and a placeholder body.
    """
    bucket = event['Records'][0]['s3']['bucket']['name']
    # NOTE(review): encoding='utf=8' looks like a typo for 'utf-8'; it only
    # bites when the key contains percent-escapes — confirm and fix upstream.
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf=8')
    response = s3.get_object(Bucket=bucket, Key=key)
    # Read Data of file directly from S3
    body = response['Body'].read()
    res_dict = json.loads(body.decode('utf-8'))
    jsonData = json.dumps(res_dict)
    localkey = os.path.basename(os.path.splitext(key)[0])
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        f.write(jsonData)
    # Convert the transcript JSON to row-per-segment CSV via tscribe
    csvFile = "/tmp/{}.csv".format(localkey)
    tscribe.write(jsonFile, format='csv', save_as=csvFile)
    df = pd.read_csv(csvFile)
    # Round-trip the dataframe through JSON records (overwrites jsonFile)
    jsonFile = "/tmp/{}.json".format(localkey)
    result = df.to_json(orient="records")
    parsed = json.loads(result)
    with open(jsonFile, 'w') as f:
        json.dump(parsed, f, indent=4)
    jsonFile = "/tmp/{}.json".format(localkey)
    # Counting stopwords for every row
    with open(jsonFile, 'r') as f:
        data = json.load(f)
    new_data = {}
    for i in range(len(data)):
        # Per-row word lists matched case-insensitively against the globals
        stopwords = [
            word for word in data[i]["comment"].split()
            if word.lower() in nltk_stopwords
        ]
        fillerwords = [
            word for word in data[i]["comment"].split()
            if word.lower() in all_fillerwords
        ]
        found_stopwords.append(len(stopwords))
        found_fillerwords.append(len(fillerwords))
        data[i]['stopwords'] = len(stopwords)
        data[i]['fillerwords'] = len(fillerwords)
        # Aggregate per-speaker stopword frequencies; first sighting of a
        # speaker initialises both sub-dicts.
        if len(stopwords) > 0:
            if data[i]["speaker"] not in new_data:
                new_data[data[i]["speaker"]] = {}
                new_data[data[i]["speaker"]]["stopwords"] = {}
                new_data[data[i]["speaker"]]["fillerwords"] = {}
                for stopword in stopwords:
                    if stopword not in new_data[data[i]["speaker"]]["stopwords"]:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] = 1
                    else:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] += 1
            else:
                # Same counting loop for a speaker already present
                for stopword in stopwords:
                    if stopword not in new_data[data[i]["speaker"]]["stopwords"]:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] = 1
                    else:
                        new_data[data[i]["speaker"]]["stopwords"][stopword] += 1
        # Mirror of the above for fillerwords
        if len(fillerwords) > 0:
            if data[i]["speaker"] not in new_data:
                new_data[data[i]["speaker"]] = {}
                new_data[data[i]["speaker"]]["stopwords"] = {}
                new_data[data[i]["speaker"]]["fillerwords"] = {}
                for fillerword in fillerwords:
                    if fillerword not in new_data[data[i]["speaker"]]["fillerwords"]:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] = 1
                    else:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] += 1
            else:
                for fillerword in fillerwords:
                    if fillerword not in new_data[data[i]["speaker"]]["fillerwords"]:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] = 1
                    else:
                        new_data[data[i]["speaker"]]["fillerwords"][fillerword] += 1
    # Output is [annotated rows, per-speaker aggregates]
    data = [data, new_data]
    jsonFile = "/tmp/{}.json".format(localkey)
    with open(jsonFile, 'w') as f:
        json.dump(data, f, indent=4)
    bucket_transcribe = 'surfboard-transcribe'
    json_upload_file = 'transcript/{}.json'.format(localkey)
    s3.upload_file(jsonFile, bucket_transcribe, json_upload_file)
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
def test_process():
    """End-to-end smoke check: sample JSON in, docx out, then clean up."""
    # Input fixture must be present before running
    assert os.access("test_sample.json", os.F_OK), "Input file not found"

    tscribe.write("test_sample.json", save_as="test_sample.docx")

    # Conversion must have produced the requested docx
    assert os.access("test_sample.docx", os.F_OK), "Output file not found"

    # Teardown
    os.remove("test_sample.docx")