def main():
    (root_dir, input_video, training_photos, reuse_trained, tolerance, amp_faces) = sys.argv[1:7]

    # use the output filename for the log instead of the input filename, as the former is
    # unique per job while the latter could be shared by multiple jobs
    logger = MgmLogger(root_dir, "face_recognition", amp_faces)
    sys.stdout = logger
    sys.stderr = logger

    # if tolerance is not specified on the command line, use the default value
    if not tolerance:
        tolerance = FR_DEFAULT_TOLERANCE
    else:
        tolerance = float(tolerance)

    # initialize training results
    known_names = []
    known_faces = []

    # if reuse_trained is set to true, retrieve previous training results
    if reuse_trained.lower() == "true":
        known_names, known_faces = train.retrieve_trained_results(training_photos)

    # if no valid previous training results are available, do the training
    if known_names == [] or known_faces == []:
        known_names, known_faces = train.train_faces(training_photos, root_dir)

    # run face recognition on the given video using the trained results at the given tolerance level
    fr_result = recognize_faces(input_video, known_names, known_faces, tolerance)

    # save the recognized faces in the standard AMP Face JSON file
    mgm_utils.write_json_file(fr_result, amp_faces)
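# recognize_faces is defined elsewhere in this script; as a point of reference, here is a
# minimal sketch of the technique, assuming the `face_recognition` library and OpenCV.
# The result fields ("name"/"start") are illustrative, not the actual AMP Face JSON schema.
import cv2
import face_recognition

def recognize_faces_sketch(input_video, known_names, known_faces, tolerance):
    capture = cv2.VideoCapture(input_video)
    fps = capture.get(cv2.CAP_PROP_FPS)
    hits = []
    frame_num = 0
    while True:
        ok, frame = capture.read()
        if not ok:
            break
        # face_recognition expects RGB; OpenCV reads BGR
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Compare every face found in this frame against the trained encodings
        for encoding in face_recognition.face_encodings(rgb):
            matches = face_recognition.compare_faces(known_faces, encoding, tolerance)
            for name, matched in zip(known_names, matches):
                if matched:
                    hits.append({"name": name, "start": frame_num / fps})
        frame_num += 1
    capture.release()
    return hits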
def main():
    (input_file, input_segmentation_json, remove_type, output_file, kept_segments_file) = sys.argv[1:6]

    # Turn the segmentation json file into a segmentation object
    with open(input_segmentation_json, 'r') as file:
        seg_data = Segmentation().from_json(json.load(file))

    # Remove silence and get a list of kept segments
    kept_segments = remove_silence(remove_type, seg_data, input_file, output_file)

    # Write kept segments to json file
    mgm_utils.write_json_file(kept_segments, kept_segments_file)
    exit(0)
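# remove_silence is defined elsewhere; a rough sketch of one way it could work, assuming
# segment objects with label/start/end fields (an assumption) and ffmpeg's aselect filter
# to keep the non-silent spans. The returned dict maps original start times to end times,
# which is the shape the downstream transcript-adjustment step reads.
import subprocess

def remove_silence_sketch(remove_type, seg_data, input_file, output_file):
    kept_segments = {}
    keep_exprs = []
    for s in seg_data.segments:
        if s.label != remove_type:  # keep everything not marked for removal
            kept_segments[str(s.start)] = s.end
            keep_exprs.append("between(t,%f,%f)" % (s.start, s.end))
    # Select only the kept spans and regenerate timestamps so they are contiguous
    subprocess.call([
        "ffmpeg", "-y", "-i", input_file,
        "-af", "aselect='" + "+".join(keep_exprs) + "',asetpts=N/SR/TB",
        output_file])
    return kept_segments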
def main():
    (input_file, threshold, output_json, output_csv) = sys.argv[1:5]

    # threshold arrives as a command line string; fall back to the default if it isn't a valid integer
    try:
        threshold = int(threshold)
    except (TypeError, ValueError):
        threshold = 30
        print("Setting threshold to default because it wasn't a valid integer")

    # Get a list of shots as tuples (start, end)
    shots = find_shots(input_file, output_csv, threshold)

    # Print for debugging purposes
    for shot in shots:
        print("start: " + str(shot[0]) + " end: " + str(shot[1]))

    # Convert the result to json
    shots_dict = convert_to_json(shots, input_file)

    # save the output json file
    mgm_utils.write_json_file(shots_dict, output_json)
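# find_shots is defined elsewhere in this script; a minimal sketch of the idea, assuming
# PySceneDetect's ContentDetector (whose default threshold of 30 matches the fallback above).
# Returns (start, end) tuples in seconds.
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector

def find_shots_sketch(input_file, threshold):
    video_manager = VideoManager([input_file])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    # Each scene is a (start, end) pair of FrameTimecodes; convert to seconds
    return [(start.get_seconds(), end.get_seconds())
            for start, end in scene_manager.get_scene_list()]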
def main():
    (segmentation_json, adj_json, output_json) = sys.argv[1:4]

    # Turn the adjustment data into a list of kept segments
    with open(adj_json, 'r') as file:
        adj_data = json.load(file)

    # Turn the segmentation json into objects
    with open(segmentation_json, 'r') as file:
        seg = Segmentation().from_json(json.load(file))

    # List of adjustments (start, end, adjustment)
    offset_adj = []
    # Last ending position for iterating through kept segments
    last_end = 0.00
    # Running tally of removed segment lengths
    current_adj = 0.00

    # For each segment that was kept, keep track of the gaps to know how much to adjust
    for kept_segment in adj_data:
        print(kept_segment + ":" + str(adj_data[kept_segment]))
        start = float(kept_segment)
        end = adj_data[kept_segment]
        # If the start of this segment is after the last end, we have a gap
        if start >= last_end:
            # Keep track of the gap in segments
            current_adj = current_adj + (start - last_end)
            # Add it to the list of adjustments
            offset_adj.append(Adjustment(start - current_adj, end - current_adj, current_adj))
        # Keep track of the last segment end
        last_end = end

    print("#OFFSET ADJUSTMENTS")
    for adj in offset_adj:
        print(str(adj.start) + ":" + str(adj.end) + ":" + str(adj.adjustment))

    # For each segment, find the corresponding adjustment
    for segment in seg.segments:
        adjust_segment(segment, offset_adj)

    # Write the resulting json
    mgm_utils.write_json_file(seg, output_json)
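# Adjustment and adjust_segment are defined elsewhere; a hedged sketch consistent with
# their use above, assuming segments carry start/end fields. Each Adjustment records how
# much audio was removed before a span of the "collapsed" (silence-removed) timeline, so
# a segment is shifted back by the adjustment of the span that contains it.
class AdjustmentSketch:
    def __init__(self, start, end, adjustment):
        self.start = start            # start of the span in collapsed time
        self.end = end                # end of the span in collapsed time
        self.adjustment = adjustment  # seconds removed before this span

def adjust_segment_sketch(segment, offset_adj):
    # Map the segment's collapsed-time boundaries back to the original timeline
    for adj in offset_adj:
        if adj.start <= segment.start <= adj.end:
            segment.start += adj.adjustment
        if adj.start <= segment.end <= adj.end:
            segment.end += adj.adjustment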
def main():
    (input_video, azure_video_index, azure_artifact_ocr, amp_vocr) = sys.argv[1:5]

    # You must initialize logging, otherwise you'll not see debug output.
    logging.basicConfig()

    # Get the Azure video index json
    with open(azure_video_index, 'r') as azure_index_file:
        azure_index_json = json.load(azure_index_file)

    # Get the Azure artifact OCR json
    with open(azure_artifact_ocr, 'r') as azure_ocr_file:
        azure_ocr_json = json.load(azure_ocr_file)

    # Create the AMP Video OCR object
    amp_vocr_obj = create_amp_ocr(input_video, azure_index_json, azure_ocr_json)

    # write the AMP Video OCR JSON file
    mgm_utils.write_json_file(amp_vocr_obj, amp_vocr)
def convert(media_file, kaldi_file, kaldi_transcript_file, output_json_file):
    mgm_utils.exception_if_file_not_exist(kaldi_file)
    if not os.path.exists(kaldi_transcript_file):
        raise Exception("Exception: File " + kaldi_transcript_file +
                        " doesn't exist, the previous command generating it must have failed.")
    results = SpeechToTextResult()

    # Open the kaldi json
    with open(kaldi_file) as json_file:
        data = json.load(json_file)

    # Get the kaldi transcript
    with open(kaldi_transcript_file, "r") as transcript:
        results.transcript = transcript.read()

    # Get a list of words
    words = data["words"]
    duration = 0.00

    # For each word, add a word to our results
    for w in words:
        time = float(w["time"])
        end = time + float(w["duration"])
        # Keep track of the latest end time and use it as the duration
        if end > duration:
            duration = end
        results.addWord("", time, end, w["word"], None, None)

    # Create the media object
    media = SpeechToTextMedia(duration, media_file)

    # Create the final object
    outputFile = SpeechToText(media, results)

    # Write the output
    mgm_utils.write_json_file(outputFile, output_json_file)
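# Judging from the fields accessed above, the Kaldi JSON presumably has this shape
# (values illustrative):
#
# {
#   "words": [
#     {"word": "hello", "time": "0.38", "duration": "0.30"},
#     {"word": "world", "time": "0.71", "duration": "0.42"}
#   ]
# }
#
# so each word's end time is time + duration, and the largest end time becomes the
# overall duration of the media.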
def main():
    with tempfile.TemporaryDirectory(dir="/tmp") as tmpdir:
        (input_file, output_name) = sys.argv[1:3]
        dateTimeObj = datetime.now()

        # ffmpeg extracts the frames from the video input (2 frames per second)
        command = "ffmpeg -i " + input_file + " -an -vf fps=2 '" + tmpdir + "/frame_%05d_" + str(dateTimeObj) + ".jpg'"
        subprocess.call(command, shell=True)

        # Tesseract runs the ocr on the extracted frames
        script_start = time.time()

        # Get some stats on the video
        (dim, frameRate, numFrames) = findVideoMetada(input_file)

        output = {
            "media": {
                "filename": input_file,
                "frameRate": frameRate,
                "numFrames": numFrames,
                "resolution": {
                    "width": int(dim[0]),
                    "height": int(dim[1])
                }
            },
            "frames": []
        }

        # for every saved frame: at fps=2, frame n starts at n * 0.5 seconds
        for num, img in enumerate(sorted(os.listdir(tmpdir))):
            start_time = 0.5 * num
            frameList = {"start": str(start_time), "objects": []}

            # Run OCR
            result = pytesseract.image_to_data(Image.open(tmpdir + "/" + img), output_type=Output.DICT)

            # For every result, make a box & add it to this frame's list of boxes (frameList)
            for i in range(len(result["text"])):
                if result["text"][i].strip():  # if the text isn't empty/whitespace
                    box = {
                        "text": result["text"][i],
                        "score": {
                            "type": "confidence",
                            "scoreValue": result["conf"][i]
                        },
                        # relative coords
                        "vertices": {
                            "xmin": result["left"][i] / output["media"]["resolution"]["width"],
                            "ymin": result["top"][i] / output["media"]["resolution"]["height"],
                            "xmax": (result["left"][i] + result["width"][i]) / output["media"]["resolution"]["width"],
                            "ymax": (result["top"][i] + result["height"][i]) / output["media"]["resolution"]["height"]
                        }
                    }
                    frameList["objects"].append(box)

            # save the frame if it had text
            if len(frameList["objects"]) > 0:
                output["frames"].append(frameList)

        # save the output json file
        mgm_utils.write_json_file(output, output_name)
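# findVideoMetada is defined elsewhere (note the spelling of the identifier); here is a
# hypothetical implementation using ffprobe, matching the ((width, height), frameRate,
# numFrames) shape consumed above.
import json
import subprocess

def find_video_metadata_sketch(input_file):
    probe = subprocess.check_output([
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_entries", "stream=width,height,avg_frame_rate,nb_frames",
        "-of", "json", input_file])
    stream = json.loads(probe)["streams"][0]
    num, den = stream["avg_frame_rate"].split("/")
    frame_rate = float(num) / float(den)
    return ((stream["width"], stream["height"]), frame_rate, int(stream["nb_frames"]))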
def main():
    apiUrl = "https://api.videoindexer.ai"
    (input_file, include_ocr, location, root_dir, index_file, ocr_file) = sys.argv[1:7]

    try:
        import http.client as http_client
    except ImportError:
        # Python 2
        import httplib as http_client

    config = read_config(root_dir)
    s3_bucket = config['azure']['s3Bucket']
    accountId = config['azure']['accountId']
    apiKey = config['azure']['apiKey']

    # You must initialize logging, otherwise you'll not see debug output.
    logging.basicConfig()

    # Turn on HTTP debugging
    http_client.HTTPConnection.debuglevel = 1

    s3_path = upload_to_s3(input_file, s3_bucket)
    print("S3 path " + s3_path)

    # Get an authorization token for subsequent requests
    auth_token = get_auth_token(apiUrl, location, accountId, apiKey)

    video_url = "https://" + s3_bucket + ".s3.us-east-2.amazonaws.com/" + s3_path

    # Upload the video and get the ID to reference for indexing status and results
    videoId = upload_video(apiUrl, location, accountId, auth_token, input_file, video_url)

    # Check on the indexing status
    while True:
        # The token expires after an hour, so refresh it on every iteration
        video_auth_token = get_video_auth_token(apiUrl, location, accountId, apiKey, videoId)
        state = get_processing_status(apiUrl, location, accountId, videoId, video_auth_token)

        # Any status other than uploaded or processing means indexing is complete
        if state != "Uploaded" and state != "Processing":
            break

        # Wait a bit before checking again
        time.sleep(60)

    # Get the simple video index json
    auth_token = get_auth_token(apiUrl, location, accountId, apiKey)
    index_json = get_video_index_json(apiUrl, location, accountId, videoId, auth_token, apiKey)
    mgm_utils.write_json_file(index_json, index_file)

    # Get the advanced OCR json via the artifact URL if requested
    if include_ocr.lower() == 'true':
        artifacts_url = get_artifacts_url(apiUrl, location, accountId, videoId, auth_token, 'ocr')
        download_artifacts(artifacts_url, ocr_file)
    # TODO: otherwise, should we generate a dummy file so an empty output doesn't cause an error?

    delete_from_s3(s3_path, s3_bucket)
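# get_auth_token is defined elsewhere; a hypothetical sketch against the classic Video
# Indexer Get-Account-Access-Token endpoint, assuming the `requests` library is available.
import requests

def get_auth_token_sketch(api_url, location, account_id, api_key):
    r = requests.get(
        api_url + "/Auth/" + location + "/Accounts/" + account_id + "/AccessToken",
        params={"allowEdit": "true"},
        headers={"Ocp-Apim-Subscription-Key": api_key})
    r.raise_for_status()
    return r.json()  # the token itself, returned as a JSON string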
def main():
    (media_file, transcribe_file, output_stt_json_file, output_seg_json_file) = sys.argv[1:5]
    mgm_utils.exception_if_file_not_exist(transcribe_file)

    # Open the transcribe output
    with open(transcribe_file) as json_file:
        data = json.load(json_file)

    amp_results = SpeechToTextResult()

    # Fail if we don't have results
    if "results" not in data.keys():
        exit(1)

    aws_results = data["results"]
    if "transcripts" not in aws_results.keys():
        exit(1)

    # Parse the transcripts
    transcripts = aws_results["transcripts"]
    for t in transcripts:
        amp_results.transcript = amp_results.transcript + t["transcript"]

    # Fail if we don't have any items
    if "items" not in aws_results.keys():
        exit(1)

    # Parse items (words)
    items = aws_results["items"]
    duration = 0.00

    # For each item, get the necessary parts and store as a word
    for i in items:
        alternatives = i["alternatives"]
        # Each word is stored as an "alternative"; choose the one with the maximum confidence
        max_confidence = 0.00
        text = ""
        for a in alternatives:
            if float(a["confidence"]) >= max_confidence:
                max_confidence = float(a["confidence"])
                text = a["content"]

        end_time = -1
        start_time = -1

        # There are two item types (punctuation, pronunciation); only pronunciation items carry times
        if i["type"] == "pronunciation":
            end_time = float(i["end_time"])
            start_time = float(i["start_time"])
            # If this is the greatest end time, store it as the duration
            if end_time > duration:
                duration = end_time

        # Add the word to the results
        amp_results.addWord(i["type"], start_time, end_time, text, "confidence", max_confidence)

    # Create the media object
    media = SpeechToTextMedia(duration, media_file)

    # Create the final object
    outputFile = SpeechToText(media, amp_results)

    # Write the output
    mgm_utils.write_json_file(outputFile, output_stt_json_file)

    # Start the segmentation schema with diarization data
    # Create a segmentation object to serialize
    seg_schema = Segmentation()

    # Create the media object
    segMedia = SegmentationMedia(duration, media_file)
    seg_schema.media = segMedia

    if "speaker_labels" in aws_results.keys():
        speakerLabels = aws_results["speaker_labels"]
        seg_schema.numSpeakers = speakerLabels["speakers"]

        # For each segment, get the start time, end time and speaker label
        segments = speakerLabels["segments"]
        for segment in segments:
            seg_schema.addDiarizationSegment(float(segment["start_time"]),
                                             float(segment["end_time"]),
                                             segment["speaker_label"])

    # Write the output
    mgm_utils.write_json_file(seg_schema, output_seg_json_file)
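# For reference, the AWS Transcribe fields read above look roughly like this
# (values illustrative):
#
# "results": {
#   "transcripts": [{"transcript": "hello world."}],
#   "items": [
#     {"type": "pronunciation", "start_time": "0.04", "end_time": "0.52",
#      "alternatives": [{"confidence": "0.99", "content": "hello"}]},
#     {"type": "punctuation",
#      "alternatives": [{"confidence": "0.0", "content": "."}]}
#   ],
#   "speaker_labels": {
#     "speakers": 1,
#     "segments": [{"start_time": "0.0", "end_time": "0.52", "speaker_label": "spk_0"}]
#   }
# }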
def main():
    (input_file, json_file, bucketName, dataAccessRoleArn) = sys.argv[1:5]

    # Read an optional list of categories to ignore when outputting the entity list
    ignore_cats_list = list()
    if len(sys.argv) > 5:
        print("ignore cats:" + sys.argv[5])
        ignore_cats_list = split_ignore_list(sys.argv[5])

    # Variable declarations
    outputS3Uri = 's3://' + bucketName + '/'
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    jobName = 'AwsComprehend-' + timestamp + ".json"
    inputS3Uri = outputS3Uri + jobName

    # Get the transcript text from the input file
    with open(input_file, 'r') as file:
        stt = SpeechToText().from_json(json.load(file))

    # Create the ner object
    ner = EntityExtraction()

    # Add the media information
    if stt is None or stt.results is None:
        mediaLength = 0
    else:
        mediaLength = len(stt.results.transcript)

    # If we have a blank file, don't error; create another blank json file to pass to the next process
    if mediaLength == 0:
        ner.media = EntityExtractionMedia(mediaLength, input_file)
        mgm_utils.write_json_file(ner, json_file)
        exit(0)

    # Create a temp file to upload to S3
    tmpfile = create_temp_transcript_file(jobName, stt.results.transcript)

    # Copy the temporary text file to S3
    copy_to_s3(tmpfile.name, bucketName, jobName)

    # Make the call to aws comprehend
    output_uri = run_comprehend_job(jobName, inputS3Uri, outputS3Uri, dataAccessRoleArn)

    uncompressed_file = download_from_s3(output_uri, outputS3Uri, bucketName)
    if uncompressed_file is None:
        exit(1)

    comprehend_data = read_comprehend_response(uncompressed_file)
    ner.media = EntityExtractionMedia(mediaLength, input_file)

    # Variables for filling time offsets based on speech to text
    lastPos = 0  # Index to keep track of our position in the STT word list
    sttWords = len(stt.results.words)  # Number of STT words

    if 'Entities' in comprehend_data.keys():
        for entity in comprehend_data["Entities"]:
            entity_type = entity["Type"]
            # Start and end time offsets
            start = None
            end = None
            text = entity["Text"]
            # Split the entity into an array of words based on whitespace
            entityParts = text.split()
            # For each word in the entity, find the corresponding word in the STT word list
            foundWordPos = None
            for entityPart in entityParts:
                for wordPos in range(lastPos, sttWords):
                    word = stt.results.words[wordPos]
                    # If it matches, set the time offset
                    if clean_entity_word(word.text) == clean_entity_word(entityPart):
                        # Keep track of the last position to save iterations
                        foundWordPos = wordPos
                        # Set start if we haven't set it yet
                        if start is None:
                            start = word.start
                        end = word.end
                        break
                    else:
                        start = None
                        end = None
                        foundWordPos = None
            if start is not None:
                lastPos = foundWordPos
            else:
                print("Could not find word")
                print(text)
                print(entityParts)
                print(lastPos)
            if clean_text(entity_type) not in ignore_cats_list and start is not None:
                # AMP-636 removed startOffset=endOffset=end=None
                ner.addEntity(entity_type, text, None, None, "relevance", float(entity["Score"]), start, None)

    # Write the json file
    mgm_utils.write_json_file(ner, json_file)

    # Clean up temp files
    safe_delete(uncompressed_file)
    safe_delete(tmpfile.name)
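# split_ignore_list, clean_text, and clean_entity_word are defined elsewhere in this
# script; hypothetical versions consistent with their use above:
import re

def split_ignore_list_sketch(arg):
    # "QUANTITY, DATE" -> ["quantity", "date"]
    return [c.strip().lower() for c in arg.split(',')]

def clean_text_sketch(text):
    return text.strip().lower()

def clean_entity_word_sketch(word):
    # Strip punctuation so STT tokens and Comprehend tokens compare equal
    return re.sub(r'[^\w]', '', word).lower()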
def main(): """ Submit a job to run ina speech segmenter on HPC """ parser = argparse.ArgumentParser(description=main.__doc__) parser.add_argument("--debug", default=False, action="store_true", help="Turn on debugging") parser.add_argument("root_dir", help="Galaxy root directory") parser.add_argument("input", help="input audio file") parser.add_argument("segments", help="INA Speech Segmenter output") parser.add_argument("amp_segments", help="AMP Segmentation Schema output") parser.add_argument("hpc_timestamps", help="HPC Timestamps output") args = parser.parse_args() # set up logging logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, stream=sys.stderr, format="%(asctime)s %(levelname)s %(message)s") config = mgm_utils.get_config(args.root_dir) dropbox = config["hpc"]["dropbox"] # job parameters job = { 'script': 'ina_speech_segmenter', 'input_map': { 'input': args.input }, 'output_map': { 'segments': args.segments } } print("Submitting job to HPC") job = hpc_submit.submit_and_wait(dropbox, job) print("Checking job status: " + job['job']['status']) if job['job']['status'] != 'ok': exit(1) print("Reading TSV into list of tuples") with open(args.segments, 'r') as csvin: data = [tuple(line) for line in csv.reader(csvin, delimiter='\t')] print("Converting ina output to segmentation schema") # Convert the resulting list of tuples to an object for serialization seg_schema = convert_to_segmentation_schema(args.input, data) print("Writing output json") # Serialize the json and write it to destination file mgm_utils.write_json_file(seg_schema, args.amp_segments) print("Job output:") print(job) # Write the hpc timestamps output if "start" in job['job'].keys() and "end" in job['job'].keys(): ts_output = { "start_time": job['job']["start"], "end_time": job['job']["end"], "elapsed_time": (datetime.strptime(job['job']["end"], '%Y-%m-%d %H:%M:%S.%f') - datetime.strptime(job['job']["start"], '%Y-%m-%d %H:%M:%S.%f')).total_seconds() } mgm_utils.write_json_file(ts_output, args.hpc_timestamps) exit(0)
def main():
    (root_dir, from_draftjs, original_transcript, to_transcript) = sys.argv[1:5]

    # use the output filename for the log instead of the input filename, as the former is
    # unique per job while the latter could be shared by multiple jobs
    logger = MgmLogger(root_dir, "hmgm_transcript", to_transcript)
    sys.stdout = logger
    sys.stderr = logger

    try:
        # if from_draftjs is in error, raise an exception to notify the HMGM job runner to fail the job;
        # otherwise, if from_draftjs doesn't exist yet, exit 1 to keep waiting
        mgm_utils.exit_if_file_not_ready(from_draftjs)
        print("Converting DraftJs " + from_draftjs + " to Transcript " + to_transcript)

        with open(from_draftjs) as json_file:
            data = json.load(json_file)

        # read the original file to extract the confidence score of each word
        with open(original_transcript) as original_input:
            original_json = json.loads(original_input.read())
        original_items = original_json["results"]["words"]

        results = SpeechToTextResult()
        word_type = text = ''
        confidence = start_time = end_time = -1
        duration = 0.0

        # the draftJs input file here always comes from a converted and corrected AMP Transcript,
        # so it should always contain 'entityMap'; otherwise an error should occur
        transcript = ''
        entityMap = data["entityMap"]
        for i in range(0, len(entityMap.keys())):
            punctuation = ''
            if str(i) not in entityMap.keys():
                continue
            entity = entityMap[str(i)]
            if "data" in entity:
                if "text" in entity["data"].keys():
                    text = entity["data"]["text"]
                    transcript += entity["data"]["text"] + " "
                    if text and text[-1] in string.punctuation:
                        punctuation = text[-1]
                        text = text[0:-1]
            if "type" in entity:
                entity_type = entity["type"]
                if entity_type == "WORD":
                    word_type = "pronunciation"
                    if "start" in entity["data"]:
                        start_time = float(entity["data"]["start"])
                    if "end" in entity["data"]:
                        end_time = float(entity["data"]["end"])
                        if end_time > duration:
                            duration = end_time
                else:
                    word_type = entity_type
            results.addWord(word_type, start_time, end_time, text, "confidence", confidence)
            if len(punctuation) > 0:
                results.addWord('punctuation', None, None, punctuation, "confidence", 0.0)

        results.transcript = transcript
        words = results.words

        # Now retrieve the confidence values from the original input file and assign them to the results
        list_items = []
        list_result = []
        for i in range(0, len(original_items)):
            list_items.append(original_items[i]["text"])
        for j in range(0, len(words)):
            list_result.append(words[j].text)

        d = difflib.Differ()
        res = list(d.compare(list_items, list_result))
        i = j = 0
        word_count = len(words)
        original_item_count = len(original_items)
        print("original item count: " + str(original_item_count))
        print("word count: " + str(word_count))
        for ele in res:
            if j >= word_count or i >= original_item_count:
                break
            elif ele.startswith("- "):
                i += 1
            elif len(ele) > 2 and ele[0:2] == "+ ":
                words[j].score.scoreValue = 1.0
                j += 1
            elif ele[0:1] == " " and words[j].text == original_items[i]["text"]:
                if "score" in original_items[i]:
                    words[j].score.scoreValue = float(original_items[i]["score"]["scoreValue"])
                else:
                    words[j].score.scoreValue = 1.0  # default the score to 1.0 if it didn't exist originally
                i += 1
                j += 1
        print("i: " + str(i) + " j:" + str(j))

        # Create the media object
        media = SpeechToTextMedia(duration, original_transcript)

        # Create the final object
        stt = SpeechToText(media, results)

        # Write the output
        mgm_utils.write_json_file(stt, to_transcript)
        print("Successfully converted from DraftJs " + from_draftjs + " to Transcript " + to_transcript)
        # as the last command in HMGM, implicitly exit 0 here to let the whole job complete in success
    except Exception as e:
        # as the last command in HMGM, exit -1 to let the whole job fail
        print("Failed to convert from DraftJs " + from_draftjs + " to Transcript " + to_transcript, e)
        traceback.print_exc()
        sys.stdout.flush()
        exit(-1)
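# For reference, the DraftJs entityMap consumed above presumably looks like this
# (values illustrative): keys are stringified indexes, WORD entities carry the token
# text plus start/end times, and trailing punctuation rides on the text.
#
# "entityMap": {
#   "0": {"type": "WORD", "data": {"text": "hello", "start": "0.04", "end": "0.52"}},
#   "1": {"type": "WORD", "data": {"text": "world.", "start": "0.59", "end": "1.01"}}
# }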
def write_amp_json(temp_gentle_output, original_transcript, amp_transcript_output):
    # Create the amp transcript
    output = dict()
    with open(temp_gentle_output, "r") as gentle_output_file:
        gentle_output = json.load(gentle_output_file)

    output["media"] = original_transcript["media"]
    output["results"] = dict()
    output["results"]["transcript"] = original_transcript["results"]["transcript"]
    output["results"]["words"] = list()

    previous_end = 0
    last_success_index = 0
    # enumerate instead of list.index(word): index() returns the first equal dict,
    # which gives the wrong position when the same word appears more than once
    for word_index, word in enumerate(gentle_output["words"]):
        # Make sure we have all the data
        if word["case"] == 'success':
            previous_end = word["end"]
            output["results"]["words"].append({
                "type": "pronunciation",
                "start": word["start"],
                "end": word["end"],
                "text": word["word"],
                "score": {
                    "type": "confidence",
                    "scoreValue": 1.0
                }
            })
        else:
            next_success_index = find_next_success(gentle_output, word_index)
            avg_time = 0
            # If we found another success
            if next_success_index > word_index:
                # Average the times based on how many words are in between
                next_success_word = gentle_output["words"][next_success_index]
                skips_ahead = (next_success_index - last_success_index)
                avg_time = (next_success_word["start"] - previous_end) / skips_ahead
                print("Averaging time from next success")
            else:
                duration = original_transcript["results"]["duration"]
                skips_ahead = (len(gentle_output["words"]) - word_index) + 1
                avg_time = (duration - previous_end) / skips_ahead
                print("Averaging time from end of file")

            # From the previous word's end (last recorded), skip time ahead
            time = previous_end + avg_time
            previous_end = time
            print(word["word"] + " at index " + str(word_index))
            print("Avg_time " + str(avg_time) + " Skips ahead " + str(skips_ahead))

            # Add the word to the results
            output["results"]["words"].append({
                "type": "pronunciation",
                "start": time,
                "end": time,
                "text": word["word"],
                "score": {
                    "type": "confidence",
                    "scoreValue": 1.0
                }
            })
        last_success_index = word_index

    mgm_utils.write_json_file(output, amp_transcript_output)
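# Gentle's word entries carry a "case" of "success" (with start/end times) or
# "not-found-in-audio" (without). find_next_success is defined elsewhere; a hypothetical
# sketch consistent with its use above:
def find_next_success_sketch(gentle_output, word_index):
    # Return the index of the next successfully aligned word, or the current index
    # if there is none (so the caller falls back to end-of-file averaging)
    for i in range(word_index + 1, len(gentle_output["words"])):
        if gentle_output["words"][i]["case"] == "success":
            return i
    return word_index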