def create_manifest(content_id, path_to_saved_folder):
    """Write a manifest.json describing the source PDFs and GOCR extracts for a content id."""
    manifest = {}
    path_to_manifest = ""
    try:
        if os.path.exists(path_to_saved_folder):
            path_to_manifest = os.path.join(path_to_saved_folder, "manifest.json")
            manifest["source"] = {"name": content_id}
            # source PDFs:
            arr = []
            for i in findFiles(path_to_saved_folder, ["pdf"]):
                arr.append({"id": content_id, "path": i})
            manifest["source"]["path"] = arr
            # full-text annotation files produced by GOCR:
            arr = []
            for i in findFiles(path_to_saved_folder, ["txt"]):
                arr.append({"id": content_id, "path": i, "Type": "gocr"})
            manifest["extract"] = {"fulltextAnnotation": arr}
            # raw API responses:
            arr = []
            for i in os.listdir(os.path.join(path_to_saved_folder, "raw_data")):
                if i != '.DS_Store':
                    arr.append({"id": content_id + "_blob_gocr",
                                "path": i,
                                "Type": "gocr"})
            manifest["extract"]["api_response"] = arr
            with open(path_to_manifest, "w") as json_file:
                json.dump(manifest, json_file, indent=4)
        else:
            print("Path does not exist!")
    except Exception:
        print("Error in manifest file creation")
    return path_to_manifest
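# Usage sketch for create_manifest (hypothetical id and folder; the folder is
# expected to hold the PDFs, txt extracts and a `raw_data` subfolder):
#
#   manifest_path = create_manifest("do_1234", "/tmp/do_1234")
#   print(manifest_path)  # "" if /tmp/do_1234 does not exist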
def speech_to_text(method, path_to_assets, GOOGLE_APPLICATION_CREDENTIALS):
    """Transcribe the mp3 files in path_to_assets using the chosen STT method."""
    logging.info("STT_START")
    text = ""
    if not os.path.exists(path_to_assets):
        logging.info("No audio file detected")
    else:
        audio_names = findFiles(path_to_assets, ['mp3'])
        if method == "googleAT" and len(audio_names) > 0:
            try:
                for i in audio_names:
                    logging.info("STT_AUDIO_FILEPATH: {0}".format(
                        os.path.join(path_to_assets, i)))
                    # split long recordings into chunks the API can handle:
                    path_to_split = audio_split(
                        os.path.join(path_to_assets, i),
                        os.path.join(path_to_assets, "audio_split"))
                    logging.info("STT_AUDIO_SPLIT: {0}".format(path_to_split))
                    text += audio_to_text(path_to_split,
                                          GOOGLE_APPLICATION_CREDENTIALS)
            except Exception:
                # fall back to an empty transcript rather than failing the pipeline
                text = ""
        elif method == "none":
            logging.info("STT_NOT_PERFORMED")
        else:
            logging.info("Unknown method given")
    logging.info("STT_STOP")
    return {"text": text}
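# Usage sketch (hypothetical asset folder and credentials path):
#
#   stt = speech_to_text("googleAT", "/tmp/do_1234/assets",
#                        "/path/to/gcp-credentials.json")
#   transcript = stt["text"]  # "" when no mp3 is found or the API call fails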
def video_to_speech(method, path_to_assets):
    """Extract the audio track of every video in path_to_assets as an mp3."""
    logging.info('VTS_START')
    video_names = findFiles(path_to_assets, ['mp4', 'webm'])
    logging.info('...detected {0} video files'.format(str(len(video_names))))
    if method == "ffmpeg" and len(video_names) > 0:
        logging.info("VTS_START_FOR_METHOD: {0}".format(method))
        for file in video_names:
            # ffmpy wrapper to convert mp4/webm to mp3; splitext handles
            # both extensions uniformly:
            path_to_audio = os.path.splitext(file)[0] + ".mp3"
            ff = ffmpy.FFmpeg(
                inputs={file: None},
                outputs={path_to_audio: '-vn -ar 44100 -ac 2 -ab 192 -f mp3'})
            ff.run()
            if os.path.exists(path_to_audio):
                logging.info("VTS_AUDIO_DOWNLOAD_PATH: {0}".format(path_to_audio))
            else:
                logging.info("mp3 conversion unsuccessful")
    if method == "none":
        logging.info("No video content detected")
    logging.info('VTS_STOP')
    return path_to_assets
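# Usage sketch: converts every mp4/webm under the folder to mp3 in place
# (hypothetical path; ffmpy requires the ffmpeg binary on PATH):
#
#   assets = video_to_speech("ffmpeg", "/tmp/do_1234/assets")
#   mp3s = findFiles(assets, ["mp3"])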
def run(self):
    timestamp_folder = self.inputs["timestamp_folder"].read()
    timestr = os.path.split(timestamp_folder)[1]
    epoch_time = time.mktime(time.strptime(timestr, "%Y%m%d-%H%M%S"))
    # quick health check against the local Elasticsearch instance:
    es_request = requests.get('http://localhost:9200')
    content_to_textpath = os.path.join(timestamp_folder, "content_to_text")
    cid_name = [i for i in os.listdir(content_to_textpath)
                if i not in ['.DS_Store']]
    # change 'es' to 'localhost' when running outside the docker network:
    elastic_search = Elasticsearch([{'host': 'es', 'port': 9200}])
    for cid in cid_name:
        merge_json_list = []
        json_file = findFiles(os.path.join(content_to_textpath, cid), ["json"])
        logging.info("json_files are: {0}".format(json_file))
        for file in json_file:
            if os.path.split(file)[1] in ["ML_keyword_info.json",
                                          "ML_content_info.json"]:
                merge_json_list.append(file)
        autotagging_json = merge_json(merge_json_list)
        autotagging_json.update({"ETS": epoch_time})
        if es_request.status_code == 200:
            elastic_search.index(index="auto_tagging",
                                 doc_type='content_id_info',
                                 id=cid,
                                 body=autotagging_json)
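# The documents indexed above can later be fetched by content id, e.g.
# (hypothetical id, assuming the same 'auto_tagging' index and doc type):
#
#   doc = elastic_search.get(index="auto_tagging",
#                            doc_type="content_id_info", id="do_1234")
#   print(doc["_source"]["ETS"])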
def getblob(method_of_ocr, bucket_name, local_path_to_pdf, content_id, root_path):
    """Upload a PDF to GCS, run Google OCR on it and collect the text and raw responses locally."""
    textbook_model_path = ""
    path_to_outputjson_folder = ""
    if method_of_ocr == "GOCR":
        print("----Performing GoogleOCR Text extraction----")
        try:
            pdf_name = content_id  # alternatively: os.path.split(local_path_to_pdf)[1][:-4]
            textbook_model_path = os.path.join(root_path, pdf_name)
            print(pdf_name, textbook_model_path)
            if not os.path.exists(textbook_model_path):
                os.makedirs(textbook_model_path)
            # create the source/extract/raw_data working folders:
            location = [os.path.join(textbook_model_path, folder)
                        for folder in ['source', 'extract', 'raw_data']]
            for loc in location:
                if not os.path.exists(loc):
                    os.makedirs(loc)
            shutil.copy(local_path_to_pdf,
                        os.path.join(textbook_model_path, "source"))
            gcs_source_uri = upload_blob(bucket_name, local_path_to_pdf,
                                         pdf_name + ".pdf")
            if gcs_source_uri:
                # perform GoogleOCR:
                gcs_destination_uri = "gs://{0}/{1}".format(
                    bucket_name, os.path.split(gcs_source_uri)[1][:-4] + "/")
                print(gcs_destination_uri)
                prefix, all_text = do_GoogleOCR(gcs_source_uri,
                                                gcs_destination_uri)
                path_to_gocr_text = os.path.join(textbook_model_path,
                                                 "extract", "GOCR", "text")
                path_to_gocr_json = os.path.join(textbook_model_path,
                                                 "raw_data")
                if not os.path.exists(path_to_gocr_text):
                    os.makedirs(path_to_gocr_text)
                with open(os.path.join(path_to_gocr_text, prefix + ".txt"),
                          "w") as text_file:
                    text_file.write(all_text)
                # concatenate multiple text files, if any:
                textnames = findFiles(path_to_gocr_text, ["txt"])
                with open(os.path.join(path_to_gocr_text,
                                       "fulltext_annotation.txt"),
                          'w') as outfile:
                    for fname in textnames:
                        with open(fname) as infile:
                            for line in infile:
                                outfile.write(line)
                        os.remove(fname)
                path_to_outputjson_folder = download_outputjson_reponses(
                    bucket_name, prefix + "/", path_to_gocr_json,
                    delimiter="/")
        except Exception:
            print("Process terminated")
    return textbook_model_path
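# Usage sketch (hypothetical bucket and paths; needs GCP credentials with
# access to the bucket and the Vision API enabled):
#
#   out_dir = getblob("GOCR", "my-ocr-bucket", "/tmp/do_1234.pdf",
#                     "do_1234", "/tmp/textbooks")
#   # out_dir now contains source/, extract/GOCR/text/ and raw_data/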
def pdf_to_text(method, path_to_assets, pdf_url):
    """Extract text from local PDFs in path_to_assets (or from pdf_url when
    no local PDF exists) using PyPDF2 or pdfminer."""
    text = ""
    number_of_pages = 0
    logging.info("PTT_START")
    pdf_names = findFiles(path_to_assets, ['pdf'])
    print("----->pdf_names: ", pdf_names)

    def read_remote_pdf(url):
        # fetch the PDF into memory and wrap it in a PyPDF2 reader
        r = requests.get(url)
        return PdfFileReader(io.BytesIO(r.content))

    def extract_pages(read_pdf):
        # PyPDF2 page-by-page text extraction
        pages_text = ""
        for i in range(read_pdf.getNumPages()):
            pages_text += read_pdf.getPage(i).extractText()
        return pages_text

    if method == "PyPDF2":
        logging.info("PTT_METHOD: {0}".format(method))
        if len(pdf_names) == 0 and pdf_url.endswith('pdf'):
            read_pdf = read_remote_pdf(pdf_url)
            number_of_pages = read_pdf.getNumPages()
            text += extract_pages(read_pdf)
        for pdf_file in pdf_names:
            with open(pdf_file, 'rb') as f:
                read_pdf = PdfFileReader(f)
                number_of_pages = read_pdf.getNumPages()
                text += extract_pages(read_pdf)
    if method == "pdfminer":
        logging.info("PTT_METHOD: {0}".format(method))
        if len(pdf_names) == 0 and pdf_url.endswith('pdf'):
            # the remote case still falls back to PyPDF2 for page counting
            # and extraction:
            read_pdf = read_remote_pdf(pdf_url)
            number_of_pages = read_pdf.getNumPages()
            text += extract_pages(read_pdf)
        for pdf_file in pdf_names:
            text += convert_pdf_to_txt(pdf_file)
            number_of_pages = 0
    if method == "none":
        logging.info("PTT_NOT_PERFORMED")
    # strip digits and collapse whitespace:
    processed_txt = clean_text(text)
    text = ''.join([ch for ch in processed_txt if not ch.isdigit()])
    text = ' '.join(text.split())
    logging.info("PTT_STOP")
    return {"text": text, "no_of_pages": number_of_pages}
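# Usage sketch (hypothetical folder and URL; the URL is only fetched when no
# local PDF is present):
#
#   result = pdf_to_text("PyPDF2", "/tmp/do_1234/assets",
#                        "https://example.org/sample.pdf")
#   print(result["no_of_pages"], len(result["text"]))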
def run(self, write_to_kafkaTopic):
    path_to_contentKeywords = self.inputs["path_to_contentKeywords"].read()
    pathTocredentials = self.inputs["pathTocredentials"].read_loc()
    timestamp_folder = os.path.split(path_to_contentKeywords)[0]
    timestr = os.path.split(timestamp_folder)[1]
    epoch_time = time.mktime(time.strptime(timestr, "%Y%m%d-%H%M%S"))
    content_to_textpath = os.path.join(timestamp_folder, "content_to_text")
    cid_name = [i for i in os.listdir(content_to_textpath)
                if i not in ['.DS_Store']]
    for cid in cid_name:
        merge_json_list = []
        json_file = findFiles(os.path.join(content_to_textpath, cid), ["json"])
        for file in json_file:
            if os.path.split(file)[1] in ["ML_keyword_info.json",
                                          "ML_content_info.json"]:
                merge_json_list.append(file)
        ignore_list = ["ets"]
        dict_list = []
        for file in merge_json_list:
            with open(file, "r", encoding="UTF-8") as info:
                new_json = json.load(info)
            # drop keys that should not survive the merge:
            for ignore in ignore_list:
                new_json.pop(ignore, None)
            dict_list.append(new_json)
        # merge the nested jsons:
        autotagging_json = reduce(merge_json, dict_list)
        autotagging_json.update({"ets": epoch_time})
        with open(os.path.join(timestamp_folder, "content_to_text", cid,
                               "autoTagging_json.json"), "w+") as main_json:
            json.dump(autotagging_json, main_json, sort_keys=True, indent=4)
        # write the merged event to the kafka topic:
        kafka_cli = KafkaCLI(pathTocredentials)
        status = kafka_cli.write(autotagging_json, write_to_kafkaTopic)
        if status:
            logging.info("******Transaction event successfully pushed to topic:{0}".format(write_to_kafkaTopic))
        else:
            logging.info("******Error pushing the event")
    # remove the timestamp folder:
    shutil.rmtree(timestamp_folder)
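# merge_json above is assumed to deep-merge two dicts when used with reduce;
# a minimal sketch of such a helper, for reference (not necessarily the
# pipeline's own implementation):
#
#   def merge_json(a, b):
#       out = dict(a)
#       for k, v in b.items():
#           if k in out and isinstance(out[k], dict) and isinstance(v, dict):
#               out[k] = merge_json(out[k], v)
#           else:
#               out[k] = v
#       return out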
def ecar_unzip(download_location, copy_location):  # ekstep_ecar_unzip
    """
    Unzips an ecar file (the EkStep file format) and parses all its subfolders.
    All files are copied into an ``assets``, ``data`` or ``items`` folder
    (the same names as in the downloaded folder), based on their location in
    the downloaded folder.

    :param download_location(str): location on disk where the ekstep ecar resource file was downloaded
    :param copy_location(str): location on disk where the ecar is unwrapped
    """
    assert isinstance(download_location, str)
    assert isinstance(copy_location, str)
    if not os.path.exists(copy_location):
        os.makedirs(copy_location)
    # create the sub-directories in which the files will eventually be stored:
    location = [os.path.join(copy_location, folder)
                for folder in ['assets', 'data', 'items']]
    for loc in location:
        if not os.path.exists(loc):
            os.makedirs(loc)
    ecar_extensions = ['png', 'gif', 'jpg', 'mp4', 'webm', 'pdf', 'mp3', 'ecml']
    files_found = findFiles(download_location, ecar_extensions)
    if files_found:
        for file in files_found:
            if file.endswith("ecml"):
                shutil.copy(file, copy_location)
            else:
                shutil.copy(file, os.path.join(copy_location, "assets"))
    else:
        print("No files to copy!")
    # delete the messy download directory:
    if os.path.exists(download_location):
        shutil.rmtree(download_location)
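# Usage sketch (hypothetical paths; note that the download folder is removed
# once its files have been copied):
#
#   ecar_unzip("/tmp/ecar_download/do_1234", "/tmp/do_1234")
#   # /tmp/do_1234 now holds assets/, data/, items/ and any .ecml file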
def image_to_text(method, path_to_assets):
    """Extract text/tags from the image files in path_to_assets using the chosen method."""
    logging.info("ITT_START")
    image_text = ""
    image_names = findFiles(path_to_assets, ['png', 'gif', 'jpg'])
    if method == "googleVision" and len(image_names) > 0:
        try:
            logging.info('...detected {0} image files'.format(
                str(len(image_names))))
            logging.info('...image file processing started')
            for file in image_names:
                try:
                    image_text += getImgTags(file)
                except Exception:
                    print('........ Error: could not process file')
            print("Text: ", image_text)
            # de-duplicate the detected tags:
            text = list(str(image_text.lower()).split("\n"))
            image_text = ' '.join(list(set(text)))
        except Exception:
            image_text = ""
    if method == "none":
        logging.info("ITT_NOT_PERFORMED")
    logging.info("ITT_STOP")
    return {"text": image_text}
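# Usage sketch (hypothetical folder; getImgTags is assumed to call the
# Google Vision API and return newline-separated labels):
#
#   itt = image_to_text("googleVision", "/tmp/do_1234/assets")
#   labels = itt["text"]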
def multimodal_text_enrichment(index, timestr, content_meta, content_type,
                               content_to_text_path,
                               GOOGLE_APPLICATION_CREDENTIALS):
    """
    A custom function to extract text for a given Content id from a Content
    meta dataframe extracted using the Content V2 api

    Parameters
    ----------
    :param index(int): row id for the Content
    :param timestr(str): timestamp folder name in ``%Y%m%d-%H%M%S`` format
    :param content_meta(dataframe): A dataframe of Content metadata.
     Mandatory fields for content_meta are ``['artifactUrl', 'content_type',
     'downloadUrl', 'gradeLevel', 'identifier', 'keywords', 'language', 'subject']``
    :param content_type(dict): extraction configuration per content type
     (``youtube, pdf, ecml, unknown``)
    :param content_to_text_path(str): path to save the extracted text
    :returns: Path where the ML_content_info json is saved
    """
    type_of_url = content_meta.iloc[index]["derived_contentType"]
    id_name = content_meta["identifier"][index]
    downloadField = content_type[type_of_url]["contentDownloadField"]
    url = content_meta[downloadField][index]
    logging.info("MTT_START_FOR_INDEX {0}".format(index))
    logging.info("MTT_START_FOR_CID {0}".format(id_name))
    logging.info("MTT_START_FOR_URL {0}".format(url))
    # start text extraction pipeline:
    try:
        start = time.time()
        path_to_id = download_content(type_of_url, url, content_to_text_path,
                                      id_name)
        print("path_to_id", path_to_id)
        path_to_assets = os.path.join(path_to_id, "assets")
        if type_of_url != "pdf":
            path_to_audio = video_to_speech(
                content_type[type_of_url]["video_to_speech"], path_to_assets)
            print(path_to_audio)
            if len(findFiles(path_to_assets, ["mp3"])) > 0:
                audio = AudioSegment.from_mp3(
                    findFiles(path_to_assets, ["mp3"])[0])
                duration = round(len(audio) / 1000)
            else:
                duration = 0
        textExtraction_pipeline = [
            (speech_to_text, (content_type[type_of_url]["speech_to_text"],
                              path_to_assets, GOOGLE_APPLICATION_CREDENTIALS)),
            (image_to_text, (content_type[type_of_url]["image_to_text"],
                             path_to_assets)),
            (pdf_to_text, (content_type[type_of_url]["pdf_to_text"],
                           path_to_assets, url)),
            (ecml_index_to_text, (content_type[type_of_url]["ecml_index_to_text"],
                                  path_to_id))
        ]
        path_to_transcript = os.path.join(path_to_id, "enriched_text.txt")
        text = ""
        for method, param_tuple in textExtraction_pipeline:
            text += method(*param_tuple)["text"]
        # add description and title to the text, only for PDF content:
        if type_of_url == "pdf":
            text += content_meta["name"].iloc[index] + " " + \
                content_meta["description"].iloc[index]
        if os.path.exists(path_to_id) and text:
            with open(path_to_transcript, "w") as myTextFile:
                myTextFile.write(text)
        # num_of_PDFpages = pdf_to_text("none", path_to_assets, url)["no_of_pages"]
        # read pdata from the airflow dags folder:
        airflow_home = os.getenv('AIRFLOW_HOME',
                                 os.path.expanduser('~/airflow'))
        dag_location = os.path.join(airflow_home, 'dags')
        print("AIRFLOW_HOME: ", dag_location)
        filename = os.path.join(dag_location, 'graph_location')
        with open(filename, "r") as f:
            pdata = f.read().splitlines()[-1]
        # estimate ets:
        epoch_time = time.mktime(time.strptime(timestr, "%Y%m%d-%H%M%S"))
        domain = content_meta["subject"][index]
        object_type = content_meta["objectType"][index]
        template = ""
        plugin_used = []
        num_of_stages = 0
        # only for type ecml:
        if type_of_url == "ecml":
            parsed_ecml = ecml_index_to_text("parse", path_to_id)
            plugin_used = parsed_ecml["plugin_used"]
            num_of_stages = parsed_ecml["num_stage"]
        mnt_output_dict_new = {
            "ets": int(epoch_time),  # event generation time in epoch
            "nodeUniqueId": id_name,  # content id
            "operationType": "UPDATE",  # defaults to UPDATE
            "nodeType": "DATA_NODE",  # defaults to DATA_NODE
            "graphId": domain,  # defaults to domain
            "objectType": object_type,  # object type - content, worksheet, textbook, collection etc
            "nodeGraphId": 0,  # defaults to 0
            "transactionData": {
                "properties": {
                    "tags": {
                        "system_contentType": type_of_url,  # can be "youtube", "ecml", "pdf"
                        "system_medium": language_detection(text),  # generated using google language detection api
                        "duration": {
                            "video": "",  # video duration in seconds
                            "stage": ""  # can be derived from usage data
                        },
                        "num_stage": num_of_stages,  # pdf: number of pages, ecml: number of stages, video: 1
                        "system_plugins": plugin_used,  # ids of plugins used in the Content
                        "system_templates": template,  # ids of templates used in the Content
                        "text": text,
                    },
                    "version": pdata,  # yaml version
                    "uri": ""  # git commit id
                }
            }
        }
        with open(os.path.join(path_to_id, "ML_content_info.json"),
                  "w") as info:
            json.dump(mnt_output_dict_new, info, indent=4)
        stop = time.time()
        time_consumed = stop - start
        time_consumed_minutes = time_consumed / 60.0
        print("time taken in sec for text enrichment for cid -----> {0} : {1}".format(id_name, time_consumed))
        print("time taken in minutes for text enrichment for cid -----> {0} : {1}".format(id_name, time_consumed_minutes))
        logging.info("MTT_TRANSCRIPT_PATH_CREATED: {0}".format(path_to_transcript))
        logging.info("MTT_CONTENT_ID_READ: {0}".format(id_name))
        logging.info("MTT_STOP_FOR_URL {0}".format(url))
        return os.path.join(path_to_id, "ML_content_info.json")
    except Exception:
        logging.info("TextEnrichment failed for url:{0} with id:{1}".format(
            url, id_name))
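# Usage sketch (hypothetical dataframe and config; content_meta must carry the
# mandatory fields listed in the docstring, and content_type must map each
# derived_contentType to its extraction-method configuration):
#
#   info_json = multimodal_text_enrichment(
#       0, "20240101-120000", content_meta, content_type,
#       "/tmp/content_to_text", "/path/to/gcp-credentials.json")
#   # info_json is the path to the generated ML_content_info.json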