def create_manifest(content_id, path_to_saved_folder):
    manifest = {}
    path_to_manifest = ""
    try:
        if os.path.exists(path_to_saved_folder):
            path_to_manifest = os.path.join(path_to_saved_folder, "manifest.json")
            manifest["source"] = {"name": content_id}

            pdf_list = findFiles(path_to_saved_folder, ["pdf"])
            arr = []
            for i in pdf_list:
                arr.append({"id": content_id, "path": i})
            manifest["source"]["path"] = arr
            arr = []
            for i in findFiles(path_to_saved_folder, ["txt"]):
                arr.append({"id": content_id, "path": i, "Type": "gocr"})

            manifest["extract"] = {}
            manifest["extract"]["fulltextAnnotation"] = arr
            arr = []
            for i in (os.listdir(os.path.join(path_to_saved_folder, "raw_data"))):
                if i != '.DS_Store':
                    arr.append({"id": content_id+"_blob_gocr", "path": i, "Type": "gocr"})

            manifest["extract"]["api_response"] = arr
            with open(path_to_manifest, "w") as json_file:
                json.dump(manifest, json_file, indent=4)
        else:
            print("path does not exist!")
    except Exception as e:
        print("Error in manifest file creation: {0}".format(e))
    return path_to_manifest
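
The helper above depends on the repo's findFiles utility and an existing content folder layout. As a rough, standalone illustration of the manifest shape it writes, the sketch below builds a comparable dictionary with a glob-based stand-in for findFiles; the content id and folder path are hypothetical:

import glob
import json
import os

def find_by_extension(folder, extensions):
    # stand-in for the repo's findFiles: full paths of files matching the extensions
    return {ext: glob.glob(os.path.join(folder, "**", "*." + ext), recursive=True)
            for ext in extensions}

content_id = "do_12345"                 # hypothetical content id
saved_folder = "/tmp/" + content_id     # hypothetical saved folder
os.makedirs(saved_folder, exist_ok=True)

found = find_by_extension(saved_folder, ["pdf", "txt"])
manifest = {
    "source": {"name": content_id,
               "path": [{"id": content_id, "path": p} for p in found["pdf"]]},
    "extract": {"fulltextAnnotation": [{"id": content_id, "path": p, "Type": "gocr"}
                                       for p in found["txt"]]},
}
with open(os.path.join(saved_folder, "manifest.json"), "w") as fp:
    json.dump(manifest, fp, indent=4)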
Example #2
def speech_to_text(method, path_to_assets, GOOGLE_APPLICATION_CREDENTIALS):
    logging.info("STT_START")
    text = ""
    if not os.path.exists(path_to_assets):
        logging.info("No audio file detected")
    else:
        audio_names = findFiles(path_to_assets, ['mp3'])
        if method == "googleAT" and len(audio_names) > 0:
            try:
                for i in audio_names:
                    logging.info("STT_AUDIO_FILEPATH: {0}".format(
                        os.path.join(path_to_assets, i)))
                    path_to_split = audio_split(
                        os.path.join(path_to_assets, i),
                        os.path.join(path_to_assets, "audio_split"))
                    logging.info("STT_AUDIO_SPLIT: {0}".format(path_to_split))
                    text += audio_to_text(path_to_split,
                                          GOOGLE_APPLICATION_CREDENTIALS)
            except BaseException:
                text = ""
        elif method == "none":
            logging.info("STT_NOT_PERFORMED")
        else:
            logging.info("Unknown method given")
    logging.info("STT_STOP")
    text_dict = {"text": text}
    return text_dict
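
speech_to_text relies on the repo's audio_split and audio_to_text helpers, which are not shown here. Below is a minimal, standalone sketch of the splitting idea using pydub (already imported elsewhere in this module); the chunk length and paths are illustrative assumptions, not the repo's values:

import os
from pydub import AudioSegment

def split_audio(path_to_mp3, out_dir, chunk_ms=60 * 1000):
    # split an mp3 into fixed-length chunks so each piece stays small enough
    # for a speech-to-text API call; chunk_ms is an arbitrary choice here
    os.makedirs(out_dir, exist_ok=True)
    audio = AudioSegment.from_mp3(path_to_mp3)
    for n, start in enumerate(range(0, len(audio), chunk_ms)):
        chunk = audio[start:start + chunk_ms]
        chunk.export(os.path.join(out_dir, "chunk_{0}.mp3".format(n)), format="mp3")
    return out_dir

# hypothetical usage:
# split_audio("/tmp/assets/sample.mp3", "/tmp/assets/audio_split")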
Example #3
def video_to_speech(method, path_to_assets):
    logging.info('VTS_START')
    video_names = findFiles(path_to_assets, ['mp4', 'webm'])
    logging.info('...detected {0} video files'.format(str(len(video_names))))
    if method == "ffmpeg" and len(video_names) > 0:
        logging.info("VTS_START_FOR_METHOD: {0}".format(method))

        for file in video_names:
            # strip the original extension (".webm" or ".mp4") before adding ".mp3"
            if file.endswith(".webm"):
                path_to_audio = file[:-5] + ".mp3"
            else:
                path_to_audio = file[:-4] + ".mp3"
            # ffmpy wrapper to extract the audio track as mp3:
            ff = ffmpy.FFmpeg(inputs={file: None},
                              outputs={
                                  path_to_audio:
                                  '-vn -ar 44100 -ac 2 -ab 192 -f mp3'
                              })
            ff.run()
            if os.path.exists(path_to_audio):
                logging.info("VTS_AUDIO_DOWNLOAD_PATH: {0}".format(path_to_audio))
            else:
                logging.info("mp3 conversion unsuccessful")
    if method == "none":
        logging.info("No Video content detected")
    logging.info('VTS_STOP')
    return path_to_assets
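
For reference, the ffmpy wrapper above maps to a plain ffmpeg invocation. A minimal standalone sketch follows; the paths are hypothetical, ffmpeg must be on PATH, and the bitrate value here ("192k") is an assumption rather than the repo's flag:

import ffmpy

# extract the audio track of a single video into an mp3 (hypothetical paths);
# '-vn' drops video, '-ar'/'-ac'/'-ab' set sample rate, channels and bitrate
ff = ffmpy.FFmpeg(
    inputs={"/tmp/assets/lesson.mp4": None},
    outputs={"/tmp/assets/lesson.mp3": "-vn -ar 44100 -ac 2 -ab 192k -f mp3"})
print(ff.cmd)   # the exact ffmpeg command line that will be run
ff.run()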
    def run(self):
        timestamp_folder = self.inputs["timestamp_folder"].read()
        timestr = os.path.split(timestamp_folder)[1]
        epoch_time = time.mktime(time.strptime(timestr, "%Y%m%d-%H%M%S"))
        es_request = requests.get('http://localhost:9200')
        content_to_textpath = os.path.join(timestamp_folder, "content_to_text")
        cid_name = [
            i for i in os.listdir(content_to_textpath)
            if i not in ['.DS_Store']
        ]
        for cid in cid_name:
            merge_json_list = []
            json_file = findFiles(os.path.join(content_to_textpath, cid),
                                  ["json"])
            logging.info("json_files are: {0}".format(json_file))
            for file in json_file:
                if os.path.split(file)[1] in [
                        "ML_keyword_info.json", "ML_content_info.json"
                ]:
                    merge_json_list.append(file)
            autotagging_json = merge_json(merge_json_list)
            autotagging_json.update({"ETS": epoch_time})
            elastic_search = Elasticsearch([{
                'host': 'es',
                'port': 9200
            }])  # change it to localhost
            if es_request.status_code == 200:
                elastic_search.index(index="auto_tagging",
                                     doc_type='content_id_info',
                                     id=cid,
                                     body=autotagging_json)
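
The indexing call above uses the elasticsearch-py client in its pre-8 form (host/port dicts, doc_type). A compact standalone sketch of the same pattern against a local node; the index name matches the snippet, while the document body and id are placeholders:

import requests
from elasticsearch import Elasticsearch

# verify the node is reachable before indexing (same check as above)
if requests.get("http://localhost:9200").status_code == 200:
    es = Elasticsearch([{"host": "localhost", "port": 9200}])
    doc = {"keywords": ["photosynthesis"], "ETS": 1546300800.0}  # placeholder body
    es.index(index="auto_tagging", doc_type="content_id_info",
             id="do_12345", body=doc)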
Example #5
def getblob(method_of_ocr, bucket_name, local_path_to_pdf, content_id,
            root_path):
    path_to_outputjson_folder = ""
    textbook_model_path = ""
    if method_of_ocr == "GOCR":
        print("----Performing GoogleOCR Text extraction----")
        try:
            pdf_name = content_id  # os.path.split(local_path_to_pdf)[1][:-4]
            textbook_model_path = os.path.join(root_path, pdf_name)
            print(pdf_name, textbook_model_path)
            if not os.path.exists(textbook_model_path):
                os.makedirs(textbook_model_path)
            location = [
                os.path.join(textbook_model_path, folder)
                for folder in ['source', 'extract', 'raw_data']
            ]
            for loc in location:
                if not os.path.exists(loc):
                    os.makedirs(loc)
            shutil.copy(local_path_to_pdf,
                        os.path.join(textbook_model_path, "source"))
            gcs_source_uri = upload_blob(bucket_name, local_path_to_pdf,
                                         pdf_name + ".pdf")
            if gcs_source_uri:
                # perform GoogleOCR:
                gcs_destination_uri = "gs://{0}/{1}".format(
                    bucket_name,
                    os.path.split(gcs_source_uri)[1][:-4] + "/")
                print(gcs_destination_uri)
                prefix, all_text = do_GoogleOCR(gcs_source_uri,
                                                gcs_destination_uri)
                path_to_gocr_text = os.path.join(textbook_model_path,
                                                 "extract", "GOCR", "text")
                path_to_gocr_json = os.path.join(textbook_model_path,
                                                 "raw_data")
                if not os.path.exists(path_to_gocr_text):
                    os.makedirs(path_to_gocr_text)

                with open(os.path.join(path_to_gocr_text, prefix + ".txt"),
                          "w") as text_file:
                    text_file.write(all_text)
                #concatenate multiple text file if any:
                textnames = findFiles(path_to_gocr_text, ["txt"])
                with open(
                        os.path.join(path_to_gocr_text,
                                     "fulltext_annotation" + ".txt"),
                        'w') as outfile:
                    for fname in textnames:
                        with open(fname) as infile:
                            for line in infile:
                                outfile.write(line)
                            os.remove(fname)
                path_to_outputjson_folder = download_outputjson_reponses(
                    bucket_name,
                    prefix + "/",
                    path_to_gocr_json,
                    delimiter="/")
        except Exception as e:
            print("Process terminated: {0}".format(e))
    return textbook_model_path
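
upload_blob and do_GoogleOCR are project helpers that are not shown here. As a rough sketch of what the upload step presumably does with the google-cloud-storage client (bucket name, file names and the returned URI format are assumptions):

from google.cloud import storage

def upload_pdf(bucket_name, local_path, destination_name):
    # upload a local PDF to the bucket and return its gs:// URI (assumed behaviour)
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_name)
    blob.upload_from_filename(local_path)
    return "gs://{0}/{1}".format(bucket_name, destination_name)

# hypothetical usage:
# gcs_source_uri = upload_pdf("my-ocr-bucket", "/tmp/do_12345.pdf", "do_12345.pdf")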
Example #6
def pdf_to_text(method, path_to_assets, pdf_url):
    """
    """
    text = ""
    number_of_pages = 0
    logging.info("PTT_START")
    pdf_names = findFiles(path_to_assets, ['.pdf'])
    print("----->pdf_names: ", pdf_names)
    if method == "PyPDF2":
        logging.info("PTT_METHOD: {0}".format(method))
        for j in range(0, len(pdf_names) + 1):
            if (len(pdf_names) == 0 and pdf_url.endswith('pdf')):
                r = requests.get(pdf_url)
                f = io.BytesIO(r.content)
                read_pdf = PdfFileReader(f)
                number_of_pages = read_pdf.getNumPages()
            elif j < (len(pdf_names)):
                pdf_files = pdf_names[j]
                f = open(pdf_files, 'rb')
                read_pdf = PdfFileReader(f)
                number_of_pages = read_pdf.getNumPages()
            else:
                # trailing iteration (j == len(pdf_names)): keep the page count
                # from the last PDF read instead of resetting it to 0
                pass
    if method == "pdfminer":
        logging.info("PTT_METHOD: {0}".format(method))
        text = ""
        for j in range(0, len(pdf_names) + 1):
            if (len(pdf_names) == 0 and pdf_url.endswith('pdf')):
                r = requests.get(pdf_url)
                f = io.BytesIO(r.content)
                read_pdf = PdfFileReader(f)
                number_of_pages = read_pdf.getNumPages()
            elif j < (len(pdf_names)):
                pdf_files = pdf_names[j]
                text = ""
                text = convert_pdf_to_txt(pdf_files)
                number_of_pages = 0
            else:
                number_of_pages = 0
    if method == "none":
        logging.info("PDF_NOT_PERFORMED")
    if number_of_pages > 0:
        for i in range(number_of_pages):
            page = read_pdf.getPage(i)
            page_content = page.extractText()
            text += page_content
    processed_txt = clean_text(text)
    text = ''.join([i for i in processed_txt if not i.isdigit()])
    text = ' '.join(text.split())
    logging.info("PTT_STOP")
    text_dict = {"text": text, "no_of_pages": number_of_pages}
    return text_dict
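
A minimal standalone version of the PyPDF2 path used above, with the legacy PyPDF2 1.x API (PdfFileReader, getNumPages, extractText) that this module relies on; the URL is hypothetical:

import io
import requests
from PyPDF2 import PdfFileReader

def pdf_pages_to_text(pdf_url):
    # download a PDF and concatenate the text of every page (legacy PyPDF2 1.x API)
    r = requests.get(pdf_url)
    read_pdf = PdfFileReader(io.BytesIO(r.content))
    text = ""
    for i in range(read_pdf.getNumPages()):
        text += read_pdf.getPage(i).extractText()
    return text

# hypothetical usage:
# print(pdf_pages_to_text("https://example.com/sample.pdf"))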
    def run(self, write_to_kafkaTopic):
        path_to_contentKeywords = self.inputs["path_to_contentKeywords"].read()
        pathTocredentials = self.inputs["pathTocredentials"].read_loc()
        timestamp_folder = os.path.split(path_to_contentKeywords)[0]
        timestr = os.path.split(timestamp_folder)[1]
        epoch_time = time.mktime(time.strptime(timestr, "%Y%m%d-%H%M%S"))
        content_to_textpath = os.path.join(timestamp_folder, "content_to_text")
        cid_name = [
            i for i in os.listdir(content_to_textpath)
            if i not in ['.DS_Store']
        ]
        for cid in cid_name:
            merge_json_list = []
            json_file = findFiles(os.path.join(content_to_textpath, cid),
                                  ["json"])
            for file in json_file:
                if os.path.split(file)[1] in [
                        "ML_keyword_info.json", "ML_content_info.json"
                ]:
                    merge_json_list.append(file)
            ignore_list = ["ets"]
            dict_list = []
            for file in merge_json_list:
                with open(file, "r", encoding="UTF-8") as info:
                    new_json = json.load(info)
                    [
                        new_json.pop(ignore) for ignore in ignore_list
                        if ignore in new_json.keys()
                    ]
                dict_list.append(new_json)
            # merge the nested jsons:-
            autotagging_json = reduce(merge_json, dict_list)
            autotagging_json.update({"ets": epoch_time})
            with open(
                    os.path.join(timestamp_folder, "content_to_text", cid,
                                 "autoTagging_json.json"), "w+") as main_json:
                json.dump(autotagging_json,
                          main_json,
                          sort_keys=True,
                          indent=4)
            # writing to kafka topic:-
            kafka_cli = KafkaCLI(pathTocredentials)
            status = kafka_cli.write(autotagging_json, write_to_kafkaTopic)
            if status:
                logging.info(
                    "******Transaction event successfully pushed to topic:{0}".
                    format(write_to_kafkaTopic))

            else:
                logging.info("******Error pushing the event")
        # Remove the timestamp folder:-
        shutil.rmtree(timestamp_folder)
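
merge_json and KafkaCLI are project wrappers not shown here; the reduce step above folds a list of nested dicts into one. A rough standalone illustration with a simple recursive merge (the merge rule, "dicts merged key-wise, later values win", is an assumption, not necessarily the repo's):

from functools import reduce

def merge_dicts(a, b):
    # recursively merge b into a copy of a; later values win on conflicts (assumed rule)
    merged = dict(a)
    for key, value in b.items():
        if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
            merged[key] = merge_dicts(merged[key], value)
        else:
            merged[key] = value
    return merged

dict_list = [{"content_info": {"text": "..."}},
             {"keywords": ["mitosis"], "content_info": {"language": "en"}}]
autotagging_json = reduce(merge_dicts, dict_list)
print(autotagging_json)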
Example #8
def ecar_unzip(download_location, copy_location):  #ekstep_ecar_unzip
    """
    This function unzips an ecar file(ekstep file format)
    and parses all the subfolder.
    All the files are copied into one of ``'assets','data','items'`` folder
    (same name as in downloaded folder is maintained)
    based on its location in the downloaded folder.
    :param download_location(str): A location in the disk where ekstep ecar resource file is downloaded
    :param copy_location(str): A disk location where the ecar is unwrapped
    """
    assert isinstance(download_location, str)
    assert isinstance(copy_location, str)
    if not os.path.exists(copy_location):
        os.makedirs(copy_location)
    #To make the new sub-directories in which the files will be eventually stored
    location = [
        os.path.join(copy_location, folder)
        for folder in ['assets', 'data', 'items']
    ]
    for loc in location:
        if not os.path.exists(loc):
            os.makedirs(loc)
    ecar_extensions = [
        'png', 'gif', 'jpg', 'mp4', 'webm', 'pdf', 'mp3', 'ecml'
    ]
    files_found = findFiles(download_location, ecar_extensions)
    if files_found:
        for file in files_found:
            if file[-4:] in "ecml":
                shutil.copy(file, copy_location)
            else:
                shutil.copy(file, os.path.join(copy_location, "assets"))
    else:
        print("No files to copy!")
    # Delete the messy download directory
    if os.path.exists(download_location):
        shutil.rmtree(download_location)
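
A hedged usage sketch; the paths below are hypothetical, and the call assumes this module's findFiles helper is importable alongside the function:

# hypothetical paths: files from the unwrapped ecar are sorted into
# assets/data/items under copy_location and the download folder is deleted
ecar_unzip("/tmp/ecar_downloads/do_12345", "/tmp/content/do_12345")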
Example #9
def image_to_text(method, path_to_assets):
    logging.info("ITT_START")
    image_text = ""
    image_names = findFiles(path_to_assets, ['png', 'gif', 'jpg'])
    if method == "googleVision" and len(image_names) > 0:
        try:
            logging.info('...detected {0} image files'.format(
                str(len(image_names))))
            logging.info('...image file processing started')
            for file in image_names:
                try:
                    image_text += getImgTags(file)
                except BaseException:
                    print('........ Error: could not process file')
            print("Text: ", image_text)
            text = list(str(image_text.lower()).split("\n"))
            image_text = ' '.join(list(set(text)))
        except BaseException:
            image_text = ""
    if method == "none":
        logging.info("ITT_NOT_PERFORMED")
    logging.info("ITT_STOP")
    text_dict = {"text": image_text}
    return text_dict
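
getImgTags is a project helper; if it wraps Google Cloud Vision text detection, a standalone equivalent looks roughly like the sketch below (the file path is hypothetical, credentials must be configured, and on google-cloud-vision 1.x the Image type lives under vision.types instead of vision):

import io
from google.cloud import vision

def image_text_google_vision(path_to_image):
    # run Google Cloud Vision text detection on one image and return the raw text
    client = vision.ImageAnnotatorClient()
    with io.open(path_to_image, "rb") as image_file:
        content = image_file.read()
    response = client.text_detection(image=vision.Image(content=content))
    annotations = response.text_annotations
    return annotations[0].description if annotations else ""

# hypothetical usage:
# print(image_text_google_vision("/tmp/assets/page_1.png"))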
Example #10
def multimodal_text_enrichment(index, timestr, content_meta, content_type,
                               content_to_text_path,
                               GOOGLE_APPLICATION_CREDENTIALS):
    """
    A custom function to extract text from a given
    Content id in a Content meta dataframe extracted using Content V2 api
    Parameters
    ----------
    :param index(int): row id for the Content
    :param content_meta(dataframe): A dataframe of Content metadata.
     Mandatory fields for content_meta are ``['artifactUrl', 'content_type','downloadUrl',
     'gradeLevel', 'identifier','keywords',
     'language', 'subject']``
    :param content_type(str): Can be ``youtube, pdf, ecml, unknown``
    :param content_to_text_path(str): path to save the extracted text
    :returns: Path where text is saved
    """
    type_of_url = content_meta.iloc[index]["derived_contentType"]
    id_name = content_meta["identifier"][index]
    downloadField = content_type[type_of_url]["contentDownloadField"]
    url = content_meta[downloadField][index]
    logging.info("MTT_START_FOR_INDEX {0}".format(index))
    logging.info("MTT_START_FOR_CID {0}".format(id_name))
    logging.info("MTT_START_FOR_URL {0}".format(url))
    # start text extraction pipeline:
    try:
        start = time.time()
        path_to_id = download_content(type_of_url, url, content_to_text_path,
                                      id_name)
        print("path_to_id", path_to_id)
        path_to_assets = os.path.join(path_to_id, "assets")
        if type_of_url != "pdf":
            path_to_audio = video_to_speech(
                content_type[type_of_url]["video_to_speech"], path_to_assets)
            print(path_to_audio)
        if len(findFiles(path_to_assets, ["mp3"])) > 0:
            audio = AudioSegment.from_mp3(
                findFiles(path_to_assets, ["mp3"])[0])
            duration = round(len(audio) / 1000)
        else:
            duration = 0
        textExtraction_pipeline = [
            (speech_to_text, (content_type[type_of_url]["speech_to_text"],
                              path_to_assets, GOOGLE_APPLICATION_CREDENTIALS)),
            (image_to_text, (content_type[type_of_url]["image_to_text"],
                             path_to_assets)),
            (pdf_to_text, (content_type[type_of_url]["pdf_to_text"],
                           path_to_assets, url)),
            (ecml_index_to_text,
             (content_type[type_of_url]["ecml_index_to_text"], path_to_id))
        ]
        path_to_transcript = os.path.join(path_to_id, "enriched_text.txt")
        text = ""
        for method, param_tuple in textExtraction_pipeline:
            text += method(*param_tuple)["text"]
        # Adding description and title to the text only for PDF content
        if type_of_url == "pdf":
            text += content_meta["name"].iloc[index] + " " + content_meta[
                "description"].iloc[index]
        if os.path.exists(path_to_id) and text:
            with open(path_to_transcript, "w") as myTextFile:
                myTextFile.write(text)
        # num_of_PDFpages = pdf_to_text("none", path_to_assets, url)["no_of_pages"]
        # Reading pdata
        airflow_home = os.getenv('AIRFLOW_HOME',
                                 os.path.expanduser('~/airflow'))
        dag_location = os.path.join(airflow_home, 'dags')
        print("AIRFLOW_HOME: ", dag_location)
        filename = os.path.join(dag_location, 'graph_location')
        f = open(filename, "r")
        lines = f.read().splitlines()
        pdata = lines[-1]
        f.close()

        # estimating ets:
        epoch_time = time.mktime(time.strptime(timestr, "%Y%m%d-%H%M%S"))
        domain = content_meta["subject"][index]
        object_type = content_meta["objectType"][index]
        template = ""
        plugin_used = []
        num_of_stages = 0
        # only for type ecml
        if type_of_url == "ecml":
            plugin_used = ecml_index_to_text("parse",
                                             path_to_id)["plugin_used"]
            num_of_stages = ecml_index_to_text("parse",
                                               path_to_id)["num_stage"]

        mnt_output_dict_new = {
            "ets": int(epoch_time),  #Event generation time in epoch
            "nodeUniqueId": id_name,  #content id
            "operationType": "UPDATE",  #default to UPDATE
            "nodeType": "DATA_NODE",  #default to DATA_NODE
            "graphId": domain,  #default to domain
            "objectType":
            object_type,  #object type - content, worksheet, textbook, collection etc
            "nodeGraphId": 0,  #default to 0
            "transactionData": {
                "properties": {
                    "tags": {
                        "system_contentType":
                        type_of_url,  #can be "youtube", "ecml", "pdf"
                        "system_medium": language_detection(
                            text
                        ),  #generated using google language detection api
                        "duration": {
                            "video": "",  #video duration in seconds
                            "stage": ""  #can be derived from usage data
                        },
                        "num_stage":
                        num_of_stages,  #pdf: number of pages, ecml:number of stages, video:1
                        "system_plugins":
                        plugin_used,  #id's of plugin used in Content
                        "system_templates":
                        template,  #id's of templates used in Content
                        "text": text,
                    },
                    "version": pdata,  #yaml version
                    "uri": ""  #git commit id
                }
            }
        }
        with open(os.path.join(path_to_id, "ML_content_info.json"),
                  "w") as info:
            json.dump(mnt_output_dict_new, info, indent=4)
        stop = time.time()
        time_consumed = stop - start
        time_consumed_minutes = time_consumed / 60.0
        print("time taken in sec for text enrichment for cid -----> {0} : {1}".
              format(id_name, time_consumed))
        print(
            "time taken in minutes for text enrichment for cid -----> {0} : {1}"
            .format(id_name, time_consumed_minutes))
        logging.info(
            "MTT_TRANSCRIPT_PATH_CREATED: {0}".format(path_to_transcript))
        logging.info("MTT_CONTENT_ID_READ: {0}".format(id_name))
        logging.info("MTT_STOP_FOR_URL {0}".format(url))
        return os.path.join(path_to_id, "ML_content_info.json")
    except BaseException:
        logging.info("TextEnrichment failed for url:{0} with id:{1}".format(
            url, id_name))
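
A hedged sketch of how this function is typically driven, row by row, over a content-metadata dataframe. The dataframe values, the content_type mapping and all paths here are illustrative assumptions, and it will not run without the repo's helpers (download_content, pdf_to_text, ecml_index_to_text, etc.):

import pandas as pd

# illustrative metadata; real rows come from the Content V2 api
content_meta = pd.DataFrame([{
    "identifier": "do_12345", "derived_contentType": "pdf",
    "artifactUrl": "https://example.com/sample.pdf",
    "name": "Plants", "description": "Chapter on plants",
    "subject": "Science", "objectType": "Content"}])
# illustrative per-type configuration keyed by derived_contentType
content_type = {"pdf": {"contentDownloadField": "artifactUrl",
                        "video_to_speech": "none", "speech_to_text": "none",
                        "image_to_text": "none", "pdf_to_text": "PyPDF2",
                        "ecml_index_to_text": "none"}}

path_to_info = multimodal_text_enrichment(
    0, "20190101-000000", content_meta, content_type,
    "/tmp/content_to_text", "/path/to/google_credentials.json")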