# Standard imports for these tests; `df_feature_check` and
# `test_case_data_location` are assumed to be provided by the project's
# test setup (they are referenced but not defined in this excerpt).
import pandas as pd
import yaml


def test_content_reuse_scoring_data():
    cols = [
        'STB_Id', 'STB_Grade', 'STB_Section', 'STB_Text',
        'Ref_id', 'Ref_Grade', 'Ref_Section', 'Ref_Text'
    ]
    case1 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "content_reuse_preparation_feature_check.csv")
    assert df_feature_check(case1, cols)

def test_content_reuse_evaluation_data():
    cols = [
        'state_topic_id', 'reference_topic_id', 'pred_label_percentage',
        'TP_count', 'FP_count', 'TN_count', 'FN_count', 'actual_label'
    ]
    case1 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "content_reuse_evaluation_feature_check.csv")
    assert df_feature_check(case1, cols)

def test_df_feature_check():
    case1 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "Content_Meta_feature_checking_df_1.csv")
    case2 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "Content_Meta_feature_checking_df_2.csv")
    case3 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "Content_Meta_feature_checking_df_3.csv")
    mandatory_field_location = (test_case_data_location + "df_feature_check/" +
                                "ContentTagging_mandatory_fields.yaml")
    with open(mandatory_field_location, 'r') as stream:
        # yaml.load without an explicit Loader is deprecated and unsafe;
        # safe_load is sufficient for a plain fields list like this one.
        data = yaml.safe_load(stream)
    mandatory_field_ls = list(data['mandatory_fields'])
    assert df_feature_check(case1, mandatory_field_ls)
    assert not df_feature_check(case2, mandatory_field_ls)
    assert not df_feature_check(case3, mandatory_field_ls)

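
# For orientation, a minimal sketch of the behaviour these tests assume from
# df_feature_check (illustrative only; the real helper is imported from the
# project): return True when every mandatory column is present in the
# DataFrame, False otherwise.
def _df_feature_check_sketch(df, mandatory_fields):
    """Return True iff every column in mandatory_fields exists in df."""
    return set(mandatory_fields).issubset(set(df.columns))

# e.g. _df_feature_check_sketch(pd.DataFrame(columns=["a", "b"]), ["a"])
# evaluates to True, while ["a", "c"] would give False.
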
# Module-level imports assumed by this operator, per the calls below:
# configparser, logging, os, shutil, time, numpy as np, pandas as pd,
# plus the project helpers df_feature_check, identify_contentType and
# multimodal_text_enrichment.
def run(self, range_start, range_end, num_of_processes, content_type):
    """
    This is the main method to override when creating an operator. It takes
    in the parameters, runs the text enrichment pipeline, and writes back
    the path to the timestamp folder, where each content id and its
    enriched text are saved to an h5 file as an intermediate result.
    """
    DS_DATA_HOME = self.inputs["DS_DATA_HOME"].read_loc()
    pathTocredentials = self.inputs["pathTocredentials"].read_loc()
    timestr = time.strftime("%Y%m%d-%H%M%S")
    path_to_timestamp_folder = os.path.join(DS_DATA_HOME, timestr)
    content_to_text_path = os.path.join(
        path_to_timestamp_folder, "content_to_text")
    # content dump:
    if not os.path.exists(content_to_text_path):
        os.makedirs(content_to_text_path)
    print("content_to_text: ", content_to_text_path)
    contentmeta_path = self.inputs["localpathTocontentMeta"].read_loc()
    # Move the content meta to the timestamp (destination) folder.
    # For the time being this experiments with a move; change it later.
    shutil.move(
        contentmeta_path,
        os.path.join(path_to_timestamp_folder,
                     os.path.split(contentmeta_path)[1]))
    moved_contentmeta_path = os.path.join(
        path_to_timestamp_folder, os.path.split(contentmeta_path)[1])
    content_meta = pd.read_csv(moved_contentmeta_path)
    if "derived_contentType" not in list(content_meta.columns):
        content_meta["derived_contentType"] = np.nan
        for row_ind, artifact_url in enumerate(content_meta["artifactUrl"]):
            try:
                # .at avoids pandas chained assignment, which may silently
                # write to a copy instead of the frame itself.
                content_meta.at[row_ind, "derived_contentType"] = \
                    identify_contentType(artifact_url)
            except Exception:
                pass
    content_meta = content_meta[
        pd.notnull(content_meta["derived_contentType"])]
    content_meta.reset_index(inplace=True, drop=True)
    print(self.outputs["timestamp_folder"].location_specify())
    oldwd = os.getcwd()
    contentMeta_mandatory_fields = [
        'artifactUrl', 'derived_contentType', 'downloadUrl', 'gradeLevel',
        'identifier', 'language', 'subject', 'graph_id', 'nodeType',
        'objectType', 'node_id'
    ]
    assert df_feature_check(content_meta, contentMeta_mandatory_fields)
    logging.info("CTT_CONTENT_TO_TEXT_START")
    # Read content meta: drop a stray unnamed index column if present.
    if content_meta.columns[0] == "0":
        content_meta = content_meta.drop("0", axis=1)
    # Check for duplicates in the meta:
    if content_meta.duplicated(["artifactUrl"]).any():
        content_meta.drop_duplicates(subset="artifactUrl", inplace=True)
        content_meta.reset_index(drop=True, inplace=True)
    # Drop NA from the artifactUrl feature and reset the index:
    content_meta.dropna(subset=["artifactUrl"], inplace=True)
    content_meta.reset_index(drop=True, inplace=True)
    # Time the run:
    start = time.time()
    logging.info('Contents detected in the content meta: ' +
                 str(len(content_meta)))
    logging.info(
        "----Running Content_to_Text for contents from {0} to {1}:".format(
            range_start, range_end))
    logging.info("time started: {0}".format(start))
    # subset contentMeta:
    # content_meta = content_meta[content_meta["derived_contentType"].isin(
    #     subset_contentMeta_by.split(", "))]
    content_meta.reset_index(drop=True, inplace=True)
    # "START"/"END" are sentinels meaning the full range of the meta.
    if range_start == "START":
        range_start = 0
    if range_end == "END":
        range_end = len(content_meta)
    logging.info(
        "CTT_Config: content_meta from {0} to {1} created in: {2}".format(
            range_start, range_end, content_to_text_path))
    print("Number of processes: ", num_of_processes)
    # Resolve Google credentials: try the config file first, then fall
    # back to the GOOGLE_APPLICATION_CREDENTIALS environment variable.
    status = False
    if os.path.exists(pathTocredentials):
        try:
            config = configparser.ConfigParser(allow_no_value=True)
            config.read(pathTocredentials)
            status = True
            try:
                path_to_googlecred = config[
                    'google application credentials'][
                    'GOOGLE_APPLICATION_CREDENTIALS']
                with open(path_to_googlecred, "r") as cred_json:
                    GOOGLE_APPLICATION_CREDENTIALS = cred_json.read()
            except Exception:
                logging.info(
                    "Invalid GOOGLE_APPLICATION_CREDENTIALS in config.")
                logging.info(
                    "***Checking for GOOGLE_APPLICATION_CREDENTIALS "
                    "environment variable")
                status = False
        except Exception:
            logging.info("Invalid config file")
            logging.info(
                "***Checking for GOOGLE_APPLICATION_CREDENTIALS "
                "environment variable")
    if not status:
        try:
            # The environment variable holds a path to the credential
            # file; read the file contents.
            GOOGLE_APPLICATION_CREDENTIALS = os.environ[
                'GOOGLE_APPLICATION_CREDENTIALS']
            with open(GOOGLE_APPLICATION_CREDENTIALS, "r") as f:
                GOOGLE_APPLICATION_CREDENTIALS = f.read()
        except Exception:
            GOOGLE_APPLICATION_CREDENTIALS = ""
            logging.info("Not a valid google credential")
    result = [
        multimodal_text_enrichment(i, timestr, content_meta, content_type,
                                   content_to_text_path,
                                   GOOGLE_APPLICATION_CREDENTIALS)
        for i in range(range_start, range_end)
    ]
    print(result)
    os.chdir(oldwd)
    print("Current directory c2t: ", os.getcwd())
    print("timestamp_folder path:", path_to_timestamp_folder)
    self.outputs["timestamp_folder"].write(path_to_timestamp_folder)
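
# For clarity, the credential lookup above (config file first, environment
# variable as fallback) can be summarised as a standalone sketch. This is
# illustrative only; the helper name is hypothetical and not part of the
# operator.
import configparser
import logging
import os


def _resolve_google_credentials(path_to_credentials):
    """Return the Google credential file contents, or "" if unavailable."""
    if os.path.exists(path_to_credentials):
        try:
            config = configparser.ConfigParser(allow_no_value=True)
            config.read(path_to_credentials)
            cred_path = config['google application credentials'][
                'GOOGLE_APPLICATION_CREDENTIALS']
            with open(cred_path, "r") as cred_json:
                return cred_json.read()
        except Exception:
            logging.info("Invalid config file; falling back to env variable.")
    try:
        with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS'], "r") as f:
            return f.read()
    except Exception:
        logging.info("Not a valid google credential")
        return ""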