def post_process_instruction(df_instruction, df_request, df_post, survey_item_prefix):
    """
    Appends the INSTRUCTION rows of one item to the post-processed dataframe.

    An instruction whose text repeats the text of the most recently appended
    row of df_post is skipped (de-duplication of consecutive identical
    segments).

    Args:
        param1 df_instruction (pandas dataframe): INSTRUCTION rows of the item
        being post-processed.
        param2 df_request (pandas dataframe): REQUEST rows of the same item,
        used to harmonize item_name when a 'QItem' question element exists.
        param3 df_post (pandas dataframe): dataframe accumulating the
        post-processed questionnaire rows.
        param4 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        df_post (pandas dataframe) with the instruction rows appended.
    """
    for i, row in df_instruction.iterrows():
        # Skip a segment that duplicates the text appended last.
        # (The original code duplicated the whole loop body in both branches
        # of this check; the guard-continue keeps a single copy.)
        if not df_post.empty and df_post.iloc[-1]['text'] == row['text']:
            continue
        # When the requests contain a 'QItem' element, adopt its adjusted
        # item_name so instructions and requests share the same name.
        if 'QItem' in df_request.QuestionElement.unique():
            rowqitem = df_request[df_request['QuestionElement'] == 'QItem']
            item_name = adjust_item_name(rowqitem['QuestionElementNr'].values[0],
                                         rowqitem['item_name'].values[0])
        else:
            item_name = row['item_name']
        # First row of df_post starts a fresh ID; later rows increment it.
        if df_post.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)
        data = {
            'survey_item_ID': survey_item_id,
            'Study': row['Study'],
            'module': row['module'],
            'item_type': row['item_type'],
            'item_name': item_name,
            'item_value': row['item_value'],
            'text': row['text']
        }
        df_post = df_post.append(data, ignore_index=True)
    return df_post
def preprocess_instruction_segment(row, df_questionnaire, survey_item_prefix, splitter):
    """
    Extracts and processes the instruction segments from the input file.

    The row text is first expanded (abbreviations and fills replaced), then
    split into sentences; each sentence becomes one INSTRUCTION row of the
    questionnaire dataframe.

    Args:
        param1 row (pandas dataframe object): dataframe row being currently analyzed.
        param2 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 splitter (NLTK object): NLTK object for sentence segmentation
        instantiated in accordance to the language.

    Returns:
        updated df_questionnaire with new valid instruction segments.
    """
    expanded_text = replace_abbreviations_and_fills(row['text'])
    for segment in splitter.tokenize(expanded_text):
        # A fresh ID for the very first row, an incremented one afterwards.
        survey_item_id = (ut.get_survey_item_id(survey_item_prefix)
                          if df_questionnaire.empty
                          else ut.update_survey_item_id(survey_item_prefix))
        df_questionnaire = df_questionnaire.append(
            {
                "survey_item_ID": survey_item_id,
                'Study': survey_item_prefix[:-1],
                'module': retrieve_module_from_item_name(row['name']),
                'item_type': 'INSTRUCTION',
                'item_name': row['name'],
                'item_value': None,
                'text': segment
            },
            ignore_index=True)
    return df_questionnaire
def questionnaire_post_processing(df_with_intro, survey_item_prefix):
    """
    Post-processes a questionnaire dataframe, regrouping the rows per item
    name and harmonizing item names and survey_item_IDs.

    For every unique item_name the rows are split by item_type and appended
    in a fixed order: INSTRUCTION, INTRODUCTION, then REQUEST/RESPONSE.

    Args:
        param1 df_with_intro (pandas dataframe): questionnaire data including
        introduction segments.
        param2 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        df_post (pandas dataframe) with the post-processed questionnaire.
    """
    ut.reset_initial_sufix()
    df_post = pd.DataFrame(columns=[
        'survey_item_ID', 'Study', 'module', 'item_type', 'item_name',
        'item_value', 'text'
    ])
    unique_item_names = df_with_intro.item_name.unique()
    for unique_item_name in unique_item_names:
        df_by_item_name = df_with_intro[df_with_intro['item_name'] ==
                                        unique_item_name]
        df_instruction = df_by_item_name[df_by_item_name['item_type'] ==
                                         'INSTRUCTION']
        df_introduction = df_by_item_name[df_by_item_name['item_type'] ==
                                          'INTRODUCTION']
        df_request = df_by_item_name[df_by_item_name['item_type'] == 'REQUEST']
        df_response = df_by_item_name[df_by_item_name['item_type'] ==
                                      'RESPONSE']
        # drop() instead of `del df_response['QuestionElement']`: df_response
        # is a boolean-indexed slice, and deleting a column in place raises
        # pandas' SettingWithCopyWarning. drop() returns a clean copy.
        df_response = df_response.drop(columns=['QuestionElement'])
        if not df_instruction.empty:
            df_post = post_process_instruction(df_instruction, df_request,
                                               df_post, survey_item_prefix)
        if not df_introduction.empty:
            for i, row in df_introduction.iterrows():
                # Harmonize the item_name with the 'QItem' request, if any.
                if 'QItem' in df_request.QuestionElement.unique():
                    rowqitem = df_request[df_request['QuestionElement'] ==
                                          'QItem']
                    item_name = adjust_item_name(
                        rowqitem['QuestionElementNr'].values[0],
                        rowqitem['item_name'].values[0])
                else:
                    item_name = row['item_name']
                if df_post.empty:
                    survey_item_id = ut.get_survey_item_id(survey_item_prefix)
                else:
                    survey_item_id = ut.update_survey_item_id(
                        survey_item_prefix)
                data = {
                    'survey_item_ID': survey_item_id,
                    'Study': row['Study'],
                    'module': row['module'],
                    'item_type': row['item_type'],
                    'item_name': item_name,
                    'item_value': row['item_value'],
                    'text': row['text']
                }
                df_post = df_post.append(data, ignore_index=True)
        df_post = post_process_request_response(df_request, df_response,
                                                df_post, survey_item_prefix)
    return df_post
def process_request(df, row, survey_item_prefix, item_name, module, splitter):
    """
    Processes request segments.

    The request text is cleaned and split into sentences; every sentence
    (except the '...' placeholder) is appended to df as an INSTRUCTION or a
    REQUEST row, depending on a language-aware instruction check.

    Args:
        param1 df (pandas dataframe): dataframe to store processed questionnaire data.
        param2 row (pandas dataframe row): row of dataframe with contents of
        the input file being analyzed in outer loop.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 module (string): module metadata parameter, retrieved in previous steps.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.

    Returns:
        a pandas dataframe with preprocessed request segments.
    """
    raw_request = row['Request for answer text']
    study, country_language = get_country_language_and_study_info(
        survey_item_prefix)
    # Nothing to do for non-string cells or the lone '.' placeholder.
    if not isinstance(raw_request, str) or raw_request == '.':
        return df
    for sentence in splitter.tokenize(clean_text(raw_request)):
        if sentence == '...':
            continue
        survey_item_id = (ut.get_survey_item_id(survey_item_prefix)
                          if df.empty
                          else ut.update_survey_item_id(survey_item_prefix))
        item_type = ('INSTRUCTION' if check_if_segment_is_instruction(
            sentence, country_language) else 'REQUEST')
        df = df.append(
            {
                'survey_item_ID': survey_item_id,
                'Study': study,
                'module': module,
                'item_type': item_type,
                'item_name': item_name,
                'item_value': None,
                'text': sentence
            },
            ignore_index=True)
    return df
def extract_wis_data(df, df_questionnaire, study):
    """
    Extracts and preprocesses WIS data from df, attributing MCSQ metadata
    (and also harmonizing metadata e.g. item names, item types, when necessary).

    NOTE(review): this function was reconstructed from whitespace-mangled
    source; the exact nesting of a few statements inside the matrix-question
    branch is a best-effort reconstruction — confirm against version control.

    Args:
        param1 df (pandas dataframe): the input data in a dataframe representation.
        param2 df_questionnaire (pandas dataframe): a dataframe to hold the processed questionnaire data.
        param3 study (string): the name of the study, embedded in the WIS export filename.

    Returns:
        the df_questionnaire (pandas dataframe) with the preprocessed data.
    """
    # One survey_item_ID prefix per language column present in the WIS export.
    survey_item_prefix_source = instantiate_survey_item_prefix(study, 'source')
    survey_item_prefix_eng = instantiate_survey_item_prefix(study, 'en_GB')
    survey_item_prefix_fre = instantiate_survey_item_prefix(study, 'fr_FR')
    survey_item_prefix_por = instantiate_survey_item_prefix(study, 'pt_PT')
    survey_item_prefix_rus = instantiate_survey_item_prefix(study, 'ru_RU')
    survey_item_prefix_ger = instantiate_survey_item_prefix(study, 'de_DE')
    survey_item_prefix_spa = instantiate_survey_item_prefix(study, 'es_ES')
    survey_item_prefix_nor = instantiate_survey_item_prefix(study, 'no_NO')
    survey_item_prefix_cze = instantiate_survey_item_prefix(study, 'cs_CZ')
    # NEXT_ITEM_TYPE lets each row see the type of the row that follows it,
    # which is how matrix questions are told apart from their option rows.
    df['NEXT_ITEM_TYPE'] = df['ITEM_TYPE'].shift(-1)
    # flag == 1 while inside a run of matrix questions; count numbers them.
    flag = 0
    for i, row in df.iterrows():
        # Skip rows without a textual PAGE and the two ALERT pseudo-pages.
        if isinstance(
                row['PAGE'],
                str) and row['PAGE'] != 'ALERT_1' and row['PAGE'] != 'ALERT_2':
            item_name = simplify_item_name(
                row['UNIQUE IDENTIFIER (PER SURVEY)'])
            item_value = row['VALUES of VAR']
            wis_item_type = row['ITEM_TYPE']
            # Map the WIS-specific item type onto the harmonized MCSQ type.
            item_type = harmonize_item_type(wis_item_type)
            # Ignore untranslated placeholder rows.
            if str(
                    row['source']
            ) != 'TRANSLATION IN TASK_API' and 'NO TRANSLATION' not in str(
                    row['en_GB']):
                if isinstance(row['source'], str) or isinstance(
                        row['source'], int):
                    if wis_item_type == 'matrix question' and row[
                            'NEXT_ITEM_TYPE'] != 'matrix option':
                        # Number consecutive matrix questions _0, _1, ...
                        if flag == 0:
                            count = 0
                            flag = 1
                        else:
                            count = count + 1
                        # Rename the last row appended to df_questionnaire —
                        # presumably the matrix introduction appended on a
                        # previous iteration — TODO confirm placement/intent.
                        df_questionnaire.iloc[
                            -1, df_questionnaire.columns.get_loc(
                                'item_name')] = item_name + '_' + str(count)
                        # Rename the previous item in df itself to the
                        # '<name>_introD' convention.
                        df['item_name'] = df[
                            'UNIQUE IDENTIFIER (PER SURVEY)'].replace(
                                to_replace=row['LAST_ITEM_NAME'],
                                value=item_name + '_introD')
                        # All rows sharing this identifier; the 'matrix
                        # option' ones are the response categories.
                        df_by_item_name = df[
                            df['UNIQUE IDENTIFIER (PER SURVEY)'] ==
                            row['UNIQUE IDENTIFIER (PER SURVEY)']]
                        df_by_item_name_responses = df_by_item_name[
                            df_by_item_name['ITEM_TYPE'] == 'matrix option']
                        if df_questionnaire.empty:
                            survey_item_id_source = ut.get_survey_item_id(
                                survey_item_prefix_source)
                        else:
                            survey_item_id_source = ut.update_survey_item_id(
                                survey_item_prefix_source)
                        # One aligned row carrying the text of every language.
                        data = {
                            'Study': study,
                            'module': row['PAGE'],
                            'item_type': item_type,
                            'wis_item_type': wis_item_type,
                            'item_name': item_name + '_' + str(count),
                            'item_value': item_value,
                            'ENG_SOURCE_survey_item_ID': survey_item_id_source,
                            'ENG_SOURCE_text': row['source'],
                            'ENG_GB_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_eng),
                            'ENG_GB_text': row['en_GB'],
                            'FRE_FR_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_fre),
                            'FRE_FR_text': row['fr_FR'],
                            'POR_PT_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_por),
                            'POR_PT_text': row['pt_PT'],
                            'RUS_RU_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_rus),
                            'RUS_RU_text': row['ru_RU'],
                            'GER_DE_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_ger),
                            'GER_DE_text': row['de_DE'],
                            'SPA_ES_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_spa),
                            'SPA_ES_text': row['es_ES'],
                            'NOR_NO_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_nor),
                            'NOR_NO_text': row['no_NO'],
                            'CZE_CZ_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_cze),
                            'CZE_CZ_text': row['cs_CZ']
                        }
                        df_questionnaire = df_questionnaire.append(
                            data, ignore_index=True)
                        # Append each matrix option (response category) right
                        # after its matrix question.
                        for i, row in df_by_item_name_responses.iterrows():
                            survey_item_id_source = ut.update_survey_item_id(
                                survey_item_prefix_source)
                            item_name = simplify_item_name(
                                row['UNIQUE IDENTIFIER (PER SURVEY)'])
                            item_value = row['VALUES of VAR']
                            wis_item_type = row['ITEM_TYPE']
                            item_type = harmonize_item_type(wis_item_type)
                            data = {
                                'Study': study,
                                'module': row['PAGE'],
                                'item_type': item_type,
                                'wis_item_type': wis_item_type,
                                'item_name': item_name + '_' + str(count),
                                'item_value': item_value,
                                'ENG_SOURCE_survey_item_ID':
                                survey_item_id_source,
                                'ENG_SOURCE_text': row['source'],
                                'ENG_GB_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_eng),
                                'ENG_GB_text': row['en_GB'],
                                'FRE_FR_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_fre),
                                'FRE_FR_text': row['fr_FR'],
                                'POR_PT_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_por),
                                'POR_PT_text': row['pt_PT'],
                                'RUS_RU_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_rus),
                                'RUS_RU_text': row['ru_RU'],
                                'GER_DE_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_ger),
                                'GER_DE_text': row['de_DE'],
                                'SPA_ES_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_spa),
                                'SPA_ES_text': row['es_ES'],
                                'NOR_NO_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_nor),
                                'NOR_NO_text': row['no_NO'],
                                'CZE_CZ_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_cze),
                                'CZE_CZ_text': row['cs_CZ']
                            }
                            df_questionnaire = df_questionnaire.append(
                                data, ignore_index=True)
                    else:
                        # Plain (non-matrix) item: close any matrix run and
                        # append a single aligned row, keeping the item name.
                        flag = 0
                        if df_questionnaire.empty:
                            survey_item_id_source = ut.get_survey_item_id(
                                survey_item_prefix_source)
                        else:
                            survey_item_id_source = ut.update_survey_item_id(
                                survey_item_prefix_source)
                        data = {
                            'Study': study,
                            'module': row['PAGE'],
                            'item_type': item_type,
                            'wis_item_type': wis_item_type,
                            'item_name': item_name,
                            'item_value': item_value,
                            'ENG_SOURCE_survey_item_ID': survey_item_id_source,
                            'ENG_SOURCE_text': row['source'],
                            'ENG_GB_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_eng),
                            'ENG_GB_text': row['en_GB'],
                            'FRE_FR_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_fre),
                            'FRE_FR_text': row['fr_FR'],
                            'POR_PT_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_por),
                            'POR_PT_text': row['pt_PT'],
                            'RUS_RU_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_rus),
                            'RUS_RU_text': row['ru_RU'],
                            'GER_DE_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_ger),
                            'GER_DE_text': row['de_DE'],
                            'SPA_ES_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_spa),
                            'SPA_ES_text': row['es_ES'],
                            'NOR_NO_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_nor),
                            'NOR_NO_text': row['no_NO'],
                            'CZE_CZ_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_cze),
                            'CZE_CZ_text': row['cs_CZ']
                        }
                        df_questionnaire = df_questionnaire.append(
                            data, ignore_index=True)
    return df_questionnaire
def post_process_request_response(df_request, df_response, df_post, survey_item_prefix):
    """
    Appends the REQUEST rows of one item (and their RESPONSE rows) to the
    post-processed dataframe.

    A request whose text duplicates the most recently appended row of df_post
    is skipped. Responses are appended directly after a 'QItem' request; if no
    request attached them, they are appended once at the end under the last
    item_name, unless responses were already the last thing appended.

    Args:
        param1 df_request (pandas dataframe): REQUEST rows of the item.
        param2 df_response (pandas dataframe): RESPONSE rows of the item.
        param3 df_post (pandas dataframe): dataframe accumulating the
        post-processed questionnaire rows.
        param4 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        df_post (pandas dataframe) with the request/response rows appended.
    """
    for i, row in df_request.iterrows():
        # Skip a request that duplicates the text appended last.
        # (The original code duplicated the whole loop body in both branches
        # of the df_post.empty check; the guard-continue keeps one copy.)
        if not df_post.empty and df_post.iloc[-1]['text'] == row['text']:
            continue
        # Harmonize the item_name with the 'QItem' element, if any.
        if 'QItem' in df_request.QuestionElement.unique():
            item_name = adjust_item_name(row['QuestionElementNr'],
                                         row['item_name'])
        else:
            item_name = row['item_name']
        if df_post.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)
        data = {
            'survey_item_ID': survey_item_id,
            'Study': row['Study'],
            'module': row['module'],
            'item_type': row['item_type'],
            'item_name': item_name,
            'item_value': row['item_value'],
            'text': row['text']
        }
        df_post = df_post.append(data, ignore_index=True)
        # A 'QItem' request is immediately followed by its response scale.
        # (Loop variable renamed to j: the original shadowed the outer i.)
        if not df_response.empty and row['QuestionElement'] == 'QItem':
            for j, response_row in df_response.iterrows():
                data = {
                    'survey_item_ID':
                    ut.update_survey_item_id(survey_item_prefix),
                    'Study': response_row['Study'],
                    'module': response_row['module'],
                    'item_type': response_row['item_type'],
                    'item_name': item_name,
                    'item_value': response_row['item_value'],
                    'text': response_row['text']
                }
                df_post = df_post.append(data, ignore_index=True)
    # Catch-all: responses exist but were not attached by any 'QItem' request;
    # append them once under the item_name of the last appended segment.
    if not df_response.empty:
        last_row = df_post.iloc[-1]
        if last_row['item_type'] != 'RESPONSE':
            for j, response_row in df_response.iterrows():
                data = {
                    'survey_item_ID':
                    ut.update_survey_item_id(survey_item_prefix),
                    'Study': response_row['Study'],
                    'module': response_row['module'],
                    'item_type': response_row['item_type'],
                    'item_name': last_row['item_name'],
                    'item_value': response_row['item_value'],
                    'text': response_row['text']
                }
                df_post = df_post.append(data, ignore_index=True)
    return df_post
def main(filename):
    """
    Parses an ESS XML questionnaire export, preprocesses questions,
    instructions and answers, and writes the result to a tab-separated
    '<survey_item_prefix>.csv' file.

    Args:
        param1 filename (string): path of the XML file to parse; the country/
        language code embedded in the name selects special answer categories.
    """
    extract_source = 0
    # Parse the input XML file by filename.
    file = str(filename)
    tree = ET.parse(file)
    root = tree.getroot()
    # Dictionary of parent-child relations of the parsed tree (ElementTree
    # nodes have no parent pointers). tree.getiterator() was removed in
    # Python 3.9; tree.iter() is the drop-in replacement.
    parent_map = dict((c, p) for p in tree.iter() for c in p)
    ess_questions_instructions = root.findall('.//questionnaire/questions')
    ess_answers = root.findall('.//questionnaire/answers')
    ess_showcards = root.findall('.//questionnaire/showcards')  # NOTE(review): currently unused
    df_questionnaire, survey_item_prefix, study, country_language, splitter = set_initial_structures(
        filename, extract_source)
    ess_special_answer_categories = instantiate_special_answer_category_object(
        country_language)
    # Country-specific wording of the refuse / don't-know categories.
    if 'GER_AT' in filename:
        ess_special_answer_categories.refuse[0] = 'Verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiß nicht'
    if 'GER_CH' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiss nicht'
    if 'GER_DE' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'
    if 'NOR_NO' in filename:
        ess_special_answer_categories.refuse[0] = 'Nekter'
    df_question_instruction = pd.DataFrame(
        columns=['answer_id', 'item_name', 'item_type', 'text'])
    df_answers = pd.DataFrame(
        columns=['answer_id', 'item_name', 'text', 'item_value'])
    item_value = None
    df_question_instruction = process_question_instruction_node(
        ess_questions_instructions, df_question_instruction, parent_map,
        splitter, country_language, extract_source)
    df_answers = process_answer_node(ess_answers, df_answers, parent_map,
                                     ess_special_answer_categories,
                                     extract_source)
    unique_item_names_question_instruction = df_question_instruction.item_name.unique(
    )
    for unique_item_name in unique_item_names_question_instruction:
        # Case-insensitive grouping of questions/instructions and answers.
        df_question_instruction_by_item_name = df_question_instruction[
            df_question_instruction['item_name'].str.lower() ==
            unique_item_name.lower()]
        df_answers_by_item_name = df_answers[
            df_answers['item_name'].str.lower() == unique_item_name.lower()]
        module = retrieve_item_module(unique_item_name, study)
        last_item_name = ''
        for i, row in df_question_instruction_by_item_name.iterrows():
            item_name = row['item_name']
            # Generic 'Instruction'/'Intro' rows inherit the preceding name.
            if item_name == 'Instruction' or item_name == 'Intro':
                item_name = last_item_name
            # Skip interviewer-only artifacts (rows, outro, admin notes, ...).
            if 'Row ' not in item_name and item_name != 'CI' and item_name != 'outro' and 'istration Note' not in item_name and item_name != 'box':
                if df_questionnaire.empty:
                    survey_item_id = ut.get_survey_item_id(survey_item_prefix)
                else:
                    survey_item_id = ut.update_survey_item_id(
                        survey_item_prefix)
                if check_if_segment_is_instruction(row['text'],
                                                   country_language):
                    item_type = 'INSTRUCTION'
                else:
                    item_type = row['item_type']
                data = {
                    'survey_item_ID': survey_item_id,
                    'Study': study,
                    'module': module,
                    'item_type': item_type,
                    'item_name': item_name,
                    'item_value': None,
                    'text': row['text']
                }
                df_questionnaire = df_questionnaire.append(data,
                                                           ignore_index=True)
                last_item_name = item_name
        # Answers are appended once per unique item, after its question rows,
        # reusing the item_name set in the question loop above.
        if not df_answers_by_item_name.empty:
            for j, row in df_answers_by_item_name.iterrows():
                if row['item_value'] == 'None':
                    item_value = None
                else:
                    item_value = row['item_value']
                data = {
                    'survey_item_ID':
                    ut.update_survey_item_id(survey_item_prefix),
                    'Study': study,
                    'module': module,
                    'item_type': row['item_type'],
                    'item_name': item_name,
                    'item_value': item_value,
                    'text': row['text']
                }
                df_questionnaire = df_questionnaire.append(data,
                                                           ignore_index=True)
    df_questionnaire.to_csv(survey_item_prefix[:-1] + '.csv',
                            encoding='utf-8-sig',
                            sep='\t',
                            index=False)
def process_question_segment(raw_item, survey_item_prefix, study, item_name,
                             df_questionnaire, splitter, country_language):
    """
    Extracts and processes the question segments from a raw item.
    The question segments are always between the {QUESTION} and {ANSWERS} tags, for instance:
    G2
    {QUESTION}
    Per a ell és important ser ric.
    Vol tenir molts diners i coses cares.
    {ANSWERS}
    Se sembla molt a mi
    Se sembla a mi
    Se sembla una mica a mi
    Se sembla poc a mi
    No se sembla a mi
    No se sembla gens a mi

    Args:
        param1 raw_item (list): raw survey item, retrieved in previous steps.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): metadata parameter about study embedded in the file name.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.
        param7 country_language (string): country_language metadata, embedded in file name.

    Returns:
        updated df_questionnaire when new valid question segments are included,
        or df_questionnaire in the same state it was when no new valid question
        segments were included.
    """
    # Fix: removed leftover debug `print(raw_item)` that polluted stdout.
    index_question_tag = raw_item.index('{QUESTION}')
    index_answer_tag = raw_item.index('{ANSWERS}')
    question_segment = raw_item[index_question_tag + 1:index_answer_tag]
    for item in question_segment:
        item = clean_text(item)
        item = expand_interviewer_abbreviations(item, country_language)
        if item != '':
            sentences = splitter.tokenize(item)
            for sentence in sentences:
                if df_questionnaire.empty:
                    survey_item_id = ut.get_survey_item_id(survey_item_prefix)
                else:
                    survey_item_id = ut.update_survey_item_id(
                        survey_item_prefix)
                # Some "question" sentences are really interviewer
                # instructions; a language-aware check reclassifies them.
                if check_if_segment_is_instruction(sentence, country_language):
                    item_type = 'INSTRUCTION'
                else:
                    item_type = 'REQUEST'
                data = {
                    "survey_item_ID": survey_item_id,
                    'Study': study,
                    'module': retrieve_item_module(item_name, study),
                    'item_type': item_type,
                    'item_name': item_name,
                    'item_value': None,
                    'text': sentence
                }
                df_questionnaire = df_questionnaire.append(data,
                                                           ignore_index=True)
    return df_questionnaire
def process_intro_segment(raw_item, survey_item_prefix, study, item_name,
                          df_questionnaire, splitter):
    """
    Extracts and processes the introduction segments from a raw item.

    Introduction text lies between the '{INTRO}' tag and the '{QUESTION}' tag
    of the raw item. Every non-empty part in that span is cleaned, split into
    sentences, and appended as an INTRODUCTION row.

    Args:
        param1 raw_item (list): raw survey item, retrieved in previous steps.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): metadata parameter about study embedded in the file name.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.

    Returns:
        updated df_questionnaire when new valid introduction segments are
        included, otherwise df_questionnaire unchanged.
    """
    start = raw_item.index('{INTRO}') + 1
    end = raw_item.index('{QUESTION}')
    for part in raw_item[start:end]:
        cleaned = clean_text(part)
        if cleaned == '':
            continue
        for sentence in splitter.tokenize(cleaned):
            survey_item_id = (ut.get_survey_item_id(survey_item_prefix)
                              if df_questionnaire.empty
                              else ut.update_survey_item_id(survey_item_prefix))
            df_questionnaire = df_questionnaire.append(
                {
                    "survey_item_ID": survey_item_id,
                    'Study': study,
                    'module': retrieve_item_module(item_name, study),
                    'item_type': 'INTRODUCTION',
                    'item_name': item_name,
                    'item_value': None,
                    'text': sentence
                },
                ignore_index=True)
    return df_questionnaire