def post_process_instruction(df_instruction, df_request, df_post,
                             survey_item_prefix):
    """
    Post-processes INSTRUCTION segments, appending them to df_post.

    A segment is appended unless df_post already ends with a row carrying the
    same text (consecutive duplicates are dropped). When the request segments
    of the item contain a 'QItem' element, the item name is harmonized through
    adjust_item_name(); otherwise the instruction row's own item name is kept.

    The original implementation repeated the whole append body in both arms of
    the `df_post.empty` check; the bodies were identical, so they are merged
    behind a single duplicate-skip guard.

    Args:
        param1 df_instruction (pandas dataframe): instruction segments of the current item.
        param2 df_request (pandas dataframe): request segments of the current item.
        param3 df_post (pandas dataframe): dataframe holding the post-processed questionnaire data.
        param4 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        updated df_post (pandas dataframe) with the instruction segments appended.
    """
    for i, row in df_instruction.iterrows():
        # Skip the segment only when it repeats the text of the last appended row.
        if not df_post.empty and df_post.iloc[-1]['text'] == row['text']:
            continue

        # Harmonize the item name via the request's QItem element when present.
        if 'QItem' in df_request.QuestionElement.unique():
            rowqitem = df_request[df_request['QuestionElement'] == 'QItem']
            item_name = adjust_item_name(
                rowqitem['QuestionElementNr'].values[0],
                rowqitem['item_name'].values[0])
        else:
            item_name = row['item_name']

        # First row of an empty df_post gets a fresh ID; later rows increment it.
        if df_post.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)

        data = {
            'survey_item_ID': survey_item_id,
            'Study': row['Study'],
            'module': row['module'],
            'item_type': row['item_type'],
            'item_name': item_name,
            'item_value': row['item_value'],
            'text': row['text']
        }
        df_post = df_post.append(data, ignore_index=True)
    return df_post
def preprocess_instruction_segment(row, df_questionnaire, survey_item_prefix,
                                   splitter):
    """
    Extracts and processes the instruction segments from the input file.

    Args:
        param1 row (pandas dataframe object): dataframe row being currently analyzed.
        param2 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 splitter (NLTK object): NLTK object for sentence segmentation instantiated
        in accordance to the language.

    Returns:
        updated df_questionnaire with new valid instruction segments.
    """
    cleaned_text = replace_abbreviations_and_fills(row['text'])
    for segment in splitter.tokenize(cleaned_text):
        # An empty questionnaire gets a fresh ID; subsequent rows increment it.
        if df_questionnaire.empty:
            segment_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            segment_id = ut.update_survey_item_id(survey_item_prefix)
        df_questionnaire = df_questionnaire.append(
            {
                "survey_item_ID": segment_id,
                'Study': survey_item_prefix[:-1],
                'module': retrieve_module_from_item_name(row['name']),
                'item_type': 'INSTRUCTION',
                'item_name': row['name'],
                'item_value': None,
                'text': segment
            },
            ignore_index=True)
    return df_questionnaire
def questionnaire_post_processing(df_with_intro, survey_item_prefix):
    """
    Rebuilds the questionnaire dataframe item by item, regenerating the
    survey_item_IDs and delegating INSTRUCTION and REQUEST/RESPONSE handling
    to the dedicated post-processing helpers.

    Args:
        param1 df_with_intro (pandas dataframe): questionnaire data including introduction segments.
        param2 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        df_post (pandas dataframe) with the post-processed questionnaire data.
    """
    # Restart the ID suffix counter before regenerating all survey_item_IDs.
    ut.reset_initial_sufix()
    df_post = pd.DataFrame(columns=[
        'survey_item_ID', 'Study', 'module', 'item_type', 'item_name',
        'item_value', 'text'
    ])
    unique_item_names = df_with_intro.item_name.unique()
    for unique_item_name in unique_item_names:
        # Slice the current item's rows by segment type.
        df_by_item_name = df_with_intro[df_with_intro['item_name'] ==
                                        unique_item_name]
        df_instruction = df_by_item_name[df_by_item_name['item_type'] ==
                                         'INSTRUCTION']
        df_introduction = df_by_item_name[df_by_item_name['item_type'] ==
                                          'INTRODUCTION']
        df_request = df_by_item_name[df_by_item_name['item_type'] == 'REQUEST']
        df_response = df_by_item_name[df_by_item_name['item_type'] ==
                                      'RESPONSE']
        # QuestionElement is internal metadata; response rows do not keep it.
        del df_response['QuestionElement']
        if df_instruction.empty == False:
            df_post = post_process_instruction(df_instruction, df_request,
                                               df_post, survey_item_prefix)
        if df_introduction.empty == False:
            for i, row in df_introduction.iterrows():
                # Harmonize the item name via the request's QItem element when present.
                if 'QItem' in df_request.QuestionElement.unique():
                    rowqitem = df_request[df_request['QuestionElement'] ==
                                          'QItem']
                    item_name = adjust_item_name(
                        rowqitem['QuestionElementNr'].values[0],
                        rowqitem['item_name'].values[0])
                else:
                    item_name = row['item_name']
                # First row of an empty df_post gets a fresh ID; later rows increment it.
                if df_post.empty:
                    survey_item_id = ut.get_survey_item_id(survey_item_prefix)
                else:
                    survey_item_id = ut.update_survey_item_id(
                        survey_item_prefix)
                data = {
                    'survey_item_ID': survey_item_id,
                    'Study': row['Study'],
                    'module': row['module'],
                    'item_type': row['item_type'],
                    'item_name': item_name,
                    'item_value': row['item_value'],
                    'text': row['text']
                }
                df_post = df_post.append(data, ignore_index=True)
        df_post = post_process_request_response(df_request, df_response,
                                                df_post, survey_item_prefix)
    return df_post
def main(folder_path, has_supplementary):
    """
    Main method of the ESS plain text to spreadsheet data transformation algorithm.
    The data is extracted from the plain text file (that obeys an internal specification
    for the MCSQ project), preprocessed and receives appropriate metadata attribution.
    The algorithm outputs the csv representation of the df_questionnaire, used to store
    questionnaire data (pandas dataframe)

    Args:
        param1 folder_path: path to the folder where the plain text files are.
        param2 has_supplementary: boolean variable that indicates if there is a
        supplementary spreadsheet to be appended.
    """
    # os.chdir() returns None; the original bound that None and passed it to
    # os.listdir(). Change into the folder, then list the current directory.
    os.chdir(folder_path)
    files = os.listdir('.')
    for file in files:
        if file.endswith(".txt"):
            with open(file, 'r') as f:
                df_questionnaire, survey_item_prefix, study, country_language, splitter = set_initial_structures(
                    file)
                raw_items = retrieve_raw_items_from_file(f)
                for raw_item in raw_items:
                    item_name = remove_spaces_from_item_name(raw_item[0])
                    # Intro segments are optional; question/answer segments
                    # are processed for every item.
                    if '{INTRO}' in raw_item:
                        df_questionnaire = process_intro_segment(
                            raw_item, survey_item_prefix, study, item_name,
                            df_questionnaire, splitter)
                    df_questionnaire = process_question_segment(
                        raw_item, survey_item_prefix, study, item_name,
                        df_questionnaire, splitter, country_language)
                    df_questionnaire = process_answer_segment(
                        raw_item, survey_item_prefix, study, item_name,
                        df_questionnaire, country_language)
            # The 'with' block closes the file; no explicit f.close() needed.
            csv_name = file.replace('.txt', '')
            if has_supplementary == 1:
                # Re-number the supplementary rows so their IDs continue the
                # sequence of the questionnaire just built.
                supplementary = pd.read_csv('SUPP_' + csv_name + '.csv',
                                            dtype=str)
                for i in supplementary.index:
                    supplementary.at[
                        i, 'survey_item_ID'] = ut.update_survey_item_id(
                            survey_item_prefix)
                df_all = df_questionnaire.append(supplementary,
                                                 ignore_index=True)
                df_all.to_csv(str(csv_name) + '.csv',
                              sep='\t',
                              encoding='utf-8-sig',
                              index=False)
            else:
                df_questionnaire.to_csv(str(csv_name) + '.csv',
                                        sep='\t',
                                        encoding='utf-8-sig',
                                        index=False)
def include_special_answer_category(survey_item_prefix, study, item_name,
                                    module, country_language,
                                    df_questionnaire):
    """
    Include special answer categories in the questionnaire, that are not present in SQP file.

    Args:
        param1 survey_item_prefix (string): prefix of survey_item_ID.
        param2 study (string): study metadata parameter, retrieved in previous steps.
        param3 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param4 module (string): module metadata parameter, retrieved in previous steps.
        param5 country_language (string): module metadata parameter, embedded in input file name.
        param6 df_questionnaire (pandas dataframe): dataframe to store processed questionnaire data.

    Returns:
        updated df_questionnaire (pandas dataframe), including special answer categories
        (defined according to the language)
    """
    special_categories = instantiate_special_answer_category_object(
        country_language)
    # Append "don't know" first, then "refuse" — each category is a
    # (text, value) pair defined per language.
    for category in (special_categories.dont_know, special_categories.refuse):
        df_questionnaire = df_questionnaire.append(
            {
                "survey_item_ID": ut.update_survey_item_id(survey_item_prefix),
                'Study': study,
                'module': module,
                'item_type': 'RESPONSE',
                'item_name': item_name,
                'item_value': category[1],
                'text': category[0]
            },
            ignore_index=True)
    return df_questionnaire
def process_request(df, row, survey_item_prefix, item_name, module, splitter):
    """
    Processes request segments.

    Args:
        param1 df (pandas dataframe): dataframe to store processed questionnaire data.
        param2 row (pandas dataframe row): row of dataframe with contents of the input
        file being analyzed in outer loop.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 module (string): module metadata parameter, retrieved in previous steps.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.

    Returns:
        a pandas dataframe with preprocessed request segments.
    """
    request = row['Request for answer text']
    study, country_language = get_country_language_and_study_info(
        survey_item_prefix)
    # Only string requests that are not the '.' placeholder carry real text.
    if not isinstance(request, str) or request == '.':
        return df
    request = clean_text(request)
    for segment in splitter.tokenize(request):
        # Ellipsis-only segments are discarded.
        if segment == '...':
            continue
        survey_item_id = (ut.get_survey_item_id(survey_item_prefix)
                          if df.empty else
                          ut.update_survey_item_id(survey_item_prefix))
        # Some "request" sentences are actually interviewer instructions.
        item_type = ('INSTRUCTION' if check_if_segment_is_instruction(
            segment, country_language) else 'REQUEST')
        df = df.append(
            {
                'survey_item_ID': survey_item_id,
                'Study': study,
                'module': module,
                'item_type': item_type,
                'item_name': item_name,
                'item_value': None,
                'text': segment
            },
            ignore_index=True)
    return df
def process_answer(df, row, survey_item_prefix, item_name, module):
    """
    Processes answer segments.

    Args:
        param1 df (pandas dataframe): dataframe to store processed questionnaire data.
        param2 row (pandas dataframe row): row of dataframe with contents of the input
        file being analyzed in outer loop.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 module (string): module metadata parameter, retrieved in previous steps.

    Returns:
        a pandas dataframe with preprocessed answer segments.
    """
    study, country_language = get_country_language_and_study_info(
        survey_item_prefix)
    answer = row['Answer options text']
    # '.' is the placeholder for "no answer text"; non-string cells are skipped.
    if answer != '.' and isinstance(answer, str):
        # answer = remove_undesired_symbols(answer)
        answer = clean_text(answer)
        # Scale detection: each compiled pattern below is tested against the
        # ASCII-transliterated answer; the first match wins and delegates to
        # the dedicated scale handler, returning immediately.
        """ Regex matches 0-10 scales with words in item 5 """
        if zero_to_ten_with_value_in_five_pattern.match(
                unidecode.unidecode(answer)):
            return process_zero_to_ten_with_middle_text_scale(
                df, answer, survey_item_prefix, study, module, item_name,
                country_language)
        """ Regex matches 0-10 scales """
        if zero_to_ten_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '10', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        """ Regex matches 1-10 scales """
        if one_to_ten_pattern.match(unidecode.unidecode(answer)):
            return process_one_to_x_scale(df, '10', answer,
                                          survey_item_prefix, study, module,
                                          item_name, country_language)
        """ Regex matches 0-9 scales """
        if zero_to_nine_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '9', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        """ Regex matches 0-5 scales """
        if zero_to_five_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '5', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        """ Regex matches 1-5 scales """
        if one_to_five_pattern.match(unidecode.unidecode(answer)):
            return process_one_to_x_scale(df, '5', answer,
                                          survey_item_prefix, study, module,
                                          item_name, country_language)
        """ Regex matches 0-6 scales """
        if zero_to_six_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '6', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        """ Regex matches 0-7 scales """
        if zero_to_seven_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '7', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        """ Regex matches 1-7 scales """
        if one_to_seven_pattern.match(unidecode.unidecode(answer)):
            return process_one_to_x_scale(df, '7', answer,
                                          survey_item_prefix, study, module,
                                          item_name, country_language)
        """ Regex matches 0-4 scales """
        if zero_to_four_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '4', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        """ Regex matches 1-4 scales """
        if one_to_four_pattern.match(unidecode.unidecode(answer)):
            return process_one_to_x_scale(df, '4', answer,
                                          survey_item_prefix, study, module,
                                          item_name, country_language)
        """ Regex matches 0-3 scales """
        if zero_to_three_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '3', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        """ Regex matches 0-2 scales """
        if zero_to_two_pattern.match(unidecode.unidecode(answer)):
            return process_zero_to_x_scale(df, '2', answer,
                                           survey_item_prefix, study, module,
                                           item_name, country_language)
        else:
            # No numeric-scale pattern matched: handle short or digit-free
            # answers, income (currency) questions, or fall back to the
            # recursive answer splitters.
            if len(answer.split(' ')) <= 3 or string_has_numbers(
                    answer) == False:
                # Long answers without digits may still be verbal scales.
                if string_has_numbers(answer) == False and len(
                        answer.split(' ')) > 3:
                    is_scale, scale_items = process_answer_without_numbers(
                        answer)
                    if is_scale == False:
                        # A single free-text answer category.
                        data = {
                            'survey_item_ID':
                            ut.update_survey_item_id(survey_item_prefix),
                            'Study': study,
                            'module': module,
                            'item_type': 'RESPONSE',
                            'item_name': item_name,
                            'item_value': None,
                            'text': answer
                        }
                        df = df.append(data, ignore_index=True)
                        df = include_special_answer_category(
                            survey_item_prefix, study, item_name, module,
                            country_language, df)
                        return df
                    else:
                        # Verbal scale: one row per item, valued by position.
                        for i, scale_item in enumerate(scale_items):
                            data = {
                                'survey_item_ID':
                                ut.update_survey_item_id(survey_item_prefix),
                                'Study': study,
                                'module': module,
                                'item_type': 'RESPONSE',
                                'item_name': item_name,
                                'item_value': i,
                                'text': scale_item
                            }
                            df = df.append(data, ignore_index=True)
                        df = include_special_answer_category(
                            survey_item_prefix, study, item_name, module,
                            country_language, df)
                        return df
            elif currency in answer:
                # Income question: split into bands keyed by category letter;
                # the 'pre' key holds text preceding the bands.
                # NOTE(review): `currency` is presumably a module-level string
                # constant — confirm against the file header.
                d = dict()
                d = recursive_split_income_question(answer, study,
                                                    country_language)
                if d:
                    for k, v in list(d.items()):
                        if k == 'pre':
                            data = {
                                'survey_item_ID':
                                ut.update_survey_item_id(survey_item_prefix),
                                'Study': study,
                                'module': module,
                                'item_type': 'RESPONSE',
                                'item_name': item_name,
                                'item_value': None,
                                'text': v
                            }
                            df = df.append(data, ignore_index=True)
                        else:
                            data = {
                                'survey_item_ID':
                                ut.update_survey_item_id(survey_item_prefix),
                                'Study': study,
                                'module': module,
                                'item_type': 'RESPONSE',
                                'item_name': item_name,
                                'item_value': k,
                                'text': v
                            }
                            df = df.append(data, ignore_index=True)
                    df = include_special_answer_category(
                        survey_item_prefix, study, item_name, module,
                        country_language, df)
                    return df
            else:
                d = dict()
                answer = eliminate_dots(answer)
                # recursive_split flags, in order:
                # flag_zero, flag_begins_with_zero, flag_parentheses, flag_begins_with_number
                # Each branch distinguishes whether the marker appears at the
                # start of the answer or only somewhere inside it.
                if re.compile('(\s+)?(00\s+)', re.IGNORECASE).findall(answer):
                    if re.compile('(^00\s+)', re.IGNORECASE).findall(answer):
                        d = recursive_split(answer, True, True, False, True)
                    else:
                        d = recursive_split(answer, True, True, False, False)
                elif re.compile('(\s+)?(-1\s+)', re.IGNORECASE).findall(answer):
                    # Scales with negative values get their own splitter.
                    d = recursive_split_plus_minus_scale(answer)
                elif re.compile('(\s+)?(01\s+)', re.IGNORECASE).findall(answer):
                    if re.compile('(^01\s+)', re.IGNORECASE).findall(answer):
                        d = recursive_split(answer, True, False, False, True)
                    else:
                        d = recursive_split(answer, True, False, False, False)
                elif re.compile('(\s+)?(0\s+)', re.IGNORECASE).findall(answer):
                    if re.compile('(^0\s+)', re.IGNORECASE).findall(answer):
                        d = recursive_split(answer, False, True, False, True)
                    else:
                        d = recursive_split(answer, False, True, False, False)
                elif re.compile('(\s+)?(1\)\s+)', re.IGNORECASE).findall(answer):
                    if re.compile('(^1\)\s+)', re.IGNORECASE).findall(answer):
                        d = recursive_split(answer, False, False, True, True)
                    else:
                        d = recursive_split(answer, False, False, True, False)
                elif re.compile('(\s+)?(1\s+)', re.IGNORECASE).findall(answer):
                    if re.compile('(^1\s+)', re.IGNORECASE).findall(answer):
                        d = recursive_split(answer, False, False, False, True)
                    else:
                        d = recursive_split(answer, False, False, False, False)
                else:
                    # Known French 0-10 free-entry phrasing is stored verbatim;
                    # anything else is logged as unmatched.
                    if "(entre '0' et '10')" in answer:
                        data = {
                            'survey_item_ID':
                            ut.update_survey_item_id(survey_item_prefix),
                            'Study': study,
                            'module': module,
                            'item_type': 'RESPONSE',
                            'item_name': item_name,
                            'item_value': None,
                            'text': answer
                        }
                        df = df.append(data, ignore_index=True)
                        df = include_special_answer_category(
                            survey_item_prefix, study, item_name, module,
                            country_language, df)
                    else:
                        print('NO MATCHES', answer)
                if d:
                    # One RESPONSE row per (value, text) pair produced by the splitter.
                    for k, v in list(d.items()):
                        data = {
                            'survey_item_ID':
                            ut.update_survey_item_id(survey_item_prefix),
                            'Study': study,
                            'module': module,
                            'item_type': 'RESPONSE',
                            'item_name': item_name,
                            'item_value': k,
                            'text': v
                        }
                        df = df.append(data, ignore_index=True)
                    df = include_special_answer_category(
                        survey_item_prefix, study, item_name, module,
                        country_language, df)
    return df
def process_zero_to_x_scale(df, higher_side, answer, survey_item_prefix,
                            study, module, item_name, country_language):
    """
    Processes scales ranging from 0-x.

    Args:
        param1 df (pandas dataframe): dataframe to store processed questionnaire data.
        param2 higher_side (string): higher value of the scale.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 study (string): study metadata parameter, retrieved in previous steps.
        param5 module (string): module metadata parameter, retrieved in previous steps.
        param6 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param7 country_language (string): module metadata parameter, embedded in input file name.

    Returns:
        updated df_questionnaire (pandas dataframe), including new answer segments.
    """
    answer = eliminate_dots(answer)
    # Two layouts are handled: zero-padded scales ("00 <low> 01 ... 0x <high>")
    # and plain ones ("0 <low> 1 ... x <high>"). Only the labels of the two
    # endpoints are extracted.
    if re.compile('(00)', re.IGNORECASE).findall(answer):
        # Zero-padded layout: text before '01' belongs to the lower endpoint.
        first_part = answer.split('01')
        first_part_clean = re.sub("^00 ", "", first_part[0])
        if higher_side == '10':
            final_part = first_part[1].split('10')
        else:
            final_part = first_part[1].split('0' + higher_side)
        # NOTE(review): assumes each split produced at least two pieces — an
        # unexpected layout would raise IndexError here; presumably the
        # caller's regex match guarantees the layout. Confirm.
        final_part_clean = re.sub("^\s", "", final_part[1])
    else:
        # Plain layout: split on the high value first, then on '1'.
        final_part = answer.split(higher_side)
        final_part_clean = re.sub("^\s", "", final_part[1])
        first_part = final_part[0].split('1')
        first_part_clean = re.sub("^0 ", "", first_part[0])
    # Strip the trailing whitespace left by the splits.
    first_part_clean = re.sub("\s$", "", first_part_clean)
    """ first part of the scale """
    data = {
        'survey_item_ID': ut.update_survey_item_id(survey_item_prefix),
        'Study': study,
        'module': module,
        'item_type': 'RESPONSE',
        'item_name': item_name,
        'item_value': '0',
        'text': first_part_clean
    }
    df = df.append(data, ignore_index=True)
    """ final part of the scale """
    data = {
        'survey_item_ID': ut.update_survey_item_id(survey_item_prefix),
        'Study': study,
        'module': module,
        'item_type': 'RESPONSE',
        'item_name': item_name,
        'item_value': higher_side,
        'text': final_part_clean
    }
    df = df.append(data, ignore_index=True)
    df = include_special_answer_category(survey_item_prefix, study, item_name,
                                         module, country_language, df)
    return df
def process_zero_to_ten_with_middle_text_scale(df, answer, survey_item_prefix,
                                               study, module, item_name,
                                               country_language):
    """
    Processes scales ranging from 0-10, with text in the middle value.

    Args:
        param1 df (pandas dataframe): dataframe to store processed questionnaire data.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): study metadata parameter, retrieved in previous steps.
        param4 module (string): module metadata parameter, retrieved in previous steps.
        param5 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param6 country_language (string): module metadata parameter, embedded in input file name.

    Returns:
        updated df_questionnaire (pandas dataframe), including new answer segments.
    """
    answer = eliminate_dots(answer)
    # Expects the zero-padded layout
    # "00 <low text> 01 ... 05 <middle text> 06 ... 10 <high text>".
    # NOTE(review): if the '00' marker is absent the *_clean variables are
    # never bound and the appends below raise NameError — presumably the
    # caller's regex match guarantees the layout; confirm in process_answer.
    if re.compile('(00)', re.IGNORECASE).findall(answer):
        first_part = answer.split('01')
        first_part_clean = re.sub("^00 ", "", first_part[0])
        mid_part = first_part[1].split('05')
        mid_part_part = mid_part[1].split('06')
        mid_part_clean = re.sub("^\s+", "", mid_part_part[0])
        mid_part_clean = re.sub("\s+$", "", mid_part_clean)
        final_part = mid_part_part[1]
        final_part = final_part.split('10')
        final_part_clean = re.sub("^\s+", "", final_part[1])
    """ first part of the scale """
    data = {
        'survey_item_ID': ut.update_survey_item_id(survey_item_prefix),
        'Study': study,
        'module': module,
        'item_type': 'RESPONSE',
        'item_name': item_name,
        'item_value': '0',
        'text': first_part_clean
    }
    df = df.append(data, ignore_index=True)
    """ middle """
    data = {
        'survey_item_ID': ut.update_survey_item_id(survey_item_prefix),
        'Study': study,
        'module': module,
        'item_type': 'RESPONSE',
        'item_name': item_name,
        'item_value': '5',
        'text': mid_part_clean
    }
    df = df.append(data, ignore_index=True)
    """ final part of the scale """
    # Bug fix: the value-'10' row previously reused mid_part_clean, duplicating
    # the middle label; final_part_clean was computed but never used.
    data = {
        'survey_item_ID': ut.update_survey_item_id(survey_item_prefix),
        'Study': study,
        'module': module,
        'item_type': 'RESPONSE',
        'item_name': item_name,
        'item_value': '10',
        'text': final_part_clean
    }
    df = df.append(data, ignore_index=True)
    df = include_special_answer_category(survey_item_prefix, study, item_name,
                                         module, country_language, df)
    return df
def extract_wis_data(df, df_questionnaire, study):
    """
    Extracts and preprocesses WIS data from df, attibuting MCSQ metadata (and also
    harmonizing metadata e.g. item names, item types, when necessary).

    Args:
        param1 df (pandas dataframe): the input data in a dataframe representation.
        param2 df_questionnaire (pandas dataframe): a dataframe to hold the processed
        questionnaire data.
        param3 study (string): the name of the study, embedded in the WIS export filename.

    Returns:
        the df_questionnaire (pandas dataframe) with the preprocessed data.
    """
    # One survey_item_ID prefix per language column of the WIS export.
    survey_item_prefix_source = instantiate_survey_item_prefix(study, 'source')
    survey_item_prefix_eng = instantiate_survey_item_prefix(study, 'en_GB')
    survey_item_prefix_fre = instantiate_survey_item_prefix(study, 'fr_FR')
    survey_item_prefix_por = instantiate_survey_item_prefix(study, 'pt_PT')
    survey_item_prefix_rus = instantiate_survey_item_prefix(study, 'ru_RU')
    survey_item_prefix_ger = instantiate_survey_item_prefix(study, 'de_DE')
    survey_item_prefix_spa = instantiate_survey_item_prefix(study, 'es_ES')
    survey_item_prefix_nor = instantiate_survey_item_prefix(study, 'no_NO')
    survey_item_prefix_cze = instantiate_survey_item_prefix(study, 'cs_CZ')
    # Look-ahead column: the next row's item type, used to detect the last
    # question of a matrix block.
    df['NEXT_ITEM_TYPE'] = df['ITEM_TYPE'].shift(-1)
    # flag == 1 while inside a run of matrix questions; count numbers them.
    flag = 0
    for i, row in df.iterrows():
        # Skip rows without a PAGE value and the two alert pages.
        if isinstance(
                row['PAGE'],
                str) and row['PAGE'] != 'ALERT_1' and row['PAGE'] != 'ALERT_2':
            item_name = simplify_item_name(
                row['UNIQUE IDENTIFIER (PER SURVEY)'])
            item_value = row['VALUES of VAR']
            wis_item_type = row['ITEM_TYPE']
            item_type = harmonize_item_type(wis_item_type)
            # Ignore rows that are untranslated placeholders.
            if str(
                    row['source']
            ) != 'TRANSLATION IN TASK_API' and 'NO TRANSLATION' not in str(
                    row['en_GB']):
                if isinstance(row['source'], str) or isinstance(
                        row['source'], int):
                    # A matrix question whose following row is NOT a matrix
                    # option closes the block: number it and pull its options.
                    if wis_item_type == 'matrix question' and row[
                            'NEXT_ITEM_TYPE'] != 'matrix option':
                        if flag == 0:
                            count = 0
                            flag = 1
                        else:
                            count = count + 1
                        # Rename the previously appended row so the matrix
                        # sub-questions share the numbered item name.
                        df_questionnaire.iloc[
                            -1, df_questionnaire.columns.get_loc(
                                'item_name')] = item_name + '_' + str(count)
                        # Mark the block's introduction row in the input frame.
                        df['item_name'] = df[
                            'UNIQUE IDENTIFIER (PER SURVEY)'].replace(
                                to_replace=row['LAST_ITEM_NAME'],
                                value=item_name + '_introD')
                        df_by_item_name = df[
                            df['UNIQUE IDENTIFIER (PER SURVEY)'] ==
                            row['UNIQUE IDENTIFIER (PER SURVEY)']]
                        df_by_item_name_responses = df_by_item_name[
                            df_by_item_name['ITEM_TYPE'] == 'matrix option']
                        # Only the source prefix distinguishes first/subsequent
                        # IDs; the other languages always use get_survey_item_id.
                        if df_questionnaire.empty:
                            survey_item_id_source = ut.get_survey_item_id(
                                survey_item_prefix_source)
                        else:
                            survey_item_id_source = ut.update_survey_item_id(
                                survey_item_prefix_source)
                        data = {
                            'Study': study,
                            'module': row['PAGE'],
                            'item_type': item_type,
                            'wis_item_type': wis_item_type,
                            'item_name': item_name + '_' + str(count),
                            'item_value': item_value,
                            'ENG_SOURCE_survey_item_ID': survey_item_id_source,
                            'ENG_SOURCE_text': row['source'],
                            'ENG_GB_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_eng),
                            'ENG_GB_text': row['en_GB'],
                            'FRE_FR_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_fre),
                            'FRE_FR_text': row['fr_FR'],
                            'POR_PT_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_por),
                            'POR_PT_text': row['pt_PT'],
                            'RUS_RU_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_rus),
                            'RUS_RU_text': row['ru_RU'],
                            'GER_DE_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_ger),
                            'GER_DE_text': row['de_DE'],
                            'SPA_ES_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_spa),
                            'SPA_ES_text': row['es_ES'],
                            'NOR_NO_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_nor),
                            'NOR_NO_text': row['no_NO'],
                            'CZE_CZ_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_cze),
                            'CZE_CZ_text': row['cs_CZ']
                        }
                        df_questionnaire = df_questionnaire.append(
                            data, ignore_index=True)
                        # NOTE(review): this inner loop shadows the outer
                        # i/row (and item_name etc.); the outer loop reassigns
                        # them on the next iteration, but the shadowing is
                        # fragile — worth renaming.
                        for i, row in df_by_item_name_responses.iterrows():
                            survey_item_id_source = ut.update_survey_item_id(
                                survey_item_prefix_source)
                            item_name = simplify_item_name(
                                row['UNIQUE IDENTIFIER (PER SURVEY)'])
                            item_value = row['VALUES of VAR']
                            wis_item_type = row['ITEM_TYPE']
                            item_type = harmonize_item_type(wis_item_type)
                            data = {
                                'Study': study,
                                'module': row['PAGE'],
                                'item_type': item_type,
                                'wis_item_type': wis_item_type,
                                'item_name': item_name + '_' + str(count),
                                'item_value': item_value,
                                'ENG_SOURCE_survey_item_ID':
                                survey_item_id_source,
                                'ENG_SOURCE_text': row['source'],
                                'ENG_GB_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_eng),
                                'ENG_GB_text': row['en_GB'],
                                'FRE_FR_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_fre),
                                'FRE_FR_text': row['fr_FR'],
                                'POR_PT_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_por),
                                'POR_PT_text': row['pt_PT'],
                                'RUS_RU_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_rus),
                                'RUS_RU_text': row['ru_RU'],
                                'GER_DE_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_ger),
                                'GER_DE_text': row['de_DE'],
                                'SPA_ES_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_spa),
                                'SPA_ES_text': row['es_ES'],
                                'NOR_NO_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_nor),
                                'NOR_NO_text': row['no_NO'],
                                'CZE_CZ_survey_item_ID':
                                ut.get_survey_item_id(survey_item_prefix_cze),
                                'CZE_CZ_text': row['cs_CZ']
                            }
                            df_questionnaire = df_questionnaire.append(
                                data, ignore_index=True)
                    else:
                        # Regular (non-matrix-closing) row: reset the matrix
                        # run and append the row as-is.
                        flag = 0
                        if df_questionnaire.empty:
                            survey_item_id_source = ut.get_survey_item_id(
                                survey_item_prefix_source)
                        else:
                            survey_item_id_source = ut.update_survey_item_id(
                                survey_item_prefix_source)
                        data = {
                            'Study': study,
                            'module': row['PAGE'],
                            'item_type': item_type,
                            'wis_item_type': wis_item_type,
                            'item_name': item_name,
                            'item_value': item_value,
                            'ENG_SOURCE_survey_item_ID': survey_item_id_source,
                            'ENG_SOURCE_text': row['source'],
                            'ENG_GB_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_eng),
                            'ENG_GB_text': row['en_GB'],
                            'FRE_FR_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_fre),
                            'FRE_FR_text': row['fr_FR'],
                            'POR_PT_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_por),
                            'POR_PT_text': row['pt_PT'],
                            'RUS_RU_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_rus),
                            'RUS_RU_text': row['ru_RU'],
                            'GER_DE_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_ger),
                            'GER_DE_text': row['de_DE'],
                            'SPA_ES_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_spa),
                            'SPA_ES_text': row['es_ES'],
                            'NOR_NO_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_nor),
                            'NOR_NO_text': row['no_NO'],
                            'CZE_CZ_survey_item_ID':
                            ut.get_survey_item_id(survey_item_prefix_cze),
                            'CZE_CZ_text': row['cs_CZ']
                        }
                        df_questionnaire = df_questionnaire.append(
                            data, ignore_index=True)
    return df_questionnaire
def post_process_request_response(df_request, df_response, df_post,
                                  survey_item_prefix):
    """
    Post-processes REQUEST segments and attaches the item's RESPONSE segments
    to df_post.

    A request row is skipped when df_post already ends with a row carrying the
    same text (drops consecutive duplicates). When the request defines QItem
    elements, the responses are appended right after each QItem row under the
    adjusted item name; otherwise a final pass appends the responses after the
    last appended segment, reusing its item name.

    The original implementation repeated the whole append body in both arms of
    the `df_post.empty` check; the bodies were identical, so they are merged
    behind a single duplicate-skip guard. The nested response loops also no
    longer shadow the outer loop variable `i`.

    Args:
        param1 df_request (pandas dataframe): request segments of the current item.
        param2 df_response (pandas dataframe): response segments of the current item.
        param3 df_post (pandas dataframe): dataframe holding the post-processed questionnaire data.
        param4 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        updated df_post (pandas dataframe).
    """
    for i, row in df_request.iterrows():
        # Skip requests that repeat the text of the last appended row.
        if not df_post.empty and df_post.iloc[-1]['text'] == row['text']:
            continue

        # Harmonize the item name through the QItem element when present.
        if 'QItem' in df_request.QuestionElement.unique():
            item_name = adjust_item_name(row['QuestionElementNr'],
                                         row['item_name'])
        else:
            item_name = row['item_name']

        # First row of an empty df_post gets a fresh ID; later rows increment it.
        if df_post.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)

        data = {
            'survey_item_ID': survey_item_id,
            'Study': row['Study'],
            'module': row['module'],
            'item_type': row['item_type'],
            'item_name': item_name,
            'item_value': row['item_value'],
            'text': row['text']
        }
        df_post = df_post.append(data, ignore_index=True)

        # QItem rows carry their responses directly after them.
        if df_response.empty == False and row['QuestionElement'] == 'QItem':
            for j, response_row in df_response.iterrows():
                data = {
                    'survey_item_ID':
                    ut.update_survey_item_id(survey_item_prefix),
                    'Study': response_row['Study'],
                    'module': response_row['module'],
                    'item_type': response_row['item_type'],
                    'item_name': item_name,
                    'item_value': response_row['item_value'],
                    'text': response_row['text']
                }
                df_post = df_post.append(data, ignore_index=True)

    # Fallback: if the responses were not attached to any QItem row, append
    # them after the last appended segment, reusing its item name.
    if df_response.empty == False:
        last_row = df_post.iloc[-1]
        if last_row['item_type'] != 'RESPONSE':
            for j, response_row in df_response.iterrows():
                data = {
                    'survey_item_ID':
                    ut.update_survey_item_id(survey_item_prefix),
                    'Study': response_row['Study'],
                    'module': response_row['module'],
                    'item_type': response_row['item_type'],
                    'item_name': last_row['item_name'],
                    'item_value': response_row['item_value'],
                    'text': response_row['text']
                }
                df_post = df_post.append(data, ignore_index=True)
    return df_post
def main(filename):
    """Parse an ESS questionnaire XML export and write the harmonized
    questionnaire to a tab-separated CSV named after the survey_item_ID prefix.

    Args:
        param1 filename (string): path of the input XML file. Country/language
            metadata is embedded in the name (e.g. 'GER_AT', 'NOR_NO').
    """
    extract_source = 0
    # Parse the input XML file.
    file = str(filename)
    tree = ET.parse(file)
    root = tree.getroot()
    # Dictionary of parent-child relations of the parsed tree: ElementTree
    # nodes carry no parent pointers, so the map is built up front.
    # tree.iter() replaces getiterator(), which was removed in Python 3.9.
    parent_map = dict((c, p) for p in tree.iter() for c in p)

    ess_questions_instructions = root.findall('.//questionnaire/questions')
    ess_answers = root.findall('.//questionnaire/answers')
    # NOTE(review): showcards are collected but never used below - confirm
    # whether they are meant to be processed.
    ess_showcards = root.findall('.//questionnaire/showcards')

    df_questionnaire, survey_item_prefix, study, country_language, splitter = set_initial_structures(
        filename, extract_source)

    # Country/language-specific wording overrides for the special answer
    # categories (refusal / don't know).
    ess_special_answer_categories = instantiate_special_answer_category_object(
        country_language)
    if 'GER_AT' in filename:
        ess_special_answer_categories.refuse[0] = 'Verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiß nicht'
    if 'GER_CH' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiss nicht'
    if 'GER_DE' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'
    if 'NOR_NO' in filename:
        ess_special_answer_categories.refuse[0] = 'Nekter'

    df_question_instruction = pd.DataFrame(
        columns=['answer_id', 'item_name', 'item_type', 'text'])
    df_answers = pd.DataFrame(
        columns=['answer_id', 'item_name', 'text', 'item_value'])

    df_question_instruction = process_question_instruction_node(
        ess_questions_instructions, df_question_instruction, parent_map,
        splitter, country_language, extract_source)
    df_answers = process_answer_node(ess_answers, df_answers, parent_map,
                                     ess_special_answer_categories,
                                     extract_source)

    unique_item_names_question_instruction = df_question_instruction.item_name.unique()
    for unique_item_name in unique_item_names_question_instruction:
        # Case-insensitive match between question/instruction and answer rows.
        df_question_instruction_by_item_name = df_question_instruction[
            df_question_instruction['item_name'].str.lower() ==
            unique_item_name.lower()]
        df_answers_by_item_name = df_answers[
            df_answers['item_name'].str.lower() == unique_item_name.lower()]
        module = retrieve_item_module(unique_item_name, study)
        last_item_name = ''
        for i, row in df_question_instruction_by_item_name.iterrows():
            item_name = row['item_name']
            # Instruction/Intro rows inherit the name of the preceding item.
            if item_name == 'Instruction' or item_name == 'Intro':
                item_name = last_item_name
            # Skip structural/administrative entries that are not survey items.
            if 'Row ' not in item_name and item_name != 'CI' and item_name != 'outro' and 'istration Note' not in item_name and item_name != 'box':
                if df_questionnaire.empty:
                    survey_item_id = ut.get_survey_item_id(survey_item_prefix)
                else:
                    survey_item_id = ut.update_survey_item_id(
                        survey_item_prefix)
                # Some request texts are really instructions; reclassify them.
                if check_if_segment_is_instruction(row['text'], country_language):
                    item_type = 'INSTRUCTION'
                else:
                    item_type = row['item_type']
                data = {
                    'survey_item_ID': survey_item_id,
                    'Study': study,
                    'module': module,
                    'item_type': item_type,
                    'item_name': item_name,
                    'item_value': None,
                    'text': row['text']
                }
                df_questionnaire = df_questionnaire.append(data,
                                                           ignore_index=True)
                last_item_name = item_name
        if df_answers_by_item_name.empty == False:
            # NOTE(review): item_name here is whatever the question loop left
            # behind - confirm answers are meant to reuse that name.
            for j, row in df_answers_by_item_name.iterrows():
                if row['item_value'] == 'None':
                    item_value = None
                else:
                    item_value = row['item_value']
                data = {
                    'survey_item_ID': ut.update_survey_item_id(survey_item_prefix),
                    'Study': study,
                    'module': module,
                    'item_type': row['item_type'],
                    'item_name': item_name,
                    'item_value': item_value,
                    'text': row['text']
                }
                df_questionnaire = df_questionnaire.append(data,
                                                           ignore_index=True)

    df_questionnaire.to_csv(survey_item_prefix[:-1] + '.csv',
                            encoding='utf-8-sig', sep='\t', index=False)
def process_question_segment(raw_item, survey_item_prefix, study, item_name, df_questionnaire, splitter, country_language):
    """
    Extracts and processes the question segments from a raw item.
    The question segments are always between the {QUESTION} and {ANSWERS} tags, for instance:

    G2
    {QUESTION} Per a ell és important ser ric. Vol tenir molts diners i coses cares.
    {ANSWERS}
    Se sembla molt a mi
    Se sembla a mi
    Se sembla una mica a mi
    Se sembla poc a mi
    No se sembla a mi
    No se sembla gens a mi

    Args:
        param1 raw_item (list): raw survey item, retrieved in previous steps.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): metadata parameter about study embedded in the file name.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.
        param7 country_language (string): country_language metadata, embedded in file name.

    Returns:
        updated df_questionnaire when new valid question segments are included,
        or df_questionnaire in the same state it was when no new valid question
        segments were included.
    """
    # (Removed leftover debug print of the raw item.)
    index_question_tag = raw_item.index('{QUESTION}')
    index_answer_tag = raw_item.index('{ANSWERS}')
    question_segment = raw_item[index_question_tag + 1:index_answer_tag]

    for item in question_segment:
        item = clean_text(item)
        item = expand_interviewer_abbreviations(item, country_language)
        if item != '':
            sentences = splitter.tokenize(item)
            for sentence in sentences:
                # First row of the questionnaire gets a fresh ID; later rows
                # increment it.
                if df_questionnaire.empty:
                    survey_item_id = ut.get_survey_item_id(survey_item_prefix)
                else:
                    survey_item_id = ut.update_survey_item_id(
                        survey_item_prefix)
                # Sentences that read as interviewer instructions are
                # reclassified from REQUEST to INSTRUCTION.
                if check_if_segment_is_instruction(sentence, country_language):
                    item_type = 'INSTRUCTION'
                else:
                    item_type = 'REQUEST'
                data = {
                    "survey_item_ID": survey_item_id,
                    'Study': study,
                    'module': retrieve_item_module(item_name, study),
                    'item_type': item_type,
                    'item_name': item_name,
                    'item_value': None,
                    'text': sentence
                }
                df_questionnaire = df_questionnaire.append(data,
                                                           ignore_index=True)

    return df_questionnaire
def _append_response_row(df_questionnaire, survey_item_id, study, item_name,
                         item_value, text):
    """Append a single RESPONSE segment to the questionnaire dataframe and
    return the updated dataframe. Factored out of process_answer_segment,
    where this pattern was repeated four times."""
    data = {
        "survey_item_ID": survey_item_id,
        'Study': study,
        'module': retrieve_item_module(item_name, study),
        'item_type': 'RESPONSE',
        'item_name': item_name,
        'item_value': item_value,
        'text': text
    }
    return df_questionnaire.append(data, ignore_index=True)


def process_answer_segment(raw_item, survey_item_prefix, study, item_name, df_questionnaire, country_language):
    """
    Extracts and processes the answer segments from a raw item.
    The answer segments are always after the {ANSWERS} tag.
    If there are no answer segments, then the answer segment is the
    corresponding to 'write down' for the target language.

    Args:
        param1 raw_item (list): raw survey item, retrieved in previous steps.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): metadata parameter about study embedded in the file name.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param6 country_language (string): country_language metadata, embedded in file name.

    Returns:
        updated df_questionnaire when new valid answer segments are included,
        or df_questionnaire in the same state it was when no new valid answer
        segments were included.
    """
    index_answer_tag = raw_item.index('{ANSWERS}')
    answer_segment = raw_item[index_answer_tag + 1:]
    ess_special_answer_categories = instantiate_special_answer_category_object(
        country_language)
    responses = []

    # Open question (no answer list): emit the language-specific
    # 'write down' response instead.
    if not answer_segment:
        survey_item_id = ut.update_survey_item_id(survey_item_prefix)
        answer_text = ess_special_answer_categories.write_down[0]
        item_value = ess_special_answer_categories.write_down[1]
        df_questionnaire = _append_response_row(
            df_questionnaire, survey_item_id, study, item_name, item_value,
            answer_text)

    for i, item in enumerate(answer_segment):
        # The ID counter advances even when the answer text is discarded,
        # preserving the original numbering scheme.
        survey_item_id = ut.update_survey_item_id(survey_item_prefix)
        answer_text, answer_value = clean_answer(
            item, ess_special_answer_categories)
        if answer_text:
            # Fall back to the positional index when no explicit value exists.
            if answer_value:
                item_value = str(answer_value)
            else:
                item_value = str(i)
            responses.append(answer_text)
            df_questionnaire = _append_response_row(
                df_questionnaire, survey_item_id, study, item_name, item_value,
                answer_text)

    # Ensure the special categories (don't know / refusal) are always present.
    if ess_special_answer_categories.dont_know[0] not in responses:
        df_questionnaire = _append_response_row(
            df_questionnaire, ut.update_survey_item_id(survey_item_prefix),
            study, item_name, ess_special_answer_categories.dont_know[1],
            ess_special_answer_categories.dont_know[0])
    if ess_special_answer_categories.refuse[0] not in responses:
        df_questionnaire = _append_response_row(
            df_questionnaire, ut.update_survey_item_id(survey_item_prefix),
            study, item_name, ess_special_answer_categories.refuse[1],
            ess_special_answer_categories.refuse[0])

    return df_questionnaire
def process_intro_segment(raw_item, survey_item_prefix, study, item_name, df_questionnaire, splitter):
    """
    Extracts and processes the introduction segments from a raw item.
    The introduction segments sit between the {INTRO} tag and the {QUESTION}
    tag, for instance:

    {INTRO} Ara m'agradaria fer-li algunes preguntes sobre política i el govern.
    B1
    {QUESTION} En quina mesura diria vostè que l'interessa la política?
    Vostè diria que l'interessa...
    {ANSWERS}
    Molt
    Bastant
    Poc
    Gens

    Args:
        param1 raw_item (list): raw survey item, retrieved in previous steps.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): metadata parameter about study embedded in the file name.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.

    Returns:
        updated df_questionnaire when new valid introduction segments are
        included, otherwise df_questionnaire unchanged.
    """
    start = raw_item.index('{INTRO}') + 1
    end = raw_item.index('{QUESTION}')

    for segment in raw_item[start:end]:
        cleaned = clean_text(segment)
        if cleaned == '':
            continue
        for sentence in splitter.tokenize(cleaned):
            # First questionnaire row gets a fresh ID; later rows increment it.
            if df_questionnaire.empty:
                new_id = ut.get_survey_item_id(survey_item_prefix)
            else:
                new_id = ut.update_survey_item_id(survey_item_prefix)
            df_questionnaire = df_questionnaire.append({
                "survey_item_ID": new_id,
                'Study': study,
                'module': retrieve_item_module(item_name, study),
                'item_type': 'INTRODUCTION',
                'item_name': item_name,
                'item_value': None,
                'text': sentence
            }, ignore_index=True)

    return df_questionnaire