Exemple #1
0
def post_process_instruction(df_instruction, df_request, df_post,
                             survey_item_prefix):
    """
    Appends the instruction segments of one item to the post-processed dataframe.

    An instruction is skipped when its text equals the text of the row
    currently at the end of df_post. When df_request contains a 'QItem'
    question element, the item name is rebuilt with adjust_item_name() from
    the first 'QItem' request row; otherwise the instruction keeps its own
    item name.

    Args:
        param1 df_instruction (pandas dataframe): INSTRUCTION rows of the item being processed.
        param2 df_request (pandas dataframe): REQUEST rows of the same item, read for the
        'QuestionElement', 'QuestionElementNr' and 'item_name' columns.
        param3 df_post (pandas dataframe): dataframe holding the rows post-processed so far.
        param4 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        df_post with the retained instruction rows appended.
    """
    for _, row in df_instruction.iterrows():
        # Skip an instruction that repeats the text of the last stored row.
        if not df_post.empty and df_post.iloc[-1]['text'] == row['text']:
            continue

        if 'QItem' in df_request.QuestionElement.unique():
            rowqitem = df_request[df_request['QuestionElement'] == 'QItem']
            item_name = adjust_item_name(
                rowqitem['QuestionElementNr'].values[0],
                rowqitem['item_name'].values[0])
        else:
            item_name = row['item_name']

        # The row is stored whichever way item_name was resolved. The
        # previous version only appended in the non-QItem case while df_post
        # was still empty, which silently dropped the instructions of items
        # that have a 'QItem' request.
        if df_post.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)

        data = {
            'survey_item_ID': survey_item_id,
            'Study': row['Study'],
            'module': row['module'],
            'item_type': row['item_type'],
            'item_name': item_name,
            'item_value': row['item_value'],
            'text': row['text']
        }
        df_post = df_post.append(data, ignore_index=True)

    return df_post
Exemple #2
0
def preprocess_instruction_segment(row, df_questionnaire, survey_item_prefix,
                                   splitter):
    """
    Splits an instruction text into sentences and stores one row per sentence.

    The text in row['text'] is passed through
    replace_abbreviations_and_fills() before it reaches the sentence
    splitter. Every resulting sentence is appended to df_questionnaire as an
    INSTRUCTION segment without item value.

    Args:
        param1 row (pandas dataframe object): dataframe row being currently analyzed.
        param2 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 splitter (NLTK object): NLTK object for sentence segmentation instantiated in accordance to the language.

    Returns:
        df_questionnaire with one INSTRUCTION row appended per sentence.
    """
    expanded_text = replace_abbreviations_and_fills(row['text'])

    for segment in splitter.tokenize(expanded_text):
        # get_survey_item_id() is used while the dataframe is still empty,
        # update_survey_item_id() for every later row.
        if df_questionnaire.empty:
            id_factory = ut.get_survey_item_id
        else:
            id_factory = ut.update_survey_item_id

        new_row = {
            'survey_item_ID': id_factory(survey_item_prefix),
            # The study is the prefix without its last character.
            'Study': survey_item_prefix[:-1],
            'module': retrieve_module_from_item_name(row['name']),
            'item_type': 'INSTRUCTION',
            'item_name': row['name'],
            'item_value': None,
            'text': segment,
        }
        df_questionnaire = df_questionnaire.append(new_row, ignore_index=True)

    return df_questionnaire
Exemple #3
0
def questionnaire_post_processing(df_with_intro, survey_item_prefix):
    """
    Rebuilds the questionnaire item name by item name.

    ut.reset_initial_sufix() is called first. Then, for every distinct value
    of the item_name column, the rows are emitted in the order INSTRUCTION,
    INTRODUCTION, REQUEST/RESPONSE, each row receiving a new survey_item_ID.

    Args:
        param1 df_with_intro (pandas dataframe): questionnaire data, read for the 'item_name',
        'item_type', 'QuestionElement', 'Study', 'module', 'item_value' and 'text' columns.
        param2 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        a new pandas dataframe with the post-processed rows.
    """
    ut.reset_initial_sufix()

    output_columns = [
        'survey_item_ID', 'Study', 'module', 'item_type', 'item_name',
        'item_value', 'text'
    ]
    df_post = pd.DataFrame(columns=output_columns)

    for current_name in df_with_intro.item_name.unique():
        item_rows = df_with_intro[df_with_intro['item_name'] == current_name]
        row_types = item_rows['item_type']

        df_instruction = item_rows[row_types == 'INSTRUCTION']
        df_introduction = item_rows[row_types == 'INTRODUCTION']
        df_request = item_rows[row_types == 'REQUEST']
        df_response = item_rows[row_types == 'RESPONSE']

        # The response frame is handed on without its QuestionElement column.
        del df_response['QuestionElement']

        if not df_instruction.empty:
            df_post = post_process_instruction(df_instruction, df_request,
                                               df_post, survey_item_prefix)

        # An empty introduction frame simply yields no iterations.
        for _, intro_row in df_introduction.iterrows():
            if 'QItem' in df_request.QuestionElement.unique():
                qitem_rows = df_request[df_request['QuestionElement'] ==
                                        'QItem']
                resolved_name = adjust_item_name(
                    qitem_rows['QuestionElementNr'].values[0],
                    qitem_rows['item_name'].values[0])
            else:
                resolved_name = intro_row['item_name']

            if df_post.empty:
                new_id = ut.get_survey_item_id(survey_item_prefix)
            else:
                new_id = ut.update_survey_item_id(survey_item_prefix)

            df_post = df_post.append(
                {
                    'survey_item_ID': new_id,
                    'Study': intro_row['Study'],
                    'module': intro_row['module'],
                    'item_type': intro_row['item_type'],
                    'item_name': resolved_name,
                    'item_value': intro_row['item_value'],
                    'text': intro_row['text'],
                },
                ignore_index=True)

        df_post = post_process_request_response(df_request, df_response,
                                                df_post, survey_item_prefix)

    return df_post
Exemple #4
0
def process_request(df, row, survey_item_prefix, item_name, module, splitter):
    """
    Splits the request text of a row into sentences and stores them in df.

    The text is read from the 'Request for answer text' column, cleaned with
    clean_text() and segmented. A sentence for which
    check_if_segment_is_instruction() is true is stored as INSTRUCTION,
    every other one as REQUEST. Sentences equal to '...' are discarded.

    Args:
        param1 df (pandas dataframe): dataframe to store processed questionnaire data.
        param2 row (pandas dataframe row): row of dataframe with contents of the input
        file being analyzed in outer loop.
        param3 survey_item_prefix (string): prefix of survey_item_ID.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 module (string): module metadata parameter, retrieved in previous steps.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.

    Returns:
        df with the new segments appended, or df unchanged when the request
        cell is '.' or is not a string.
    """
    request_text = row['Request for answer text']
    study, country_language = get_country_language_and_study_info(
        survey_item_prefix)

    # Nothing to extract from a lone '.' or from a non-string cell.
    if request_text == '.' or not isinstance(request_text, str):
        return df

    for segment in splitter.tokenize(clean_text(request_text)):
        if segment == '...':
            continue

        if df.empty:
            new_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            new_id = ut.update_survey_item_id(survey_item_prefix)

        is_instruction = check_if_segment_is_instruction(
            segment, country_language)

        df = df.append(
            {
                'survey_item_ID': new_id,
                'Study': study,
                'module': module,
                'item_type': 'INSTRUCTION' if is_instruction else 'REQUEST',
                'item_name': item_name,
                'item_value': None,
                'text': segment,
            },
            ignore_index=True)

    return df
Exemple #5
0
def _build_wis_record(row, study, item_type, wis_item_type, item_name,
                      item_value, survey_item_id_source, translation_targets):
    """
    Builds the df_questionnaire record for a single WIS row.

    Args:
        param1 row (pandas dataframe row): WIS row providing the module ('PAGE'), the source text
        and the translated texts.
        param2 study (string): the name of the study.
        param3 item_type (string): harmonized item type.
        param4 wis_item_type (string): item type as found in the WIS export.
        param5 item_name (string): item name to store in the record.
        param6 item_value: content of the 'VALUES of VAR' cell.
        param7 survey_item_id_source (string): survey_item_ID of the source text.
        param8 translation_targets (list of tuples): one (output column prefix, WIS text column,
        survey item prefix) triple per target language.

    Returns:
        a dictionary ready to be appended to df_questionnaire.
    """
    data = {
        'Study': study,
        'module': row['PAGE'],
        'item_type': item_type,
        'wis_item_type': wis_item_type,
        'item_name': item_name,
        'item_value': item_value,
        'ENG_SOURCE_survey_item_ID': survey_item_id_source,
        'ENG_SOURCE_text': row['source'],
    }
    # Keys are inserted in the order of translation_targets: the ID column
    # first, then the text column of the same language.
    for column_prefix, text_column, survey_item_prefix in translation_targets:
        data[column_prefix + '_survey_item_ID'] = ut.get_survey_item_id(
            survey_item_prefix)
        data[column_prefix + '_text'] = row[text_column]

    return data


def extract_wis_data(df, df_questionnaire, study):
    """
    Extracts and preprocesses WIS data from df, attributing MCSQ metadata (and also
    harmonizing metadata e.g. item names, item types, when necessary).

    Rows are skipped when 'PAGE' is not a string or is 'ALERT_1'/'ALERT_2',
    when the source cell reads 'TRANSLATION IN TASK_API', when the en_GB
    cell contains 'NO TRANSLATION', or when the source cell is neither a
    string nor an int. A 'matrix question' row that is not directly followed
    by a 'matrix option' row gets a numbered item name and is followed by all
    'matrix option' rows sharing its unique identifier.

    Side effects on the inputs: df receives a 'NEXT_ITEM_TYPE' column (and an
    'item_name' column in the matrix branch), and the item_name of the last
    row of df_questionnaire is overwritten in the matrix branch.

    Args:
        param1 df (pandas dataframe): the input data in a dataframe representation.
        param2 df_questionnaire (pandas dataframe): a dataframe to hold the processed questionnaire data.
        param3 study (string): the name of the study, embedded in the WIS export filename.

    Returns:
        the df_questionnaire (pandas dataframe) with the preprocessed data.

    """
    survey_item_prefix_source = instantiate_survey_item_prefix(study, 'source')
    # (output column prefix, WIS column holding the text, survey item prefix)
    translation_targets = [
        (column_prefix, locale, instantiate_survey_item_prefix(study, locale))
        for column_prefix, locale in (
            ('ENG_GB', 'en_GB'),
            ('FRE_FR', 'fr_FR'),
            ('POR_PT', 'pt_PT'),
            ('RUS_RU', 'ru_RU'),
            ('GER_DE', 'de_DE'),
            ('SPA_ES', 'es_ES'),
            ('NOR_NO', 'no_NO'),
            ('CZE_CZ', 'cs_CZ'),
        )
    ]

    df['NEXT_ITEM_TYPE'] = df['ITEM_TYPE'].shift(-1)

    # flag is 1 while consecutive matrix questions are being numbered.
    flag = 0
    for _, row in df.iterrows():
        page = row['PAGE']
        if not isinstance(page, str) or page in ('ALERT_1', 'ALERT_2'):
            continue

        item_name = simplify_item_name(row['UNIQUE IDENTIFIER (PER SURVEY)'])
        item_value = row['VALUES of VAR']
        wis_item_type = row['ITEM_TYPE']
        item_type = harmonize_item_type(wis_item_type)

        if (str(row['source']) == 'TRANSLATION IN TASK_API'
                or 'NO TRANSLATION' in str(row['en_GB'])):
            continue
        if not isinstance(row['source'], (str, int)):
            continue

        if (wis_item_type == 'matrix question'
                and row['NEXT_ITEM_TYPE'] != 'matrix option'):
            if flag == 0:
                count = 0
                flag = 1
            else:
                count = count + 1
            matrix_item_name = item_name + '_' + str(count)

            # Rename the row stored just before this matrix question.
            # NOTE(review): this indexes the last row, so it raises on an
            # empty df_questionnaire - the emptiness check further down can
            # therefore never be true in this branch. Confirm intent.
            df_questionnaire.iloc[
                -1,
                df_questionnaire.columns.get_loc('item_name'
                                                 )] = matrix_item_name

            # NOTE(review): 'LAST_ITEM_NAME' is not created in this function
            # and df['item_name'] is not read again here - presumably both
            # are handled by the caller; verify.
            df['item_name'] = df['UNIQUE IDENTIFIER (PER SURVEY)'].replace(
                to_replace=row['LAST_ITEM_NAME'], value=item_name + '_introD')

            df_by_item_name = df[df['UNIQUE IDENTIFIER (PER SURVEY)'] ==
                                 row['UNIQUE IDENTIFIER (PER SURVEY)']]
            df_by_item_name_responses = df_by_item_name[
                df_by_item_name['ITEM_TYPE'] == 'matrix option']

            if df_questionnaire.empty:
                survey_item_id_source = ut.get_survey_item_id(
                    survey_item_prefix_source)
            else:
                survey_item_id_source = ut.update_survey_item_id(
                    survey_item_prefix_source)

            df_questionnaire = df_questionnaire.append(
                _build_wis_record(row, study, item_type, wis_item_type,
                                  matrix_item_name, item_value,
                                  survey_item_id_source, translation_targets),
                ignore_index=True)

            # Dedicated names keep the outer loop's row and metadata intact.
            for _, response_row in df_by_item_name_responses.iterrows():
                response_id_source = ut.update_survey_item_id(
                    survey_item_prefix_source)
                response_item_name = simplify_item_name(
                    response_row['UNIQUE IDENTIFIER (PER SURVEY)'])
                response_item_value = response_row['VALUES of VAR']
                response_wis_item_type = response_row['ITEM_TYPE']
                response_item_type = harmonize_item_type(
                    response_wis_item_type)

                df_questionnaire = df_questionnaire.append(
                    _build_wis_record(
                        response_row, study, response_item_type,
                        response_wis_item_type,
                        response_item_name + '_' + str(count),
                        response_item_value, response_id_source,
                        translation_targets),
                    ignore_index=True)
        else:
            flag = 0
            if df_questionnaire.empty:
                survey_item_id_source = ut.get_survey_item_id(
                    survey_item_prefix_source)
            else:
                survey_item_id_source = ut.update_survey_item_id(
                    survey_item_prefix_source)

            df_questionnaire = df_questionnaire.append(
                _build_wis_record(row, study, item_type, wis_item_type,
                                  item_name, item_value,
                                  survey_item_id_source, translation_targets),
                ignore_index=True)

    return df_questionnaire
Exemple #6
0
def _append_response_rows(df_post, df_response, item_name,
                          survey_item_prefix):
    """
    Appends every row of df_response to df_post under the given item name.

    Args:
        param1 df_post (pandas dataframe): dataframe holding the rows post-processed so far.
        param2 df_response (pandas dataframe): RESPONSE rows to append.
        param3 item_name (string): item name stored with every appended response.
        param4 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        df_post with one row appended per response.
    """
    for _, response_row in df_response.iterrows():
        data = {
            'survey_item_ID': ut.update_survey_item_id(survey_item_prefix),
            'Study': response_row['Study'],
            'module': response_row['module'],
            'item_type': response_row['item_type'],
            'item_name': item_name,
            'item_value': response_row['item_value'],
            'text': response_row['text']
        }
        df_post = df_post.append(data, ignore_index=True)

    return df_post


def post_process_request_response(df_request, df_response, df_post,
                                  survey_item_prefix):
    """
    Appends the request and response segments of one item to the post-processed dataframe.

    A request is skipped when its text equals the text of the row currently
    at the end of df_post. When df_request contains a 'QItem' question
    element, request item names are rebuilt with adjust_item_name(). The
    responses are repeated after every stored request whose QuestionElement
    is 'QItem'; if the last stored row is still not a RESPONSE once all
    requests are handled, the responses are appended once more under that
    row's item name.

    Args:
        param1 df_request (pandas dataframe): REQUEST rows of the item being processed.
        param2 df_response (pandas dataframe): RESPONSE rows of the same item.
        param3 df_post (pandas dataframe): dataframe holding the rows post-processed so far.
        param4 survey_item_prefix (string): prefix of survey_item_ID.

    Returns:
        df_post with the request and response rows appended.
    """
    for _, row in df_request.iterrows():
        # Skip a request that repeats the text of the last stored row.
        if not df_post.empty and df_post.iloc[-1]['text'] == row['text']:
            continue

        if 'QItem' in df_request.QuestionElement.unique():
            item_name = adjust_item_name(row['QuestionElementNr'],
                                         row['item_name'])
        else:
            item_name = row['item_name']

        if df_post.empty:
            survey_item_id = ut.get_survey_item_id(survey_item_prefix)
        else:
            survey_item_id = ut.update_survey_item_id(survey_item_prefix)

        data = {
            'survey_item_ID': survey_item_id,
            'Study': row['Study'],
            'module': row['module'],
            'item_type': row['item_type'],
            'item_name': item_name,
            'item_value': row['item_value'],
            'text': row['text']
        }
        df_post = df_post.append(data, ignore_index=True)

        if not df_response.empty and row['QuestionElement'] == 'QItem':
            df_post = _append_response_rows(df_post, df_response, item_name,
                                            survey_item_prefix)

    if not df_response.empty:
        # NOTE(review): raises IndexError when df_post is still empty here
        # (responses present but no row stored so far) - confirm whether
        # that input can occur.
        last_row = df_post.iloc[-1]
        if last_row['item_type'] != 'RESPONSE':
            df_post = _append_response_rows(df_post, df_response,
                                            last_row['item_name'],
                                            survey_item_prefix)

    return df_post
Exemple #7
0
def main(filename):
    """
    Converts one XML questionnaire file into a tab-separated spreadsheet.

    The questions/instructions and answers nodes of the XML file are
    flattened into intermediate dataframes, merged item name by item name
    into the questionnaire dataframe and written to a file named after
    survey_item_prefix (without its last character) plus '.csv'.

    Args:
        param1 filename (string): path of the input XML file. The name is also searched for
        country/language codes (e.g. 'GER_AT') to override special answer categories.
    """
    extract_source = 0

    # Parse the input XML file by filename.
    file = str(filename)
    tree = ET.parse(file)
    root = tree.getroot()

    # Create a dictionary containing parent-child relations of the parsed
    # tree. ElementTree.getiterator() was removed in Python 3.9; iter()
    # walks the same elements.
    parent_map = {child: parent for parent in tree.iter() for child in parent}
    ess_questions_instructions = root.findall('.//questionnaire/questions')
    ess_answers = root.findall('.//questionnaire/answers')

    df_questionnaire, survey_item_prefix, study, country_language, splitter = set_initial_structures(
        filename, extract_source)
    ess_special_answer_categories = instantiate_special_answer_category_object(
        country_language)

    # Per-file overrides of the first refuse / don't know answer text.
    if 'GER_AT' in filename:
        ess_special_answer_categories.refuse[0] = 'Verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiß nicht'

    if 'GER_CH' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'
        ess_special_answer_categories.dont_know[0] = 'Weiss nicht'

    if 'GER_DE' in filename:
        ess_special_answer_categories.refuse[0] = 'Antwort verweigert'

    if 'NOR_NO' in filename:
        ess_special_answer_categories.refuse[0] = 'Nekter'

    df_question_instruction = pd.DataFrame(
        columns=['answer_id', 'item_name', 'item_type', 'text'])
    df_answers = pd.DataFrame(
        columns=['answer_id', 'item_name', 'text', 'item_value'])

    df_question_instruction = process_question_instruction_node(
        ess_questions_instructions, df_question_instruction, parent_map,
        splitter, country_language, extract_source)
    df_answers = process_answer_node(ess_answers, df_answers, parent_map,
                                     ess_special_answer_categories,
                                     extract_source)

    unique_item_names_question_instruction = df_question_instruction.item_name.unique(
    )

    for unique_item_name in unique_item_names_question_instruction:
        # Rows are matched to the item name case-insensitively.
        lowered_item_name = unique_item_name.lower()
        df_question_instruction_by_item_name = df_question_instruction[
            df_question_instruction['item_name'].str.lower() ==
            lowered_item_name]
        df_answers_by_item_name = df_answers[
            df_answers['item_name'].str.lower() == lowered_item_name]
        module = retrieve_item_module(unique_item_name, study)

        last_item_name = ''
        for _, row in df_question_instruction_by_item_name.iterrows():
            item_name = row['item_name']

            # 'Instruction' and 'Intro' rows take the name of the last row
            # stored for this item name ('' when none was stored yet).
            if item_name == 'Instruction' or item_name == 'Intro':
                item_name = last_item_name

            if 'Row ' not in item_name and item_name != 'CI' and item_name != 'outro' and 'istration Note' not in item_name and item_name != 'box':
                if df_questionnaire.empty:
                    survey_item_id = ut.get_survey_item_id(survey_item_prefix)
                else:
                    survey_item_id = ut.update_survey_item_id(
                        survey_item_prefix)

                if check_if_segment_is_instruction(row['text'],
                                                   country_language):
                    item_type = 'INSTRUCTION'
                else:
                    item_type = row['item_type']

                data = {
                    'survey_item_ID': survey_item_id,
                    'Study': study,
                    'module': module,
                    'item_type': item_type,
                    'item_name': item_name,
                    'item_value': None,
                    'text': row['text']
                }
                df_questionnaire = df_questionnaire.append(data,
                                                           ignore_index=True)

                last_item_name = item_name

        # Answers reuse the item_name left behind by the loop above.
        # NOTE(review): df_answers is created without an 'item_type' column;
        # presumably process_answer_node() adds it - verify.
        for _, row in df_answers_by_item_name.iterrows():
            # The literal string 'None' stands for a missing item value.
            if row['item_value'] == 'None':
                item_value = None
            else:
                item_value = row['item_value']

            data = {
                'survey_item_ID':
                ut.update_survey_item_id(survey_item_prefix),
                'Study': study,
                'module': module,
                'item_type': row['item_type'],
                'item_name': item_name,
                'item_value': item_value,
                'text': row['text']
            }
            df_questionnaire = df_questionnaire.append(data,
                                                       ignore_index=True)

    df_questionnaire.to_csv(survey_item_prefix[:-1] + '.csv',
                            encoding='utf-8-sig',
                            sep='\t',
                            index=False)
Exemple #8
0
def process_question_segment(raw_item, survey_item_prefix, study, item_name,
                             df_questionnaire, splitter, country_language):
    """
    Extracts and processes the question segments from a raw item.
    The question segments are always between the {QUESTION} and {ANSWERS} tags,
    for instance:

    G2
    {QUESTION}
    Per a ell és important ser ric.
    Vol tenir molts diners i coses cares.

    {ANSWERS}
    Se sembla molt a mi
    Se sembla a mi
    Se sembla una mica a mi
    Se sembla poc a mi
    No se sembla a mi
    No se sembla gens a mi

    Each line between the tags is cleaned, has its interviewer abbreviations
    expanded and is split into sentences. A sentence for which
    check_if_segment_is_instruction() is true is stored as INSTRUCTION,
    every other one as REQUEST.

    Args:
        param1 raw_item (list): raw survey item, retrieved in previous steps.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): metadata parameter about study embedded in the file name.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.
        param7 country_language (string): country_language metadata, embedded in file name.

    Returns:
        updated df_questionnaire when new valid question segments are included, or df_questionnaire in the same state it
        was when no new valid question segments were included.

    Raises:
        ValueError: when raw_item lacks the {QUESTION} or the {ANSWERS} tag.
    """
    index_question_tag = raw_item.index('{QUESTION}')
    index_answer_tag = raw_item.index('{ANSWERS}')

    for item in raw_item[index_question_tag + 1:index_answer_tag]:
        item = clean_text(item)
        item = expand_interviewer_abbreviations(item, country_language)
        # Lines that are empty after cleaning carry no segment.
        if item == '':
            continue

        for sentence in splitter.tokenize(item):
            if df_questionnaire.empty:
                survey_item_id = ut.get_survey_item_id(survey_item_prefix)
            else:
                survey_item_id = ut.update_survey_item_id(survey_item_prefix)

            if check_if_segment_is_instruction(sentence, country_language):
                item_type = 'INSTRUCTION'
            else:
                item_type = 'REQUEST'

            data = {
                'survey_item_ID': survey_item_id,
                'Study': study,
                'module': retrieve_item_module(item_name, study),
                'item_type': item_type,
                'item_name': item_name,
                'item_value': None,
                'text': sentence
            }
            df_questionnaire = df_questionnaire.append(data,
                                                       ignore_index=True)

    return df_questionnaire
Exemple #9
0
def process_intro_segment(raw_item, survey_item_prefix, study, item_name,
                          df_questionnaire, splitter):
    """
    Extracts and processes the introduction segments from a raw item.
    Every entry of raw_item found after the {INTRO} tag and before the
    {QUESTION} tag is treated as introduction text, for instance:

    {INTRO}
    Ara m'agradaria fer-li algunes preguntes sobre política i el govern.

    B1
    {QUESTION}
    En quina mesura diria vostè que l'interessa la política?
    Vostè diria que l'interessa...

    {ANSWERS}
    Molt
    Bastant
    Poc
    Gens

    Each entry is cleaned with clean_text(), split into sentences and stored
    as INTRODUCTION segments without item value.

    Args:
        param1 raw_item (list): raw survey item, retrieved in previous steps.
        param2 survey_item_prefix (string): prefix of survey_item_ID.
        param3 study (string): metadata parameter about study embedded in the file name.
        param4 item_name (string): item_name metadata parameter, retrieved in previous steps.
        param5 df_questionnaire (pandas dataframe): pandas dataframe to store questionnaire data.
        param6 splitter (NLTK object): sentence segmentation from NLTK library.

    Returns:
        df_questionnaire with the new introduction segments appended, or
        unchanged when no entry survives the cleaning.
    """
    intro_start = raw_item.index('{INTRO}') + 1
    intro_end = raw_item.index('{QUESTION}')

    for raw_line in raw_item[intro_start:intro_end]:
        cleaned_line = clean_text(raw_line)
        # Entries that are empty after cleaning carry no segment.
        if cleaned_line == '':
            continue

        for segment in splitter.tokenize(cleaned_line):
            if df_questionnaire.empty:
                id_factory = ut.get_survey_item_id
            else:
                id_factory = ut.update_survey_item_id

            entry = {
                'survey_item_ID': id_factory(survey_item_prefix),
                'Study': study,
                'module': retrieve_item_module(item_name, study),
                'item_type': 'INTRODUCTION',
                'item_name': item_name,
                'item_value': None,
                'text': segment,
            }
            df_questionnaire = df_questionnaire.append(entry,
                                                       ignore_index=True)

    return df_questionnaire