def _get_multi_choice_data(
        self, question_metadata: QuestionMetadata
) -> Series:
    """
    Collapse the columns of a multi-choice question into a single Series.

    Columns are selected on the first level of the two-level column index of
    ``self.survey_data``: by equality with ``question_metadata.text`` when no
    expression is set, otherwise by calling ``match`` with
    ``question_metadata.expression`` (presumably ``re.match`` -- confirm
    against the file's imports).

    :param question_metadata: Metadata of the question to extract.
    :return: Series named after the first matched top-level label and indexed
        like ``self.survey_data``. Each value is the row's non-null answers
        joined with CATEGORY_SPLITTER, or nan when the row has no answers.
    :raises ValueError: If no column matches the text / expression.
    """
    expression = question_metadata.expression
    columns = self.survey_data.columns
    column_labels = zip(
        columns.get_level_values(0).to_list(),
        columns.get_level_values(1).to_list(),
    )
    if expression is None:
        match_cols = [
            label for label in column_labels
            if label[0] == question_metadata.text
        ]
    else:
        match_cols = [
            label for label in column_labels
            if match(expression, label[0])
        ]

    if not match_cols:
        if expression is None:
            raise ValueError(
                f'Could not find any columns matching '
                f'"{question_metadata.text}"'
            )
        raise ValueError(
            f'Could not match expression '
            f'"{expression}" '
            f'for MultiChoice question "{question_metadata}"'
        )

    def merge_answers(row):
        # A row without any answer stays missing instead of becoming ''.
        answered = row.dropna()
        if len(answered):
            return CATEGORY_SPLITTER.join(answered.astype(str))
        return nan

    merged = self.survey_data[match_cols].apply(merge_answers, axis=1)
    return Series(
        name=match_cols[0][0],
        index=self.survey_data.index,
        data=merged
    )
def replace_item(item: str):
    """
    Apply ``replacements`` to every category of a joined answer string.

    :param item: Answer string whose categories are separated by
        CATEGORY_SPLITTER, or a null value.
    :return: nan for null input; otherwise the string re-joined with
        CATEGORY_SPLITTER after swapping each category that is a key of
        ``replacements`` for its mapped value. Other categories are kept.
    """
    if isnull(item):
        return nan
    renamed = []
    for part in item.split(CATEGORY_SPLITTER):
        if part in replacements:
            part = replacements[part]
        renamed.append(part)
    return CATEGORY_SPLITTER.join(renamed)
def rank_choices(value: str):
    """
    Turn a ``'choice:rank | choice:rank'`` string into choices ordered by rank.

    :param value: Entries separated by ``' | '``; in each entry the text
        before the first ``':'`` is the choice and the text between the first
        and second ``':'`` is parsed as an integer rank.
    :return: The choices joined with CATEGORY_SPLITTER in ascending rank
        order (ties are ordered by choice text).
    """
    ranked_pairs = []
    for entry in value.split(' | '):
        fields = entry.split(':')
        ranked_pairs.append((int(fields[1]), fields[0]))
    # Tuples sort by rank first, then by choice text.
    ranked_pairs.sort()
    return CATEGORY_SPLITTER.join(choice for _, choice in ranked_pairs)
def get_selected(item: str):
    """
    Convert a ``'; '``-separated answer into a CATEGORY_SPLITTER-joined one.

    The text is consumed from the left: whenever the remaining text equals a
    category, or starts with a category followed by ``'; '``, that category
    is recorded and stripped off. Matching is done against the enclosing
    scope's ``categories`` rather than by splitting on ``'; '`` (presumably
    because category names may themselves contain the separator -- confirm).

    :param item: Raw answer text, or a null value.
    :return: nan for null input; otherwise the recognised categories joined
        with CATEGORY_SPLITTER, in the order they were consumed.
    :raises ValueError: If the remaining text does not start with any known
        category. Without this check the loop would never terminate.
    """
    if isnull(item):
        return nan
    remaining = item
    selected = []
    while len(remaining) > 0:
        found_before = len(selected)
        for category in categories:
            if remaining == category:
                selected.append(category)
                remaining = ''
            elif remaining.startswith(category + '; '):
                selected.append(category)
                remaining = remaining[len(category) + 2:]
        if len(selected) == found_before:
            # A full pass consumed nothing, so no later pass can either.
            raise ValueError(
                f'Could not match "{remaining}" to any known category '
                f'in "{item}"'
            )
    return CATEGORY_SPLITTER.join(selected)
def _get_multi_choice_data(self, question_metadata: QuestionMetadata) -> Series:
    """
    Collapse the columns of a multi-choice question into a single Series.

    Columns of ``self.survey_data`` are selected by calling ``match`` with
    ``question_metadata.expression`` and the column label (presumably
    ``re.match`` -- confirm against the file's imports).

    :param question_metadata: Metadata of the question to extract; its
        ``expression`` must be set.
    :return: One value per row: the row's non-null answers, converted to
        str and joined with CATEGORY_SPLITTER.
    :raises ValueError: If the expression is None or matches no column.
    """
    pattern = question_metadata.expression
    if pattern is None:
        raise ValueError('Need a regular expression to match '
                         'MultiChoice question columns.')

    match_cols = []
    for column in self.survey_data.columns:
        if match(pattern, column):
            match_cols.append(column)
    if not match_cols:
        raise ValueError(
            f'Could not match expression "{pattern}" '
            f'for MultiChoice question "{question_metadata}"')

    def join_answers(row):
        return CATEGORY_SPLITTER.join(row.dropna().astype(str))

    selected = self.survey_data[match_cols]
    return selected.apply(join_answers, axis=1)
def _get_multi_choice_data(
        self, question_metadata: QuestionMetadata
) -> Series:
    """
    Collapse the columns of a multi-choice question into a single Series.

    The columns are looked up with ``self.survey_data[question_metadata.text]``.
    If the question's text is listed in ``self.questions_metadata`` and its
    first matching row has ``other == True``, every non-null answer that is
    not among the question's ordered categories is replaced by ``'Other'``,
    and an ``'Other'`` row is appended to ``self.orders_metadata``.

    :param question_metadata: Metadata of the question to extract.
    :return: One value per row: the row's non-null answers, converted to
        str and joined with CATEGORY_SPLITTER.
    """
    # Local import: keeps this block independent of the file's import list.
    from pandas import DataFrame, concat

    # get data
    data: DataFrame = self.survey_data[question_metadata.text]

    # replace other values
    # this if condition excludes repeated questions that have had their
    # text changed
    if question_metadata.text in self.questions_metadata['text'].to_list():
        metadata: Series = self.questions_metadata.loc[
            self.questions_metadata['text'] == question_metadata.text
        ].iloc[0]
        # Kept as an equality test (not truthiness) so that only True / 1
        # enables the replacement.
        if 'other' in metadata.keys() and metadata['other'] == True:  # noqa: E712
            category_name = metadata['categories']
            categories = self.orders_metadata.loc[
                self.orders_metadata['category'] == category_name, 'value'
            ].to_list()

            def replace_other(value):
                if isnull(value) or value in categories:
                    return value
                return 'Other'

            # DataFrame.applymap is deprecated since pandas 2.1 in favour of
            # DataFrame.map; use whichever this pandas version provides.
            if hasattr(data, 'map'):
                data = data.map(replace_other)
            else:
                data = data.applymap(replace_other)

            # update categories in orders_metadata
            # DataFrame.append was removed in pandas 2.0; concat with a
            # one-row frame gives the same result on old and new versions.
            # NOTE(review): the new row is filed under question_metadata.name
            # while the lookup above uses metadata['categories'] -- confirm
            # these are meant to differ.
            other_row = DataFrame([
                {'category': question_metadata.name, 'value': 'Other'}
            ])
            self.orders_metadata = concat(
                [self.orders_metadata, other_row],
                ignore_index=True
            )

    # merge multi-choice questions to single column
    return data.apply(
        lambda row: CATEGORY_SPLITTER.join(row.dropna().astype(str)),
        axis=1
    )
def _get_ranked_choice_data(self, question_metadata: QuestionMetadata) -> list:
    """
    Merge the columns of a ranked-choice question into one answer per row.

    Columns of ``self.survey_data`` are selected by calling ``match`` with
    ``question_metadata.expression`` and the column label; each selected
    column is translated to a choice by ``self._get_ranked_choice_name``.
    The cell values of a row are used as the ranks of those choices.

    :param question_metadata: Metadata of the question to extract; its
        ``expression`` must be set.
    :return: One string per row of ``self.survey_data``: the choices ordered
        by ascending rank and joined with CATEGORY_SPLITTER.
    :raises ValueError: If the expression is None.
    """
    pattern = question_metadata.expression
    if pattern is None:
        raise ValueError('Need a regular expression to match '
                         'RankedChoice question columns.')

    match_cols = list(filter(
        lambda column: match(pattern, column), self.survey_data.columns
    ))
    choices = list(map(self._get_ranked_choice_name, match_cols))

    def order_by_rank(ranks):
        # Pairs sort by rank first, then by choice.
        # NOTE(review): missing ranks are sorted along with the rest --
        # confirm that every respondent ranks every choice.
        ordered = sorted(zip(ranks, choices))
        return CATEGORY_SPLITTER.join(str(choice) for _, choice in ordered)

    return [
        order_by_rank(row.tolist())
        for _, row in self.survey_data[match_cols].iterrows()
    ]
def stack(self, name: str,
          drop_na: bool = True,
          null_category: Optional[str] = None,
          name_index: Optional[str] = None,
          key_index: Optional[str] = None,
          number_index: Optional[str] = None,
          **kwargs):
    """
    Stack the responses to each question in the group into a new question.

    :param name: Name for the new question.
    :param drop_na: For MultiChoiceQuestions, whether to drop rows where the
                    respondent was not asked the question.
    :param null_category: For MultiChoiceQuestions, optional response to
                          exclude from the new question group.
    :param name_index: Name of a new index column to create with values
                       corresponding to the name of the question the data
                       comes from.
    :param key_index: Name of a new index column to create with values
                      corresponding to the question's key in the group.
    :param number_index: Name of a new index column to create with values
                         corresponding to the number of the question the
                         data comes from.
    :param kwargs: Optional new attribute values to override in the new
                   question.
    :return: A shallow copy of the first question in the group, renamed to
             `name`, whose data holds the stacked responses of every
             question with the selected choices joined by CATEGORY_SPLITTER.
    :raises ValueError: If `name` is an empty string.
    """
    if name == '':
        raise ValueError('Name cannot be empty.')

    # create index names
    index_names = [self._questions[0].data.index.name]
    for extra_index_name in (name_index, key_index, number_index):
        if extra_index_name is not None:
            index_names.append(extra_index_name)

    question_datas = []
    question: MultiChoiceQuestion
    for question in self._questions:
        # create data
        # (presumably one 0/1 indicator column per choice -- confirm
        # against make_features)
        question_data = question.make_features(
            naming='{{choice}}', drop_na=drop_na,
            null_category=null_category
        )
        # create index: the original index values plus one constant level
        # for each requested extra index column
        num_rows = len(question_data)
        index_lists = [question_data.index.to_list()]
        if name_index is not None:
            index_lists.append([question.name] * num_rows)
        if key_index is not None:
            question_key = [
                k for k in self._item_dict.keys()
                if self._item_dict[k] == question
            ][0]
            index_lists.append([question_key] * num_rows)
        if number_index is not None:
            index_lists.append(
                [self._questions.index(question)] * num_rows
            )
        if len(index_lists) == 1:
            question_data.index = Index(data=index_lists[0],
                                        name=index_names[0])
        else:
            question_data.index = MultiIndex.from_tuples(
                tuples=list(zip(*index_lists)), names=index_names
            )
        question_datas.append(question_data)

    new_data = concat(question_datas, axis=0)
    # Replace each indicator by the choice label (1) or an empty string (0).
    # DataFrame.iteritems() was removed in pandas 2.0; items() is available
    # in every pandas version.
    new_data = DataFrame({
        column: column_data.replace({1: column, 0: ''})
        for column, column_data in new_data.items()
    })
    # Join the labels of the selected choices of each row.
    new_data = Series(
        data=[
            CATEGORY_SPLITTER.join(row.replace('', nan).dropna().to_list())
            for _, row in new_data.iterrows()
        ],
        index=new_data.index, name=name
    )

    # copy question
    new_question = copy(self._questions[0])
    new_question.name = name
    new_question._data = new_data
    if isinstance(new_question, MultiChoiceQuestion):
        if null_category is not None:
            new_question._categories = [
                c for c in new_question._categories
                if c != null_category
            ]
    for kw, arg in kwargs.items():
        setattr(new_question, kw, arg)
    return new_question