def extract_csv(classification_csv, config, output_dir=os.path.abspath('.'), output_name='extractions', order=False, verbose=False):
    # load the extractor configuration (workflow ID, version, and extractor keywords)
    config = get_file_instance(config)
    with config as config_in:
        config_yaml = yaml.load(config_in, Loader=yaml.SafeLoader)

    extractor_config = config_yaml['extractor_config']
    workflow_id = config_yaml['workflow_id']
    version = config_yaml['workflow_version']

    blank_extracted_data = OrderedDict([
        ('classification_id', []),
        ('user_name', []),
        ('user_id', []),
        ('workflow_id', []),
        ('task', []),
        ('created_at', []),
        ('subject_id', []),
        ('extractor', []),
        ('data', [])
    ])
    extracted_data = {}

    classification_csv = get_file_instance(classification_csv)
    with classification_csv as classification_csv_in:
        classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})

    # only keep classifications for the configured workflow and version
    wdx = classifications.workflow_id == workflow_id
    assert (wdx.sum() > 0), 'There are no classifications matching the configured workflow ID'
    if '.' in version:
        vdx = classifications.workflow_version == version
    else:
        vdx = classifications.workflow_version.apply(get_major_version) == version
    assert (vdx.sum() > 0), 'There are no classifications matching the configured version number'
    assert ((vdx & wdx).sum() > 0), 'There are no classifications matching the combined workflow ID and version number'

    widgets = [
        'Extracting: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=(wdx & vdx).sum())
    counter = 0
    pbar.start()
    # run every configured extractor on each matching classification and collect the rows
    for cdx, classification in classifications[wdx & vdx].iterrows():
        classification_by_task = annotation_by_task({'annotations': json.loads(classification.annotations)})
        for extractor_name, keywords in extractor_config.items():
            extractor_key = extractor_name
            if 'shape_extractor' in extractor_name:
                extractor_key = 'shape_extractor'
            for keyword in keywords:
                if extractor_key in extractors.extractors:
                    try:
                        extract = extractors.extractors[extractor_key](copy.deepcopy(classification_by_task), **keyword)
                    except Exception:
                        if verbose:
                            print()
                            print('Incorrectly formatted annotation')
                            print(classification)
                            print(extractor_key)
                            print(classification_by_task)
                            print(keyword)
                        continue
                    if isinstance(extract, list):
                        for e in extract:
                            extracted_data.setdefault(extractor_name, copy.deepcopy(blank_extracted_data))
                            extracted_data[extractor_name]['classification_id'].append(classification.classification_id)
                            extracted_data[extractor_name]['user_name'].append(classification.user_name)
                            extracted_data[extractor_name]['user_id'].append(classification.user_id)
                            extracted_data[extractor_name]['workflow_id'].append(classification.workflow_id)
                            extracted_data[extractor_name]['task'].append(keyword['task'])
                            extracted_data[extractor_name]['created_at'].append(classification.created_at)
                            extracted_data[extractor_name]['subject_id'].append(classification.subject_ids)
                            extracted_data[extractor_name]['extractor'].append(extractor_name)
                            extracted_data[extractor_name]['data'].append(e)
                    else:
                        extracted_data.setdefault(extractor_name, copy.deepcopy(blank_extracted_data))
                        extracted_data[extractor_name]['classification_id'].append(classification.classification_id)
                        extracted_data[extractor_name]['user_name'].append(classification.user_name)
                        extracted_data[extractor_name]['user_id'].append(classification.user_id)
                        extracted_data[extractor_name]['workflow_id'].append(classification.workflow_id)
                        extracted_data[extractor_name]['task'].append(keyword['task'])
                        extracted_data[extractor_name]['created_at'].append(classification.created_at)
                        extracted_data[extractor_name]['subject_id'].append(classification.subject_ids)
                        extracted_data[extractor_name]['extractor'].append(extractor_name)
                        extracted_data[extractor_name]['data'].append(extract)
        counter += 1
        pbar.update(counter)
    pbar.finish()

    # create one flat csv file for each extractor used
    output_base_name, output_ext = os.path.splitext(output_name)
    output_files = []
    for extractor_name, data in extracted_data.items():
        output_path = os.path.join(output_dir, '{0}_{1}.csv'.format(extractor_name, output_base_name))
        output_files.append(output_path)
        flat_extract = flatten_data(data)
        if order:
            flat_extract = order_columns(flat_extract, front=['choice'])
        flat_extract.to_csv(output_path, index=False, encoding='utf-8')
    return output_files
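# A minimal usage sketch for the extract_csv function above. The file names and
# the expectation that get_file_instance accepts plain file paths are
# assumptions for illustration only; substitute the classification export and
# extractor config produced for your own workflow.
if __name__ == '__main__':
    output_files = extract_csv(
        'my-project-classifications.csv',        # hypothetical Panoptes classification export
        'extractor_config_workflow_123.yaml',    # hypothetical extractor config for the workflow
        output_dir='.',
        output_name='extractions',
        order=True,
        verbose=True
    )
    # one flat CSV is written per configured extractor,
    # e.g. ./question_extractor_extractions.csv
    print(output_files)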
def reduce_csv(
    extracted_csv,
    reducer_config,
    filter='first',
    output_name='reductions',
    output_dir=CURRENT_PATH,
    order=False,
    stream=False,
    cpu_count=1
):
    extracted_csv = get_file_instance(extracted_csv)
    with extracted_csv as extracted_csv_in:
        extracted = pandas.read_csv(
            extracted_csv_in,
            infer_datetime_format=True,
            parse_dates=['created_at'],
            encoding='utf-8'
        )

    # sort so that rows for a subject are contiguous and ordered by time
    extracted.sort_values(['subject_id', 'created_at'], inplace=True)
    resume = False
    subjects = extracted.subject_id.unique()
    tasks = extracted.task.unique()
    workflow_id = extracted.workflow_id.iloc[0]

    # the config file must contain exactly one reducer
    reducer_config = get_file_instance(reducer_config)
    with reducer_config as config:
        config_yaml = yaml.load(config, Loader=yaml.SafeLoader)
    assert (len(config_yaml['reducer_config']) == 1), 'There must be only one reducer in the config file.'
    for key, value in config_yaml['reducer_config'].items():
        reducer_name = key
        keywords = value
    assert (reducer_name in reducers.reducers), 'The reducer in the config file does not exist.'

    output_base_name, _ = os.path.splitext(output_name)
    output_path = os.path.join(output_dir, '{0}_{1}.csv'.format(reducer_name, output_base_name))

    if stream:
        # when streaming, skip subjects already written by a previous run
        if os.path.isfile(output_path):
            print('resuming from last run')
            resume = True
            with open(output_path, 'r', encoding='utf-8') as reduced_file:
                reduced_csv = pandas.read_csv(reduced_file, encoding='utf-8')
            subjects = np.setdiff1d(subjects, reduced_csv.subject_id)

    reduced_data = []
    sdx = 0

    apply_keywords = {
        'reducer_name': reducer_name,
        'workflow_id': workflow_id,
        'filter': filter,
        'keywords': keywords
    }
    widgets = [
        'Reducing: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    number_of_rows = len(subjects) * len(tasks)
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=number_of_rows)

    def callback(reduced_data_list):
        # collect (and optionally stream to disk) the reduced rows as workers finish
        nonlocal reduced_data
        nonlocal sdx
        nonlocal pbar
        nonlocal stream
        reduced_data += reduced_data_list
        if stream:
            if (sdx == 0) and (not resume):
                pandas.DataFrame(reduced_data).to_csv(
                    output_path,
                    mode='w',
                    index=False,
                    encoding='utf-8'
                )
            else:
                pandas.DataFrame(reduced_data).to_csv(
                    output_path,
                    mode='a',
                    index=False,
                    header=False,
                    encoding='utf-8'
                )
            reduced_data.clear()
        sdx += 1
        pbar.update(sdx)

    pbar.start()
    if cpu_count > 1:
        pool = Pool(cpu_count)
    for subject in subjects:
        idx = extracted.subject_id == subject
        for task in tasks:
            jdx = extracted.task == task
            classifications = extracted[idx & jdx]
            if cpu_count > 1:
                pool.apply_async(
                    reduce_subject,
                    args=(
                        subject,
                        classifications,
                        task
                    ),
                    kwds=apply_keywords,
                    callback=callback
                )
            else:
                reduced_data_list = reduce_subject(
                    subject,
                    classifications,
                    task,
                    **apply_keywords
                )
                callback(reduced_data_list)
    if cpu_count > 1:
        pool.close()
        pool.join()
    pbar.finish()

    if stream:
        reduced_csv = pandas.read_csv(output_path, encoding='utf-8')
        if 'data' in reduced_csv:
            def eval_func(a):
                # pandas uses a local namespace, make sure it has the correct imports
                from collections import OrderedDict  # noqa
                from numpy import nan  # noqa
                return eval(a)
            reduced_csv.data = reduced_csv.data.apply(eval_func)
            flat_reduced_data = flatten_data(reduced_csv)
        else:
            return output_path
    else:
        non_flat_data = pandas.DataFrame(reduced_data)
        flat_reduced_data = flatten_data(non_flat_data)
    if order:
        flat_reduced_data = order_columns(flat_reduced_data, front=['choice', 'total_vote_count', 'choice_count'])
    flat_reduced_data.to_csv(output_path, index=False, encoding='utf-8')
    return output_path
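# reduce_subject() is called above (directly and via Pool.apply_async) but is not
# defined in this section. The sketch below is an assumption about its shape,
# modelled on the inline per-subject logic in the single-process reduce_csv at the
# end of this file; it relies on the same module-level names (FILTER_TYPES,
# unflatten_data, reducers) and returns the list of row dicts the callback expects.
def reduce_subject(subject, classifications, task, reducer_name=None, workflow_id=None, filter='first', keywords=None):
    keywords = keywords or {}
    reduced_data_list = []
    classifications = classifications.drop_duplicates()
    # optionally keep only one classification per user (e.g. their first or last)
    if filter in FILTER_TYPES:
        classifications = classifications.groupby(['user_name'], group_keys=False).apply(FILTER_TYPES[filter])
    data = [unflatten_data(c) for _, c in classifications.iterrows()]
    reduction = reducers.reducers[reducer_name](data, **keywords)
    if not isinstance(reduction, list):
        reduction = [reduction]
    for r in reduction:
        reduced_data_list.append({
            'subject_id': subject,
            'workflow_id': workflow_id,
            'task': task,
            'reducer': reducer_name,
            'data': r
        })
    return reduced_data_list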
def test_order_columns(self):
    '''Test order columns'''
    result = csv_utils.order_columns(unordered_data, front=['choice'])
    assert_frame_equal(result, ordered_data)
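# The unordered_data / ordered_data fixtures used by this test are defined
# elsewhere; a minimal sketch of what they could look like, assuming
# order_columns only needs to move the 'choice' column to the front while
# leaving the remaining columns in place:
import pandas
unordered_data = pandas.DataFrame({
    'answer': [1, 2],
    'choice': ['cat', 'dog']
})
ordered_data = pandas.DataFrame({
    'choice': ['cat', 'dog'],
    'answer': [1, 2]
})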
def extract_csv(
    classification_csv,
    config,
    output_dir=CURRENT_PATH,
    output_name='extractions',
    order=False,
    verbose=False,
    cpu_count=1
):
    # load the extractor configuration (workflow ID, version, and extractor keywords)
    config = get_file_instance(config)
    with config as config_in:
        config_yaml = yaml.load(config_in, Loader=yaml.SafeLoader)

    extractor_config = config_yaml['extractor_config']
    workflow_id = config_yaml['workflow_id']
    version = config_yaml['workflow_version']
    number_of_extractors = sum([len(value) for key, value in extractor_config.items()])

    extracted_data = defaultdict(list)

    classification_csv = get_file_instance(classification_csv)
    with classification_csv as classification_csv_in:
        classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})

    # only keep classifications for the configured workflow and version
    wdx = classifications.workflow_id == workflow_id
    assert (wdx.sum() > 0), 'There are no classifications matching the configured workflow ID'
    if '.' in version:
        vdx = classifications.workflow_version == version
    else:
        vdx = classifications.workflow_version.apply(get_major_version) == version
    assert (vdx.sum() > 0), 'There are no classifications matching the configured version number'
    assert ((vdx & wdx).sum() > 0), 'There are no classifications matching the combined workflow ID and version number'

    widgets = [
        'Extracting: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    max_pbar = (wdx & vdx).sum() * number_of_extractors
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=max_pbar)
    counter = 0

    def callback(name_with_row):
        # collect the extracted rows as workers finish and advance the progress bar
        nonlocal extracted_data
        nonlocal counter
        nonlocal pbar
        extractor_name, new_extract_row = name_with_row
        if new_extract_row is not None:
            extracted_data[extractor_name] += new_extract_row
        counter += 1
        pbar.update(counter)

    pbar.start()
    if cpu_count > 1:
        pool = Pool(cpu_count)
    for _, classification in classifications[wdx & vdx].iterrows():
        classification_by_task = annotation_by_task({
            'annotations': json.loads(classification.annotations),
            'metadata': json.loads(classification.metadata)
        })
        classification_info = {
            'classification_id': classification.classification_id,
            'user_name': classification.user_name,
            'user_id': classification.user_id,
            'workflow_id': classification.workflow_id,
            'created_at': classification.created_at,
            'subject_ids': classification.subject_ids
        }
        for extractor_name, keywords in extractor_config.items():
            extractor_key = extractor_name
            if 'shape_extractor' in extractor_name:
                extractor_key = 'shape_extractor'
            for keyword in keywords:
                if extractor_key in extractors.extractors:
                    if cpu_count > 1:
                        pool.apply_async(
                            extract_classification,
                            args=(
                                copy.deepcopy(classification_by_task),
                                classification_info,
                                extractor_key,
                                extractor_name,
                                keyword,
                                verbose
                            ),
                            callback=callback
                        )
                    else:
                        name_with_row = extract_classification(
                            copy.deepcopy(classification_by_task),
                            classification_info,
                            extractor_key,
                            extractor_name,
                            keyword,
                            verbose
                        )
                        callback(name_with_row)
                else:
                    callback((None, None))
    if cpu_count > 1:
        pool.close()
        pool.join()
    pbar.finish()

    # create one flat csv file for each extractor used
    output_base_name, _ = os.path.splitext(output_name)
    output_files = []
    for extractor_name, data in extracted_data.items():
        output_path = os.path.join(output_dir, '{0}_{1}.csv'.format(extractor_name, output_base_name))
        output_files.append(output_path)
        non_flat_extract = pandas.DataFrame(data)
        flat_extract = flatten_data(non_flat_extract)
        if order:
            flat_extract = order_columns(flat_extract, front=['choice'])
        flat_extract.to_csv(output_path, index=False, encoding='utf-8')
    return output_files
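# extract_classification() is referenced above but not defined in this section.
# A sketch of a compatible worker, modelled on the inline extraction logic in the
# older extract_csv at the top of this file; the (extractor_name, rows-or-None)
# return shape is what the callback above expects. Treat this as an assumption,
# not the library's actual implementation.
def extract_classification(classification_by_task, classification_info, extractor_key, extractor_name, keyword, verbose):
    try:
        extract = extractors.extractors[extractor_key](classification_by_task, **keyword)
    except Exception:
        if verbose:
            print()
            print('Incorrectly formatted annotation')
            print(classification_info)
            print(extractor_key)
            print(classification_by_task)
            print(keyword)
        return extractor_name, None
    if not isinstance(extract, list):
        extract = [extract]
    new_extract_rows = []
    for e in extract:
        new_extract_rows.append({
            'classification_id': classification_info['classification_id'],
            'user_name': classification_info['user_name'],
            'user_id': classification_info['user_id'],
            'workflow_id': classification_info['workflow_id'],
            'task': keyword['task'],
            'created_at': classification_info['created_at'],
            'subject_id': classification_info['subject_ids'],
            'extractor': extractor_name,
            'data': e
        })
    return extractor_name, new_extract_rows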
def reduce_csv(extracted_csv, reducer_config, filter='first', output_name='reductions', output_dir=os.path.abspath('.'), order=False, stream=False):
    extracted_csv = get_file_instance(extracted_csv)
    with extracted_csv as extracted_csv_in:
        extracted = pandas.read_csv(extracted_csv_in, infer_datetime_format=True, parse_dates=['created_at'], encoding='utf-8')

    # sort so that rows for a subject are contiguous and ordered by time
    extracted.sort_values(['subject_id', 'created_at'], inplace=True)
    resume = False
    subjects = extracted.subject_id.unique()
    tasks = extracted.task.unique()
    workflow_id = extracted.workflow_id.iloc[0]

    # the config file must contain exactly one reducer
    reducer_config = get_file_instance(reducer_config)
    with reducer_config as config:
        config_yaml = yaml.load(config, Loader=yaml.SafeLoader)
    assert (len(config_yaml['reducer_config']) == 1), 'There must be only one reducer in the config file.'
    for key, value in config_yaml['reducer_config'].items():
        reducer_name = key
        keywords = value
    assert (reducer_name in reducers.reducers), 'The reducer in the config file does not exist.'

    output_base_name, output_ext = os.path.splitext(output_name)
    output_path = os.path.join(output_dir, '{0}_{1}.csv'.format(reducer_name, output_base_name))

    if stream:
        # when streaming, skip subjects already written by a previous run
        if os.path.isfile(output_path):
            print('resuming from last run')
            resume = True
            with open(output_path, 'r', encoding='utf-8') as reduced_file:
                reduced_csv = pandas.read_csv(reduced_file, encoding='utf-8')
            subjects = np.setdiff1d(subjects, reduced_csv.subject_id)

    blank_reduced_data = OrderedDict([
        ('subject_id', []),
        ('workflow_id', []),
        ('task', []),
        ('reducer', []),
        ('data', [])
    ])
    reduced_data = copy.deepcopy(blank_reduced_data)

    widgets = [
        'Reducing: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=len(subjects))
    pbar.start()
    for sdx, subject in enumerate(subjects):
        idx = extracted.subject_id == subject
        for task in tasks:
            jdx = extracted.task == task
            classifications = extracted[idx & jdx]
            classifications = classifications.drop_duplicates()
            # optionally keep only one classification per user (e.g. their first or last)
            if filter in FILTER_TYPES:
                classifications = classifications.groupby(['user_name'], group_keys=False).apply(FILTER_TYPES[filter])
            data = [unflatten_data(c) for cdx, c in classifications.iterrows()]
            reduction = reducers.reducers[reducer_name](data, **keywords)
            if isinstance(reduction, list):
                for r in reduction:
                    reduced_data['subject_id'].append(subject)
                    reduced_data['workflow_id'].append(workflow_id)
                    reduced_data['task'].append(task)
                    reduced_data['reducer'].append(reducer_name)
                    reduced_data['data'].append(r)
            else:
                reduced_data['subject_id'].append(subject)
                reduced_data['workflow_id'].append(workflow_id)
                reduced_data['task'].append(task)
                reduced_data['reducer'].append(reducer_name)
                reduced_data['data'].append(reduction)
        if stream:
            # write this subject's rows to disk and reset the in-memory buffer
            if (sdx == 0) and (not resume):
                pandas.DataFrame(reduced_data).to_csv(output_path, mode='w', index=False, encoding='utf-8')
            else:
                pandas.DataFrame(reduced_data).to_csv(output_path, mode='a', index=False, header=False, encoding='utf-8')
            reduced_data = copy.deepcopy(blank_reduced_data)
        pbar.update(sdx + 1)
    pbar.finish()

    if stream:
        reduced_csv = pandas.read_csv(output_path, encoding='utf-8')
        if 'data' in reduced_csv:
            reduced_csv.data = reduced_csv.data.apply(eval)
            flat_reduced_data = flatten_data(reduced_csv)
        else:
            return output_path
    else:
        flat_reduced_data = flatten_data(reduced_data)
    if order:
        flat_reduced_data = order_columns(flat_reduced_data, front=['choice', 'total_vote_count', 'choice_count'])
    flat_reduced_data.to_csv(output_path, index=False, encoding='utf-8')
    return output_path
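# FILTER_TYPES is used above but not defined in this section. A plausible sketch,
# assuming the supported values are 'first' and 'last' and that each entry keeps a
# single classification per user_name; the real mapping may differ. Because the
# extracts are sorted by created_at before grouping, iloc[[0]] is the earliest
# classification from each user and iloc[[-1]] the latest.
FILTER_TYPES = {
    'first': lambda group: group.iloc[[0]],   # keep each user's earliest classification
    'last': lambda group: group.iloc[[-1]],   # keep each user's latest classification
}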