Example #1
def test_request(self):
    '''Test the online extract function'''
    request_kwargs = {
        'data': json.dumps(annotation_by_task(classification)),
        'content_type': 'application/json'
    }
    app = flask.Flask(__name__)
    append_version(expected)
    if len(kwargs) > 0:
        url_params = '?{0}'.format(urllib.parse.urlencode(kwargs))
    else:
        url_params = ''
    with app.test_request_context(url_params, **request_kwargs):
        result = function(flask.request)
        self.assertTestType(result, expected)
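
For reference, Flask's test_request_context builds a request object without spinning up a server, which is what makes this "online" test cheap to run. A minimal self-contained sketch of the same pattern (the echo view and payload are illustrative, not part of the suite above):

import json
import flask

app = flask.Flask(__name__)

def echo(request):
    # Illustrative stand-in for the extractor's `function(flask.request)`:
    # parse and return the posted JSON body.
    return request.get_json()

with app.test_request_context(
    '/?task=T0',                    # URL, including any query parameters
    data=json.dumps({'value': 1}),  # request body
    content_type='application/json'
):
    assert echo(flask.request) == {'value': 1}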
Example #2
def test_extract(self):
    '''Test the offline extract function'''
    result = function(annotation_by_task(classification), **kwargs)
    append_version(expected)
    self.assertTextExtractor(result, expected)
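
Both the online and offline tests first funnel the raw classification through annotation_by_task. Its real implementation lives in the library; a hypothetical minimal version, assuming it simply groups the flat annotation list by task label, would look like:

def annotation_by_task(classification):
    # Hypothetical sketch: index annotations by their task key so an
    # extractor can pick out the task it was configured for.
    by_task = {}
    for annotation in classification['annotations']:
        by_task.setdefault(annotation['task'], []).append(annotation)
    return by_task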
Example #3
def test_bad_keyword(self):
    '''Test error is raised if a bad keyword is used for dot_freq'''
    with self.assertRaises(ValueError):
        function(annotation_by_task(classification),
                 dot_freq='bad_keyword')
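
The behavior under test is plain keyword validation. A sketch of how an extractor might guard dot_freq (the accepted values here are assumptions for illustration):

def validate_dot_freq(dot_freq='line'):
    allowed = {'line', 'word'}  # assumed accepted values
    if dot_freq not in allowed:
        # Fail fast with a clear message instead of extracting garbage.
        raise ValueError('{0} is not a valid dot_freq value'.format(dot_freq))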
Example #4
def extract_csv(classification_csv,
                config,
                output_dir=os.path.abspath('.'),
                output_name='extractions',
                order=False,
                verbose=False):
    config = get_file_instance(config)
    with config as config_in:
        config_yaml = yaml.load(config_in, Loader=yaml.SafeLoader)

    extractor_config = config_yaml['extractor_config']
    workflow_id = config_yaml['workflow_id']
    version = config_yaml['workflow_version']

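    # One empty column per output field; deep-copied below so every
    # extractor accumulates its own rows.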
    blank_extracted_data = OrderedDict([('classification_id', []),
                                        ('user_name', []), ('user_id', []),
                                        ('workflow_id', []), ('task', []),
                                        ('created_at', []), ('subject_id', []),
                                        ('extractor', []), ('data', [])])

    extracted_data = {}

    classification_csv = get_file_instance(classification_csv)
    with classification_csv as classification_csv_in:
        classifications = pandas.read_csv(classification_csv_in,
                                          encoding='utf-8',
                                          dtype={'workflow_version': str})

    wdx = classifications.workflow_id == workflow_id
    assert (
        wdx.sum() >
        0), 'There are no classifications matching the configured workflow ID'
    if '.' in version:
        vdx = classifications.workflow_version == version
    else:
        vdx = classifications.workflow_version.apply(
            get_major_version) == version

    assert (
        vdx.sum() > 0
    ), 'There are no classifications matching the configured version number'
    assert (
        (vdx & wdx).sum() > 0
    ), 'There are no classifications matching the combined workflow ID and version number'

    widgets = [
        'Extracting: ',
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets,
                                   max_value=(wdx & vdx).sum())
    counter = 0
    pbar.start()
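    # Process only the classifications matching both the workflow ID and version.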
    for cdx, classification in classifications[wdx & vdx].iterrows():
        classification_by_task = annotation_by_task(
            {'annotations': json.loads(classification.annotations)})
        for extractor_name, keywords in extractor_config.items():
            extractor_key = extractor_name
            if 'shape_extractor' in extractor_name:
                extractor_key = 'shape_extractor'
            for keyword in keywords:
                if extractor_key in extractors.extractors:
                    try:
                        extract = extractors.extractors[extractor_key](
                            copy.deepcopy(classification_by_task), **keyword)
                    except Exception:
                        # Skip classifications the extractor cannot parse.
                        if verbose:
                            print()
                            print('Incorrectly formatted annotation')
                            print(classification)
                            print(extractor_key)
                            print(classification_by_task)
                            print(keyword)
                        continue
                    # A single extract and a list of extracts need the same
                    # bookkeeping: normalize to a list and share one loop.
                    if not isinstance(extract, list):
                        extract = [extract]
                    for e in extract:
                        extracted_data.setdefault(
                            extractor_name,
                            copy.deepcopy(blank_extracted_data))
                        extracted_data[extractor_name]['classification_id'].append(
                            classification.classification_id)
                        extracted_data[extractor_name]['user_name'].append(
                            classification.user_name)
                        extracted_data[extractor_name]['user_id'].append(
                            classification.user_id)
                        extracted_data[extractor_name]['workflow_id'].append(
                            classification.workflow_id)
                        extracted_data[extractor_name]['task'].append(
                            keyword['task'])
                        extracted_data[extractor_name]['created_at'].append(
                            classification.created_at)
                        extracted_data[extractor_name]['subject_id'].append(
                            classification.subject_ids)
                        extracted_data[extractor_name]['extractor'].append(
                            extractor_name)
                        extracted_data[extractor_name]['data'].append(e)
        counter += 1
        pbar.update(counter)
    pbar.finish()

    # create one flat csv file for each extractor used
    output_base_name, output_ext = os.path.splitext(output_name)
    output_files = []
    for extractor_name, data in extracted_data.items():
        output_path = os.path.join(
            output_dir, '{0}_{1}.csv'.format(extractor_name, output_base_name))
        output_files.append(output_path)
        flat_extract = flatten_data(data)
        if order:
            flat_extract = order_columns(flat_extract, front=['choice'])
        flat_extract.to_csv(output_path, index=False, encoding='utf-8')
    return output_files
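
A hypothetical invocation, with a config YAML holding the three keys the function reads (extractor_config, workflow_id, workflow_version); the file names are illustrative:

# config.yaml (illustrative):
# workflow_id: 1234
# workflow_version: '5.10'
# extractor_config:
#     question_extractor:
#         - task: T0

output_files = extract_csv(
    'classifications-export.csv',  # hypothetical Panoptes export
    'config.yaml',
    verbose=True
)
# one CSV per extractor used, e.g. [...]/question_extractor_extractions.csv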
Example #5
def extract_csv(
    classification_csv,
    config,
    output_dir=CURRENT_PATH,
    output_name='extractions',
    order=False,
    verbose=False,
    cpu_count=1
):
    config = get_file_instance(config)
    with config as config_in:
        config_yaml = yaml.load(config_in, Loader=yaml.SafeLoader)

    extractor_config = config_yaml['extractor_config']
    workflow_id = config_yaml['workflow_id']
    version = config_yaml['workflow_version']
    number_of_extractors = sum(len(value) for value in extractor_config.values())

    extracted_data = defaultdict(list)

    classification_csv = get_file_instance(classification_csv)
    with classification_csv as classification_csv_in:
        classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})

    wdx = classifications.workflow_id == workflow_id
    assert (wdx.sum() > 0), 'There are no classifications matching the configured workflow ID'
    if '.' in version:
        vdx = classifications.workflow_version == version
    else:
        vdx = classifications.workflow_version.apply(get_major_version) == version

    assert (vdx.sum() > 0), 'There are no classifications matching the configured version number'
    assert ((vdx & wdx).sum() > 0), 'There are no classifications matching the combined workflow ID and version number'

    widgets = [
        'Extracting: ',
        progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA()
    ]
    max_pbar = (wdx & vdx).sum() * number_of_extractors
    pbar = progressbar.ProgressBar(widgets=widgets, max_value=max_pbar)
    counter = 0

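    # Runs in the parent process as each extraction finishes: collect the
    # extractor's rows and advance the progress bar.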
    def callback(name_with_row):
        nonlocal extracted_data
        nonlocal counter
        nonlocal pbar
        extractor_name, new_extract_row = name_with_row
        if new_extract_row is not None:
            extracted_data[extractor_name] += new_extract_row
        counter += 1
        pbar.update(counter)

    pbar.start()
    if cpu_count > 1:
        pool = Pool(cpu_count)
    for _, classification in classifications[wdx & vdx].iterrows():
        classification_by_task = annotation_by_task({
            'annotations': json.loads(classification.annotations),
            'metadata': json.loads(classification.metadata)
        })
        classification_info = {
            'classification_id': classification.classification_id,
            'user_name': classification.user_name,
            'user_id': classification.user_id,
            'workflow_id': classification.workflow_id,
            'created_at': classification.created_at,
            'subject_ids': classification.subject_ids
        }
        for extractor_name, keywords in extractor_config.items():
            extractor_key = extractor_name
            if 'shape_extractor' in extractor_name:
                extractor_key = 'shape_extractor'
            for keyword in keywords:
                if extractor_key in extractors.extractors:
                    if cpu_count > 1:
                        pool.apply_async(
                            extract_classification,
                            args=(
                                copy.deepcopy(classification_by_task),
                                classification_info,
                                extractor_key,
                                extractor_name,
                                keyword,
                                verbose
                            ),
                            callback=callback
                        )
                    else:
                        name_with_row = extract_classification(
                            copy.deepcopy(classification_by_task),
                            classification_info,
                            extractor_key,
                            extractor_name,
                            keyword,
                            verbose
                        )
                        callback(name_with_row)
                else:
                    callback((None, None))
    if cpu_count > 1:
        pool.close()
        pool.join()
    pbar.finish()

    # create one flat csv file for each extractor used
    output_base_name, _ = os.path.splitext(output_name)
    output_files = []
    for extractor_name, data in extracted_data.items():
        output_path = os.path.join(output_dir, '{0}_{1}.csv'.format(extractor_name, output_base_name))
        output_files.append(output_path)
        non_flat_extract = pandas.DataFrame(data)
        flat_extract = flatten_data(non_flat_extract)
        if order:
            flat_extract = order_columns(flat_extract, front=['choice'])
        flat_extract.to_csv(output_path, index=False, encoding='utf-8')
    return output_files
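
The parallel path is the standard multiprocessing apply_async-with-callback pattern: worker processes run extract_classification, and a single callback in the parent process accumulates the rows. A stripped-down, self-contained sketch of that pattern (the square worker is illustrative):

from multiprocessing import Pool

def square(x):
    # Work done in a worker process.
    return x * x

if __name__ == '__main__':
    results = []
    pool = Pool(2)
    for x in range(10):
        # The callback runs in the parent process, so plain list.append
        # is safe for collecting results.
        pool.apply_async(square, args=(x,), callback=results.append)
    pool.close()
    pool.join()
    print(sorted(results))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]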
Example #6
def test_bad_keyword(self):
    '''Test error is raised if a bad keyword is used for shape'''
    with self.assertRaises(KeyError):
        extractors.shape_extractor(annotation_by_task(classification),
                                   shape='bad_shape')
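
A likely reason this raises KeyError rather than ValueError: the shape name is used directly as a key into a registry of supported shapes. A sketch under that assumption (the shapes and their parameter lists are illustrative):

SHAPES = {
    'circle': ['x', 'y', 'r'],
    'column': ['x', 'width'],
}

def shape_params(shape):
    # An unknown shape name fails the dict lookup with KeyError.
    return SHAPES[shape]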
Example #7
def test_missing_frame(self):
    '''Test extraction when the annotation's frame is missing'''
    extractors.shape_extractor(
        annotation_by_task(data_subject_classification), shape='column')