def test_prune_by_work_count(self):
    """Tests pruning results by the number of works each n-gram occurs
    in, exercising minimum-only, maximum-only, and combined bounds."""
    input_data = (['AB', '2', 'a', 'base', '4', 'A'],
                  ['AB', '2', 'b', 'base', '7', 'A'],
                  ['AB', '2', 'c', 'base', '1', 'B'],
                  ['AB', '2', 'd', 'base', '3', 'B'],
                  ['ABC', '3', 'a', 'base', '3', 'A'],
                  ['ABC', '3', 'b', 'base', '5', 'A'],
                  ['ABC', '3', 'c', 'base', '1', 'B'],
                  ['BA', '2', 'a', 'base', '6', 'A'],
                  ['B', '1', 'a', 'base', '5', 'A'],
                  ['B', '1', 'b', 'base', '3', 'A'],
                  ['B', '1', 'b', 'wit', '3', 'A'],
                  ['B', '1', 'c', 'base', '0', 'B'])
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    # Minimum bound only.
    results.prune_by_work_count(minimum=3)
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('AB', '2', 'a', 'base', '4', 'A'),
        ('AB', '2', 'b', 'base', '7', 'A'),
        ('AB', '2', 'c', 'base', '1', 'B'),
        ('AB', '2', 'd', 'base', '3', 'B'),
        ('ABC', '3', 'a', 'base', '3', 'A'),
        ('ABC', '3', 'b', 'base', '5', 'A'),
        ('ABC', '3', 'c', 'base', '1', 'B')
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
    # Maximum bound only; rewind the file handle to reuse the input.
    fh.seek(0)
    results = tacl.Results(fh, self._tokenizer)
    results.prune_by_work_count(maximum=3)
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABC', '3', 'a', 'base', '3', 'A'),
        ('ABC', '3', 'b', 'base', '5', 'A'),
        ('ABC', '3', 'c', 'base', '1', 'B'),
        ('BA', '2', 'a', 'base', '6', 'A'),
        ('B', '1', 'a', 'base', '5', 'A'),
        ('B', '1', 'b', 'base', '3', 'A'),
        ('B', '1', 'b', 'wit', '3', 'A'),
        ('B', '1', 'c', 'base', '0', 'B'),
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
    # Both bounds together.
    fh.seek(0)
    results = tacl.Results(fh, self._tokenizer)
    results.prune_by_work_count(minimum=2, maximum=3)
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABC', '3', 'a', 'base', '3', 'A'),
        ('ABC', '3', 'b', 'base', '5', 'A'),
        ('ABC', '3', 'c', 'base', '1', 'B'),
        ('B', '1', 'a', 'base', '5', 'A'),
        ('B', '1', 'b', 'base', '3', 'A'),
        ('B', '1', 'b', 'wit', '3', 'A'),
        ('B', '1', 'c', 'base', '0', 'B')
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
def test_sort(self):
    """Tests that sort orders results by size (descending), then
    n-gram, count (descending), label, work, and siglum."""
    input_data = (['AB', '2', 'a', 'base', '4', 'A'],
                  ['AB', '2', 'a', 'wit', '3', 'A'],
                  ['ABC', '3', 'a', 'base', '2', 'A'],
                  ['ABD', '3', 'a', 'base', '1', 'B'],
                  ['ABCD', '4', 'a', 'base', '2', 'B'],
                  ['AB', '2', 'b', 'base', '2', 'AB'],
                  ['AB', '2', 'b', 'a', '2', 'AB'],
                  ['ABC', '3', 'b', 'base', '2', 'AB'],
                  ['ABC', '3', 'c', 'base', '3', 'A'])
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    results.sort()
    # NOTE(review): the exact sort-key precedence is inferred from this
    # expected ordering; confirm against Results.sort itself.
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABCD', '4', 'a', 'base', '2', 'B'),
        ('ABC', '3', 'c', 'base', '3', 'A'),
        ('ABC', '3', 'a', 'base', '2', 'A'),
        ('ABC', '3', 'b', 'base', '2', 'AB'),
        ('ABD', '3', 'a', 'base', '1', 'B'),
        ('AB', '2', 'a', 'base', '4', 'A'),
        ('AB', '2', 'a', 'wit', '3', 'A'),
        ('AB', '2', 'b', 'a', '2', 'AB'),
        ('AB', '2', 'b', 'base', '2', 'AB')
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
def _test_empty_results(self, cmd, fieldnames, *args, **kwargs):
    """Asserts that running the Results method named `cmd` on empty
    results produces only the header row `fieldnames`.

    :param cmd: name of the `tacl.Results` method to run
    :type cmd: `str`
    :param fieldnames: expected header row of the output
    :param args: positional arguments passed through to the method
    :param kwargs: keyword arguments passed through to the method
    """
    empty_fh = self._create_csv([])
    empty_results = tacl.Results(empty_fh, self._tokenizer)
    operation = getattr(empty_results, cmd)
    operation(*args, **kwargs)
    self.assertEqual(self._get_rows_from_results(empty_results),
                     [fieldnames])
def test_group_by_witness(self):
    """Tests grouping results by witness, with each output row listing
    the n-grams found in that witness, their number, and the total of
    their counts."""
    input_results = (
        ['AB', '2', 'T1', 'wit1', '4', 'A'],
        ['AB', '2', 'T1', 'wit2', '3', 'A'],
        ['AB', '2', 'T2', 'wit1', '2', 'A'],
        ['ABC', '3', 'T1', 'wit1', '2', 'A'],
        ['ABC', '3', 'T1', 'wit2', '0', 'A'],
        ['AB', '2', 'T3', 'wit1', '2', 'B'],
        ['BC', '2', 'T1', 'wit1', '3', 'A'],
    )
    fh = self._create_csv(input_results)
    results = tacl.Results(fh, self._tokenizer)
    results.group_by_witness()
    fieldnames = (tacl.constants.WORK_FIELDNAME,
                  tacl.constants.SIGLUM_FIELDNAME,
                  tacl.constants.LABEL_FIELDNAME,
                  tacl.constants.NGRAMS_FIELDNAME,
                  tacl.constants.NUMBER_FIELDNAME,
                  tacl.constants.TOTAL_COUNT_FIELDNAME)
    expected_rows = [
        fieldnames,
        ('T1', 'wit1', 'A', 'AB, ABC, BC', '3', '9'),
        ('T1', 'wit2', 'A', 'AB', '1', '3'),
        ('T2', 'wit1', 'A', 'AB', '1', '2'),
        ('T3', 'wit1', 'B', 'AB', '1', '2'),
    ]
    actual_rows = self._get_rows_from_results(results)
    # Compared as sets, so row order is not asserted.
    self.assertEqual(set(actual_rows), set(expected_rows))
def test_is_intersect_results(self):
    """Tests that _is_intersect_results correctly identifies diff and
    intersect results."""
    # Intersect-style data: a single n-gram appearing under several
    # labels.
    intersect_results = (['AB', '2', 'a', 'base', '7', 'A'],
                         ['AB', '2', 'b', 'base', '2', 'B'],
                         ['AB', '2', 'c', 'base', '5', 'C'])
    fh = self._create_csv(intersect_results)
    results = tacl.Results(fh, self._tokenizer)
    self.assertTrue(results._is_intersect_results(results._matches))
    # Diff-style data: each n-gram confined to one label.
    diff_results = (['AB', '2', 'a', 'base', '7', 'A'],
                    ['AB', '2', 'a', 'other', '1', 'A'],
                    ['AB', '2', 'b', 'base', '5', 'A'],
                    ['BA', '2', 'c', 'base', '2', 'B'])
    fh = self._create_csv(diff_results)
    results = tacl.Results(fh, self._tokenizer)
    self.assertFalse(results._is_intersect_results(results._matches))
def test_collapse_witnesses(self):
    """Tests that witnesses of a work sharing the same count for an
    n-gram are collapsed into a single row listing their sigla."""
    input_data = (
        ['AB', '2', 'a', 'base', '4', 'A'],
        ['AB', '2', 'a', 'wit 1', '4', 'A'],
        ['AB', '2', 'a', 'wit 2', '3', 'A'],
        ['AB', '2', 'b', 'base', '4', 'A'],
        ['AB', '2', 'b', 'wit 1', '3', 'A'],
        ['BC', '2', 'a', 'base', '4', 'A'],
        ['BC', '2', 'a', 'wit 1', '3', 'A'],
        ['BC', '2', 'a', 'wit 2', '3', 'A'],
    )
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    results.collapse_witnesses()
    # Collapsed output replaces the siglum column with a sigla column.
    fieldnames = (tacl.constants.NGRAM_FIELDNAME,
                  tacl.constants.SIZE_FIELDNAME,
                  tacl.constants.WORK_FIELDNAME,
                  tacl.constants.SIGLA_FIELDNAME,
                  tacl.constants.COUNT_FIELDNAME,
                  tacl.constants.LABEL_FIELDNAME)
    expected_rows = [
        fieldnames,
        ('AB', '2', 'a', 'base, wit 1', '4', 'A'),
        ('AB', '2', 'a', 'wit 2', '3', 'A'),
        ('AB', '2', 'b', 'base', '4', 'A'),
        ('AB', '2', 'b', 'wit 1', '3', 'A'),
        ('BC', '2', 'a', 'base', '4', 'A'),
        ('BC', '2', 'a', 'wit 1, wit 2', '3', 'A'),
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
def test_group_by_ngram(self):
    """Tests grouping results by n-gram, with per-label rows listing
    each work and its count (or count range across witnesses), ordered
    by the supplied label sequence."""
    input_results = (
        ['AB', '2', 'T1', 'wit1', '4', 'A'],
        ['AB', '2', 'T1', 'wit2', '3', 'A'],
        ['AB', '2', 'T2', 'wit1', '2', 'A'],
        ['ABC', '3', 'T1', 'wit1', '2', 'A'],
        ['ABC', '3', 'T1', 'wit2', '0', 'A'],
        ['AB', '2', 'T3', 'wit1', '2', 'B'],
        ['AB', '2', 'T4', 'wit1', '1', 'B'],
    )
    fh = self._create_csv(input_results)
    results = tacl.Results(fh, self._tokenizer)
    # Labels supplied in the order B, A; output rows follow it.
    results.group_by_ngram(['B', 'A'])
    fieldnames = (tacl.constants.NGRAM_FIELDNAME,
                  tacl.constants.SIZE_FIELDNAME,
                  tacl.constants.LABEL_FIELDNAME,
                  tacl.constants.WORK_COUNTS_FIELDNAME)
    expected_rows = [
        fieldnames,
        ('AB', '2', 'B', 'T3(2), T4(1)'),
        ('AB', '2', 'A', 'T1(3-4), T2(2)'),
        ('ABC', '3', 'A', 'T1(0-2)'),
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
def test_add_label_work_count(self):
    """Tests adding a column giving, for each n-gram, the number of
    works within its label that carry that n-gram with a non-zero
    count."""
    input_data = (['AB', '2', 'a', 'base', '4', 'A'],
                  ['AB', '2', 'a', 'wit1', '2', 'A'],
                  ['AB', '2', 'b', 'base', '1', 'A'],
                  ['AB', '2', 'c', 'base', '2', 'B'],
                  ['BC', '2', 'a', 'base', '0', 'A'],
                  ['BC', '2', 'a', 'wit1', '0', 'A'],
                  ['CD', '2', 'a', 'base', '1', 'A'])
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    results.add_label_work_count()
    fieldnames = tuple(
        list(tacl.constants.QUERY_FIELDNAMES) +
        [tacl.constants.LABEL_WORK_COUNT_FIELDNAME])
    # BC has only zero counts, so its label work count is '0'.
    expected_rows = [
        fieldnames,
        ('AB', '2', 'a', 'base', '4', 'A', '2'),
        ('AB', '2', 'a', 'wit1', '2', 'A', '2'),
        ('AB', '2', 'b', 'base', '1', 'A', '2'),
        ('AB', '2', 'c', 'base', '2', 'B', '1'),
        ('BC', '2', 'a', 'base', '0', 'A', '0'),
        ('BC', '2', 'a', 'wit1', '0', 'A', '0'),
        ('CD', '2', 'a', 'base', '1', 'A', '1')
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
def results(args, parser):
    """Runs the requested operations on a results file and writes the
    modified results CSV to stdout.

    Reads results from `args.results`, or from stdin when that value is
    '-'. Modification and pruning operations run first; format-changing
    operations run last.

    :param args: parsed command-line arguments
    :param parser: argument parser, used to report usage errors
    """
    if args.results == '-':
        results_fh = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                      newline='')
    else:
        results_fh = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    try:
        results = tacl.Results(results_fh, tokenizer)
    finally:
        # Results consumes the handle at construction, so close it here
        # to avoid leaking the file descriptor. Closing the stdin
        # wrapper would close sys.stdin itself, so only close handles
        # this function opened from a path.
        if args.results != '-':
            results_fh.close()
    if args.extend:
        corpus = tacl.Corpus(args.extend, tokenizer)
        results.extend(corpus)
    if args.bifurcated_extend:
        if not args.bifurcated_extend_size:
            parser.error('The bifurcated extend option requires that the '
                         '--max-be-count option also be supplied')
        corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
        results.bifurcated_extend(corpus, args.bifurcated_extend_size)
    if args.reduce:
        results.reduce()
    if args.reciprocal:
        results.reciprocal_remove()
    if args.excise:
        results.excise(args.excise)
    if args.zero_fill:
        corpus = tacl.Corpus(args.zero_fill, tokenizer)
        results.zero_fill(corpus)
    if args.ngrams:
        with open(args.ngrams, encoding='utf-8') as fh:
            ngrams = fh.read().split()
        results.prune_by_ngram(ngrams)
    if args.min_works or args.max_works:
        results.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        results.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        results.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        results.prune_by_ngram_count_per_work(args.min_count_work,
                                              args.max_count_work)
    if args.remove:
        results.remove_label(args.remove)
    if args.sort:
        results.sort()
    # Run format-changing operations last.
    if args.add_label_count:
        results.add_label_count()
    if args.add_label_work_count:
        results.add_label_work_count()
    if args.group_by_ngram:
        catalogue = tacl.Catalogue()
        catalogue.load(args.group_by_ngram)
        results.group_by_ngram(catalogue.ordered_labels)
    if args.group_by_witness:
        results.group_by_witness()
    if args.collapse_witnesses:
        results.collapse_witnesses()
    results.csv(sys.stdout)
def lifetime_report(args, parser):
    """Generates a lifetime report."""
    tokenizer = utils.get_tokenizer(args)
    catalogue = utils.get_catalogue(args)
    results = tacl.Results(args.results, tokenizer)
    output_dir = os.path.abspath(args.output)
    os.makedirs(output_dir, exist_ok=True)
    report = tacl.LifetimeReport()
    report.generate(output_dir, catalogue, results, args.label)
def test_prune_by_ngram_size(self):
    """Tests pruning results by n-gram size, exercising minimum-only,
    maximum-only, and combined bounds."""
    input_data = (['AB', '2', 'a', 'base', '4', 'A'],
                  ['ABC', '3', 'a', 'base', '2', 'A'],
                  ['ABD', '3', 'a', 'wit', '1', 'A'],
                  ['ABCD', '4', 'a', 'base', '2', 'A'],
                  ['AB', '2', 'b', 'base', '2', 'A'],
                  ['ABC', '3', 'b', 'wit', '2', 'A'])
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    # Minimum bound only.
    results.prune_by_ngram_size(minimum=3)
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABC', '3', 'a', 'base', '2', 'A'),
        ('ABD', '3', 'a', 'wit', '1', 'A'),
        ('ABCD', '4', 'a', 'base', '2', 'A'),
        ('ABC', '3', 'b', 'wit', '2', 'A')
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
    # Maximum bound only; rewind the file handle to reuse the input.
    fh.seek(0)
    results = tacl.Results(fh, self._tokenizer)
    results.prune_by_ngram_size(maximum=3)
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('AB', '2', 'a', 'base', '4', 'A'),
        ('ABC', '3', 'a', 'base', '2', 'A'),
        ('ABD', '3', 'a', 'wit', '1', 'A'),
        ('AB', '2', 'b', 'base', '2', 'A'),
        ('ABC', '3', 'b', 'wit', '2', 'A')
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
    # Both bounds together (equal bounds select a single size).
    fh.seek(0)
    results = tacl.Results(fh, self._tokenizer)
    results.prune_by_ngram_size(minimum=3, maximum=3)
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABC', '3', 'a', 'base', '2', 'A'),
        ('ABD', '3', 'a', 'wit', '1', 'A'),
        ('ABC', '3', 'b', 'wit', '2', 'A')
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
def test_extend_no_extension(self):
    """Tests that extend returns the original results when there are no
    extensions to make."""
    results_path = os.path.join(self._data_dir, 'extend-no-extensions.csv')
    corpus_path = os.path.join(self._stripped_dir, 'cbeta')
    corpus = tacl.Corpus(corpus_path, self._tokenizer)
    results = tacl.Results(results_path, self._tokenizer)
    results.extend(corpus)
    self.assertEqual(self._get_rows_from_results(results),
                     self._get_rows_from_file(results_path))
def test_generate(self):
    """Tests that a generated lifetime report matches the expected
    output files on disk."""
    catalogue = tacl.Catalogue()
    catalogue.load(os.path.join(self._data_dir, 'catalogue.txt'))
    tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    results_path = os.path.join(self._data_dir, 'results.csv')
    results = tacl.Results(results_path, tokenizer)
    expected_dir = os.path.join(self._data_dir, 'expected')
    label = 'A'
    with tempfile.TemporaryDirectory() as temp_dir:
        tacl.LifetimeReport().generate(temp_dir, catalogue, results, label)
        self._compare_results_dirs(temp_dir, expected_dir)
def test_bifurcated_extend(self): self.maxDiff = None # This is a test of Results._bifurcated_extend, which does not # require any information other than the results themselves. input_data = ( ['AB', '2', 'a', 'base', '4', 'A', '7'], ['AB', '2', 'a', 'wit1', '5', 'A', '7'], ['AB', '2', 'b', 'base', '2', 'A', '7'], ['AB', '2', 'c', 'base', '4', 'B', '4'], ['ZAB', '3', 'a', 'base', '2', 'A', '3'], ['ZAB', '3', 'a', 'wit1', '2', 'A', '3'], ['ZAB', '3', 'b', 'base', '1', 'A', '3'], ['ABC', '3', 'a', 'base', '4', 'A', '5'], ['ABC', '3', 'a', 'wit1', '4', 'A', '5'], ['ABC', '3', 'b', 'base', '1', 'A', '5'], ['ABC', '3', 'c', 'base', '4', 'B', '4'], ['ZAB', '3', 'c', 'base', '2', 'B', '2'], ['XAB', '3', 'c', 'base', '2', 'B', '2'], ['ZABC', '4', 'a', 'base', '2', 'A', '2'], ['ZABC', '4', 'a', 'wit1', '2', 'A', '2'], ['ZABCD', '5', 'a', 'base', '1', 'A', '1'], ['ZABCD', '5', 'a', 'wit1', '1', 'A', '1'], ['ZABCDE', '6', 'a', 'base', '1', 'A', '1'], ) fieldnames = tuple( list(tacl.constants.QUERY_FIELDNAMES[:]) + [tacl.constants.LABEL_COUNT_FIELDNAME]) fh = self._create_csv(input_data, fieldnames=fieldnames) results = tacl.Results(fh, self._tokenizer) results._bifurcated_extend() expected_rows = [ fieldnames, ('AB', '2', 'a', 'base', '4', 'A', '7'), ('AB', '2', 'a', 'wit1', '5', 'A', '7'), ('AB', '2', 'b', 'base', '2', 'A', '7'), ('ZAB', '3', 'a', 'base', '2', 'A', '3'), ('ABC', '3', 'a', 'base', '4', 'A', '5'), ('ZAB', '3', 'a', 'wit1', '2', 'A', '3'), ('ABC', '3', 'a', 'wit1', '4', 'A', '5'), ('ZAB', '3', 'b', 'base', '1', 'A', '3'), ('ABC', '3', 'b', 'base', '1', 'A', '5'), ('ABC', '3', 'c', 'base', '4', 'B', '4'), ('ZAB', '3', 'c', 'base', '2', 'B', '2'), ('XAB', '3', 'c', 'base', '2', 'B', '2'), ('ZABC', '4', 'a', 'base', '2', 'A', '2'), ('ZABC', '4', 'a', 'wit1', '2', 'A', '2'), ('ZABCD', '5', 'a', 'base', '1', 'A', '1'), ('ZABCD', '5', 'a', 'wit1', '1', 'A', '1'), ] actual_rows = self._get_rows_from_results(results) self.assertEqual(actual_rows, expected_rows)
def test_zero_fill_no_duplicate_index_values(self):
    """Tests that zero_fill leaves no duplicate values in the matches
    index.

    Duplicate index values could raise a "cannot reindex from a
    duplicate axis" ValueError when followed by another operation.
    """
    corpus_path = os.path.join(os.path.dirname(__file__), 'data',
                               'stripped')
    corpus = tacl.Corpus(corpus_path, self._tokenizer)
    results_path = os.path.join(self._data_dir,
                                'non-zero-fill-results.csv')
    results = tacl.Results(results_path, self._tokenizer)
    results.zero_fill(corpus)
    self.assertFalse(
        results._matches.index.has_duplicates,
        'Results._matches DataFrame is left with duplicate index values.')
def test_extend_no_duplicate_index_values(self):
    """Tests that extend leaves no duplicate values in the matches
    index.

    Duplicate index values could raise a "cannot reindex from a
    duplicate axis" ValueError when followed by another operation.
    """
    results_path = os.path.join(self._data_dir,
                                'cbeta-non-extend-results.csv')
    corpus = tacl.Corpus(os.path.join(self._stripped_dir, 'cbeta'),
                         self._tokenizer)
    results = tacl.Results(results_path, self._tokenizer)
    results.extend(corpus)
    self.assertFalse(
        results._matches.index.has_duplicates,
        'Results._matches DataFrame is left with duplicate index values.')
def _test_no_duplicate_index_values(self, cmd, *args, **kwargs):
    """Asserts that running the Results method named `cmd` leaves no
    duplicate values in the matches index.

    Duplicate index values could raise a "cannot reindex from a
    duplicate axis" ValueError when followed by another operation.

    :param cmd: name of the `tacl.Results` method to run
    :type cmd: `str`
    """
    rows = (['AB', '2', 'a', 'base', '4', 'A'],
            ['AB', '2', 'a', 'wit1', '5', 'A'],
            ['AB', '2', 'b', 'base', '3', 'A'],
            ['AB', '2', 'b', 'wit1', '3', 'A'],
            ['AB', '2', 'c', 'base', '2', 'B'],
            ['BC', '2', 'a', 'base', '2', 'A'])
    results = tacl.Results(self._create_csv(rows), self._tokenizer)
    operation = getattr(results, cmd)
    operation(*args, **kwargs)
    self.assertFalse(results._matches.index.has_duplicates)
def test_remove_label_missing_label(self):
    """Test removing a label that doesn't exist in the results."""
    rows = (['AB', '2', 'a', 'base', '4', 'A'],
            ['AB', '2', 'a', 'wit', '3', 'A'],
            ['ABC', '3', 'a', 'base', '2', 'A'])
    results = tacl.Results(self._create_csv(rows), self._tokenizer)
    results.remove_label('C')
    # Nothing carries label 'C', so all input rows survive unchanged.
    expected_rows = [tacl.constants.QUERY_FIELDNAMES]
    expected_rows.extend(tuple(row) for row in rows)
    self.assertEqual(self._get_rows_from_results(results), expected_rows)
def _concatenate_results(self, result_filenames):
    """Returns a `tacl.Results` containing all of the results from the
    files specified in `result_filenames`.

    Rows that duplicate an earlier row in every field except the label
    are dropped.

    :param result_filenames: filenames of results to concatenate
    :type result_filenames: `list` of `str`
    :rtype: `tacl.Results`
    """
    frames = []
    for filename in result_filenames:
        frames.append(
            pd.read_csv(filename, encoding='utf-8', na_filter=False))
    combined = pd.concat(frames, ignore_index=True)
    # Deduplicate on every query field except the label.
    dedupe_fields = [field for field in tacl.constants.QUERY_FIELDNAMES
                     if field != tacl.constants.LABEL_FIELDNAME]
    combined.drop_duplicates(subset=dedupe_fields, inplace=True)
    return tacl.Results(combined, self._tokenizer)
def _get_results(self, catalogue):
    """Returns the extended, reduced intersection results for
    `catalogue`.

    :param catalogue: catalogue to run the intersection over
    :rtype: `tacl.Results`
    """
    results = io.StringIO()
    if self._store is None:
        # No pre-built data store was supplied, so build a temporary
        # in-memory one populated from the corpus.
        store = tacl.DataStore(':memory:', True)
        store.add_ngrams(self._corpus, self._minimum, self._maximum,
                         catalogue)
    else:
        store = self._store
    self._logger.debug('Validating corpus/catalogue')
    store.validate(self._corpus, catalogue)
    self._logger.debug('Running intersection')
    store.intersection(catalogue, results)
    # Drop the local reference to the store before processing the
    # results.
    store = None
    # Rewind the buffer so the CSV output can be read back in.
    results.seek(0)
    self._logger.debug('Generating results')
    results = tacl.Results(results, self._tokenizer)
    self._logger.debug('Extending results')
    results.extend(self._corpus)
    self._logger.debug('Reducing')
    results.reduce()
    return results
def _test_required_columns(self, cols, cmd, *args, **kwargs):
    """Tests that when `cmd` is run with `args` and `kwargs`, it raises
    a `MalformedResultsError` when each of `cols` is not present in the
    results.

    Further tests that that exception is not raised when other columns
    are not present.

    This test is designed to test Results methods only.

    :param cols: names of the columns `cmd` declares as required
    :param cmd: name of the `tacl.Results` method to run

    """
    input_results = (
        ['AB', '2', 'T1', 'base', '4', 'A'],
        ['AB', '2', 'T1', 'a', '3', 'A'],
        ['AB', '2', 'T2', 'base', '2', 'A'],
        ['ABC', '3', 'T1', 'base', '2', 'A'],
        ['ABC', '3', 'T1', 'a', '0', 'A'],
        ['AB', '2', 'T3', 'base', '2', 'B'],
        ['BC', '2', 'T1', 'base', '3', 'A'],
    )
    # Remove each query field in turn by renaming it to "dummy", then
    # check how `cmd` behaves on the resulting malformed results.
    for col in tacl.constants.QUERY_FIELDNAMES:
        fs = list(tacl.constants.QUERY_FIELDNAMES[:])
        index = fs.index(col)
        fs[index] = 'dummy'
        fh = self._create_csv(input_results, fieldnames=fs)
        results = tacl.Results(fh, self._tokenizer)
        if col in cols:
            # A declared-required column is missing: the method must
            # raise MalformedResultsError.
            self.assertRaises(MalformedResultsError,
                              getattr(results, cmd), *args, **kwargs)
        else:
            try:
                getattr(results, cmd)(*args, **kwargs)
            except MalformedResultsError:
                self.fail(
                    'Results.{} improperly raises MalformedResultsError '
                    'when column "{}" not present in results'.format(
                        cmd, col))
            except KeyError as e:
                # A KeyError naming the removed column means the method
                # depends on it without declaring it as required.
                if str(e).strip('"\'') == col:
                    self.fail(
                        'Results.{} requires column "{}" but does not '
                        'specify this.'.format(cmd, col))
def test_excise(self):
    """Tests that excise removes every result row whose n-gram contains
    the supplied n-gram."""
    input_results = (
        ['AB', '2', 'T1', 'wit1', '4', 'A'],
        ['AC', '2', 'T1', 'wit1', '3', 'A'],
        ['ABde', '3', 'T1', 'wit1', '1', 'A'],
        ['dDe', '3', 'T1', 'wit1', '2', 'A'],
        ['Dde', '3', 'T1', 'wit1', '1', 'A'],
        ['ABdeD', '4', 'T1', 'wit1', '1', 'A'],
        ['deAB', '3', 'T2', 'wit1', '2', 'B'],
        ['deAB', '3', 'T2', 'wit2', '2', 'B'],
    )
    fh = self._create_csv(input_results)
    results = tacl.Results(fh, self._tokenizer)
    results.excise('de')
    # 'dDe' survives because 'de' does not occur in it contiguously.
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('AB', '2', 'T1', 'wit1', '4', 'A'),
        ('AC', '2', 'T1', 'wit1', '3', 'A'),
        ('dDe', '3', 'T1', 'wit1', '2', 'A'),
    ]
    actual_rows = self._get_rows_from_results(results)
    self.assertEqual(actual_rows, expected_rows)
def _perform_reduce(self, input_data, tokenizer):
    """Runs reduce over `input_data` and returns the resulting rows.

    :param input_data: result rows to reduce
    :param tokenizer: tokenizer to interpret the n-grams with
    :rtype: `list`
    """
    results = tacl.Results(self._create_csv(input_data), tokenizer)
    results.reduce()
    return self._get_rows_from_results(results)
def test_reciprocal_remove(self):
    """Tests that reciprocal_remove keeps only those n-grams that occur
    with a non-zero count under every label."""
    input_data = (['AB', '2', 'a', 'base', '5', 'A'],
                  ['ABCDEF', '6', 'a', 'base', '7', 'A'],
                  ['DEF', '3', 'a', 'base', '2', 'A'],
                  ['GHIJ', '4', 'a', 'base', '3', 'A'],
                  ['KLM', '3', 'b', 'base', '0', 'A'],
                  ['ABCDEF', '6', 'b', 'base', '3', 'B'],
                  ['GHIJ', '4', 'b', 'base', '2', 'B'],
                  ['KLM', '3', 'b', 'base', '17', 'B'])
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    results.reciprocal_remove()
    # KLM is dropped: its only occurrence under label A has count 0.
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABCDEF', '6', 'a', 'base', '7', 'A'),
        ('GHIJ', '4', 'a', 'base', '3', 'A'),
        ('ABCDEF', '6', 'b', 'base', '3', 'B'),
        ('GHIJ', '4', 'b', 'base', '2', 'B')
    ]
    actual_rows = self._get_rows_from_results(results)
    # Compared as sets, so row order is not asserted.
    self.assertEqual(set(actual_rows), set(expected_rows))
    # More than two labels, and more than one text per label.
    input_data = (['AB', '2', 'a', 'base', '5', 'A'],
                  ['ABCDEF', '6', 'a', 'base', '7', 'A'],
                  ['DEF', '3', 'a', 'base', '2', 'A'],
                  ['AB', '2', 'b', 'base', '6', 'A'],
                  ['GHIJ', '4', 'b', 'base', '3', 'A'],
                  ['KLM', '3', 'b', 'base', '0', 'A'],
                  ['ABCDEF', '6', 'c', 'base', '3', 'B'],
                  ['KLM', '3', 'c', 'base', '17', 'B'],
                  ['GHIJ', '4', 'd', 'base', '2', 'B'],
                  ['KLM', '3', 'e', 'base', '3', 'C'],
                  ['GHIJ', '4', 'f', 'base', '11', 'C'],
                  ['ABCDEF', '6', 'g', 'base', '8', 'C'])
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    results.reciprocal_remove()
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABCDEF', '6', 'a', 'base', '7', 'A'),
        ('GHIJ', '4', 'b', 'base', '3', 'A'),
        ('ABCDEF', '6', 'c', 'base', '3', 'B'),
        ('GHIJ', '4', 'd', 'base', '2', 'B'),
        ('GHIJ', '4', 'f', 'base', '11', 'C'),
        ('ABCDEF', '6', 'g', 'base', '8', 'C')
    ]
    actual_rows = self._get_rows_from_csv(
        results.csv(io.StringIO(newline='')))
    self.assertEqual(set(actual_rows), set(expected_rows))
    # Now with variants.
    input_data = (['AB', '2', 'a', 'base', '5', 'A'],
                  ['ABCDEF', '6', 'a', 'wit1', '7', 'A'],
                  ['DEF', '3', 'a', 'base', '2', 'A'],
                  ['AB', '2', 'b', 'base', '6', 'A'],
                  ['GHIJ', '4', 'b', 'base', '3', 'A'],
                  ['KLM', '3', 'b', 'base', '0', 'A'],
                  ['ABCDEF', '6', 'c', 'base', '3', 'B'],
                  ['KLM', '3', 'c', 'base', '17', 'B'],
                  ['GHIJ', '4', 'd', 'base', '2', 'B'],
                  ['KLM', '3', 'e', 'base', '3', 'C'],
                  ['GHIJ', '4', 'f', 'wit2', '11', 'C'],
                  ['ABCDEF', '6', 'g', 'base', '8', 'C'])
    fh = self._create_csv(input_data)
    results = tacl.Results(fh, self._tokenizer)
    results.reciprocal_remove()
    expected_rows = [
        tacl.constants.QUERY_FIELDNAMES,
        ('ABCDEF', '6', 'a', 'wit1', '7', 'A'),
        ('GHIJ', '4', 'b', 'base', '3', 'A'),
        ('ABCDEF', '6', 'c', 'base', '3', 'B'),
        ('GHIJ', '4', 'd', 'base', '2', 'B'),
        ('GHIJ', '4', 'f', 'wit2', '11', 'C'),
        ('ABCDEF', '6', 'g', 'base', '8', 'C')
    ]
    actual_rows = self._get_rows_from_csv(
        results.csv(io.StringIO(newline='')))
    self.assertEqual(set(actual_rows), set(expected_rows))