Example #1
0
 def testManyIdenticalCountPrefixes(self):
     """
     When a chain of progressively longer substrings all share the true
     positive fraction of a shorter substring they extend, only the
     shortest one must be kept; the longer ones are counted as inferior.
     """
     lines = [
         'd 10 40',
         'abc 21 21',
         'abcd 21 21',
         'abcde 21 21',
         'abcdef 21 21',
         'abcdefg 21 21',
         'abcdefgh 21 21',
     ]
     # Five of the seven inputs merely extend 'abc' without improving on
     # its fraction, so only 'abc' and the unrelated 'd' are expected.
     expected = {
         'fractionTooLow': 0,
         'inferior': 5,
         'inputCount': 7,
         'notEnoughTruePositives': 0,
         'substrings': [
             ('abc', (21, 21, 0.5)),
             ('d', (10, 40, 0.2)),
         ],
     }
     self.assertEqual(expected, selectSubstringsForAhoCorasick(lines))
Example #2
0
 def testMaxSubstringsNonZero(self):
     """
     Passing a non-zero maxSubstrings must limit the number of substrings
     returned to that value.
     """
     lines = [
         'abc 20 20',
         'hijkl 20 60',
         'defg 20 60',
         'worst 1 9',
         'best 10 0',
         'stuvwx 20 60',
         'mnopqr 20 60',
         'm12345 20 60',
     ]
     # Eight inputs are given but only four substrings are expected back.
     expected = {
         'fractionTooLow': 0,
         'inferior': 0,
         'inputCount': 8,
         'notEnoughTruePositives': 0,
         'substrings': [
             ('best', (10, 0, 1.0)),
             ('abc', (20, 20, 0.5)),
             ('defg', (20, 60, 0.25)),
             ('hijkl', (20, 60, 0.25)),
         ],
     }
     actual = selectSubstringsForAhoCorasick(lines, maxSubstrings=4)
     self.assertEqual(expected, actual)
Example #3
0
 def testEmpty(self):
     """
     With no input substrings, every count in the result must be zero and
     the list of selected substrings must be empty.
     """
     expected = {
         'fractionTooLow': 0,
         'inferior': 0,
         'inputCount': 0,
         'notEnoughTruePositives': 0,
         'substrings': [],
     }
     actual = selectSubstringsForAhoCorasick([])
     self.assertEqual(expected, actual)
Example #4
0
 def testIdenticalFractionSubstringTwoShorter(self):
     """
     A substring whose true positive fraction equals that of a
     subsubstring two characters shorter must be left out of the results
     and counted as inferior.
     """
     lines = [
         'abc 21 21',
         'abcde 21 21',
     ]
     expected = {
         'fractionTooLow': 0,
         'inferior': 1,
         'inputCount': 2,
         'notEnoughTruePositives': 0,
         'substrings': [
             ('abc', (21, 21, 0.5)),
         ],
     }
     self.assertEqual(expected, selectSubstringsForAhoCorasick(lines))
Example #5
0
 def testAllowAll(self):
     """
     With no restriction given on the number or fraction of true
     positives, every input substring must be returned.
     """
     lines = [
         'abc 21 21',
         'def 20 80',
     ]
     expected = {
         'fractionTooLow': 0,
         'inferior': 0,
         'inputCount': 2,
         'notEnoughTruePositives': 0,
         'substrings': [
             ('abc', (21, 21, 0.5)),
             ('def', (20, 80, 0.2)),
         ],
     }
     self.assertEqual(expected, selectSubstringsForAhoCorasick(lines))
Example #6
0
 def testNonIdenticalFractionSubstringOneShorter(self):
     """
     A substring with a better true positive fraction than a subsubstring
     one character shorter must be kept in the results alongside the
     shorter one.
     """
     lines = [
         'abc 20 60',
         'abcd 21 21',
     ]
     # 'abcd' (0.5) improves on 'abc' (0.25), so neither is inferior.
     expected = {
         'fractionTooLow': 0,
         'inferior': 0,
         'inputCount': 2,
         'notEnoughTruePositives': 0,
         'substrings': [
             ('abcd', (21, 21, 0.5)),
             ('abc', (20, 60, 0.25)),
         ],
     }
     self.assertEqual(expected, selectSubstringsForAhoCorasick(lines))
Example #7
0
 def testTruePositiveFraction(self):
     """
     When a minimum true positive fraction is given, substrings falling
     below it must be excluded and counted under 'fractionTooLow'.
     """
     lines = [
         'abc 21 21',
         'def 20 80',
         'ghi 10 80',
     ]
     # Only 'abc' (0.5) reaches the 0.3 threshold used below.
     expected = {
         'fractionTooLow': 2,
         'inferior': 0,
         'inputCount': 3,
         'notEnoughTruePositives': 0,
         'substrings': [
             ('abc', (21, 21, 0.5)),
         ],
     }
     actual = selectSubstringsForAhoCorasick(
         lines, minTruePositiveFraction=0.3)
     self.assertEqual(expected, actual)
Example #8
0
 def testMaxSubstringsZero(self):
     """
     Passing maxSubstrings=0 must result in no substrings being returned,
     while the input count still reflects everything that was read.
     """
     lines = [
         'abc 20 20',
         'hijkl 20 60',
         'defg 20 60',
         'worst 1 9',
         'best 10 0',
         'stuvwx 20 60',
         'mnopqr 20 60',
         'm12345 20 60',
     ]
     expected = {
         'fractionTooLow': 0,
         'inferior': 0,
         'inputCount': 8,
         'notEnoughTruePositives': 0,
         'substrings': [],
     }
     actual = selectSubstringsForAhoCorasick(lines, maxSubstrings=0)
     self.assertEqual(expected, actual)
Example #9
0
 def testTruePositiveCountAndFraction(self):
     """
     When both a minimum true positive count and a minimum true positive
     fraction are given, only substrings satisfying both must be returned
     and the rejection counts must be reported.
     """
     lines = [
         'abc 21 21',
         'def 20 80',
         'ghi 10 80',
         'jkl 30 10',
     ]
     # 'def' and 'ghi' have fewer than 21 true positives; 'abc' has enough
     # but its fraction (0.5) is under 0.7. Only 'jkl' passes both limits.
     expected = {
         'fractionTooLow': 1,
         'inferior': 0,
         'inputCount': 4,
         'notEnoughTruePositives': 2,
         'substrings': [
             ('jkl', (30, 10, 0.75)),
         ],
     }
     actual = selectSubstringsForAhoCorasick(
         lines, minTruePositives=21, minTruePositiveFraction=0.7)
     self.assertEqual(expected, actual)
Example #10
0
 def testSort(self):
     """
     The substrings in the result must be ordered by true positive
     fraction (highest first), then by length (shortest first), and
     finally alphabetically.
     """
     lines = [
         'abc 20 20',
         'hijkl 20 60',
         'defg 20 60',
         'worst 1 9',
         'best 10 0',
         'stuvwx 20 60',
         'mnopqr 20 60',
         'm12345 20 60',
     ]
     # The five substrings with fraction 0.25 exercise the length and
     # alphabetical tie-breakers.
     expected = {
         'fractionTooLow': 0,
         'inferior': 0,
         'inputCount': 8,
         'notEnoughTruePositives': 0,
         'substrings': [
             ('best', (10, 0, 1.0)),
             ('abc', (20, 20, 0.5)),
             ('defg', (20, 60, 0.25)),
             ('hijkl', (20, 60, 0.25)),
             ('m12345', (20, 60, 0.25)),
             ('mnopqr', (20, 60, 0.25)),
             ('stuvwx', (20, 60, 0.25)),
             ('worst', (1, 9, 0.1)),
         ],
     }
     self.assertEqual(expected, selectSubstringsForAhoCorasick(lines))
# Optional flag: print the counts and fraction after each selected substring.
parser.add_argument(
    '--printCounts',
    action='store_true',
    default=False,
    help='If True, the true positive count, false positive count, and true '
         'positive count / (false positive count + true positive count) '
         'fraction will be printed after each substring.')

# Optional flag: print a summary of how the input substrings were handled.
parser.add_argument(
    '--printSummary',
    action='store_true',
    default=False,
    help='If True, print a summary of substring processing to show how many '
         'substrings were considered and what their fates were.')

args = parser.parse_args()

# Substring lines are read from standard input; the selection limits come
# from the command line.
result = selectSubstringsForAhoCorasick(
    sys.stdin,
    args.minTruePositives,
    args.minTruePositiveFraction,
    args.maxSubstrings)

# Print one line per selected substring, with its counts only on request.
for substring, counts in result['substrings']:
    if args.printCounts:
        line = '%s %d %d %f' % (substring, counts[0], counts[1], counts[2])
    else:
        line = substring
    print(line)

if args.printSummary:
    printSummary(result)