def test_Index_exact_match_ngrams_templates_perfect_minimalist(self):
        index_doc = [u'name is joker, {{}} name is joker']
        idx = index.Index(ngram_len=3)
        idx.index_one('tst', text_lines(index_doc), template=True)

        query_doc = [u'Hi my name is joker the joker name is joker yes.']
        #              012345678901234567890123456789012345678901234567
        #                        11111111112222222222333333333344444444
        expected = {
            'tst': [(Token(start=0,
                           start_line=0,
                           start_char=0,
                           end_line=0,
                           end_char=33,
                           end=5),
                     Token(start=2,
                           start_line=0,
                           start_char=6,
                           end_line=0,
                           end_char=43,
                           end=9))]
        }

        matches = idx.match(text_lines(query_doc))

        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
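
The Token pairs above quantify the template gap: the index side spans six tokens (end=5) while the query side spans eight (start=2 through end=9), so the {{}} gap absorbed the two extra query tokens 'the joker'. As an analogy only, a hedged sketch of the matching semantics in regex terms; the Index is token-based and does not use regexes:

import re

# Hedged analogy: a {{}} template gap acts like a bounded wildcard
# between two literal token runs. The bound of 5 filler words is
# arbitrary; this only illustrates the matching semantics.
gap_template = re.compile(r'name is joker(?:\W+\w+){0,5}?\W+name is joker')
assert gap_template.search(u'Hi my name is joker the joker name is joker yes.')
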
Example #2
    def test_Index_match_unigrams_perfect(self):
        test_docs = self.get_test_docs('index/bsd')
        idx = self.get_test_index(test_docs, ngram_len=1, template=False)
        test_query_doc = self.get_test_loc('index/queryperfect')

        expected = {
            'bsd-new': [
                (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
                 Token(start=0, start_line=5, start_char=0, end_line=11, end_char=753, end=212))
            ]
        }

        test = dict(idx.match(text_lines(test_query_doc), perfect=True))

        self.assertNotEqual({}, test)
        for k, val in test.items():
            assert expected[k] == val

        with codecs.open(test_query_doc, encoding='utf-8') as td:
            actual = td.read().splitlines(True)
            expected = u''.join(actual[5:-2])[:-2]
            query_match_pos = test['bsd-new'][0][-1]
            tst = analysis.doc_subset(text_lines(location=test_query_doc), query_match_pos)
            tst = u''.join(tst)
            assert expected == tst
    def test_Index_exact_match_to_indexed_template_with_short_tokens_around_gaps(
            self):
        # was failing when a gapped token (from a template) starts at a
        # beginning of an index doc and at a position less than ngram length

        # setup
        idx = index.Index(ngram_len=4)
        index_doc = text_lines(self.get_test_loc('index/templates/idx.txt'))
        idx.index_one('idx', index_doc, template=True)

        # test index
        quad_grams_index = idx._get_index_for_len(4)
        assert 205 == len(quad_grams_index)
        assert u'software without prior written' in quad_grams_index

        # test match
        query_doc = text_lines(self.get_test_loc('index/templates/query.txt'))
        matches = idx.match(query_doc)
        assert 1 == len(matches)

        # we expect a single match to the idx doc
        matched_query_doc_position = matches['idx'][0][1]
        expected = Token(start=0,
                         start_line=0,
                         start_char=0,
                         end_line=39,
                         end_char=34,
                         end=276)
        assert expected == matched_query_doc_position
Example #4
    def test_Index_exact_match_unigrams_perfect(self):
        test_docs = self.get_test_docs("index/bsd")
        idx = self.get_test_index(test_docs, ngram_len=1, template=False)
        test_query_doc = self.get_test_loc("index/queryperfect")

        expected = {
            "bsd-new": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
                    Token(start=0, start_line=5, start_char=0, end_line=11, end_char=753, end=212),
                )
            ]
        }

        matches = idx.match(text_lines(test_query_doc))

        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val

        with codecs.open(test_query_doc, encoding="utf-8") as td:
            actual = td.read().splitlines(True)
            expected = u"".join(actual[5:-2])[:-2]
            query_match_pos = matches["bsd-new"][0][-1]
            tst = analysis.doc_subset(text_lines(location=test_query_doc), query_match_pos)
            tst = u"".join(tst)
            assert expected == tst
    def test_text_lines_from_list_or_location_yield_same_results(self):
        test_file = self.get_test_loc('analysis/bsd-new')
        with open(test_file, 'rb') as inf:
            test_strings_list = inf.read().splitlines(True)

        # test when we are passing a location or a list
        from_loc = list(text_lines(location=test_file))
        from_list = list(text_lines(location=test_strings_list))
        assert from_loc == from_list
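
This equivalence is what lets the minimalist tests above pass Python lists directly where other tests pass file paths. A tiny hedged sketch of a helper (hypothetical, not part of the test suite) that leans on this duality, assuming the same text_lines import as the tests:

# Hypothetical helper relying on the list/location duality verified above:
# `source` may be a file path or a list of raw lines, and text_lines
# normalizes both into the same stream of text lines.
def check_lines(source, expected):
    assert list(text_lines(location=source)) == expected
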
Example #7
    def match(self, location, minimum_score=100):
        """
        Match the file at location against the index and return a sequence of
        LicenseMatch.
        If minimum_score is less than 100, also include approximate matches.
        """
        if DEBUG:
            print('LicenseIndex.match: location=%(location)r, minimum_score=%(minimum_score)r' % locals())

        qdoc = analysis.text_lines(location)
        if DEBUG:
            qdoc = list(qdoc)
            print(' LicenseIndex.match: Query doc has %d lines.' % len(qdoc))
            qdoc = iter(qdoc)

        exact_matches = self.license_index.match(qdoc, minimum_score=minimum_score)
        if DEBUG:
            len_exact_matches = len(exact_matches)
            print(' LicenseIndex.match: exact_matches#: %(len_exact_matches)r' % locals())

        exact_license_matches = []
        for rule_id, matched_pos in exact_matches.items():
            rule = self.rules_by_id[rule_id]
            for match in matched_pos:
                index_position, query_position = match
                lmatch = LicenseMatch(rule, query_position, index_position, score=100.00)
                exact_license_matches.append(lmatch)
        if DEBUG:
            print(' LicenseIndex.match: unfiltered exact_license_matches: %(exact_license_matches)r' % locals())
        if DEBUG_FILTER:
            print(' in EXACT: LicenseIndex.match: filtered with filter_overlapping_matches')
        filtered_exact = filter_overlapping_matches(exact_license_matches, discard_negative=True)
        return sorted(filtered_exact, key=lambda x: x.span)
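
A hedged usage sketch for this method; the LicenseIndex constructor call, the file path, and the LicenseMatch attributes accessed below are assumptions for illustration, not confirmed API:

# Hypothetical driver for LicenseIndex.match(); constructor arguments,
# the path, and the printed attributes are assumptions.
index = LicenseIndex()
for license_match in index.match('/some/path/COPYING', minimum_score=100):
    # results come back sorted by span, each tied to its matched rule
    print(license_match.rule, license_match.span, license_match.score)
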
    def test_Index_exact_match_ngrams_template_perfect_multi_index_doc_in_index(
            self):
        test_docs = self.get_test_docs('index/bsd_templates')
        idx = self.get_test_index(test_docs, ngram_len=3, template=True)
        test_query_doc = self.get_test_loc(
            'index/queryperfect_single_template')

        expected = {
            'bsd-new': [(Token(start=0,
                               start_line=0,
                               start_char=0,
                               end_line=6,
                               end_char=753,
                               end=210),
                         Token(start=4,
                               start_line=5,
                               start_char=0,
                               end_line=11,
                               end_char=753,
                               end=216))]
        }
        matches = idx.match(text_lines(test_query_doc))
        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
    def test_Index_exact_match_ngrams_perfect_single_index_doc_in_index_minimal(
            self):
        test_docs = self.get_test_docs('index/mini')
        idx = self.get_test_index(test_docs, ngram_len=3, template=False)
        test_query_doc = self.get_test_loc('index/queryperfect-mini')

        expected = {
            'bsd-new': [(Token(start=0,
                               start_line=0,
                               start_char=0,
                               end_line=0,
                               end_char=94,
                               end=13),
                         Token(start=1,
                               start_line=2,
                               start_char=0,
                               end_line=2,
                               end_char=94,
                               end=14))]
        }
        matches = idx.match(text_lines(test_query_doc))

        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
    def test_Index_exact_match_ngrams_perfect_minimalist(self):
        index_doc = [u'name is joker, name is joker']
        #                 0  1     2     3  4     5
        idx = index.Index(ngram_len=3)
        idx.index_one('tst', text_lines(index_doc), template=False)

        query_doc = [u'Hi my name is joker, name is joker yes.']
        # match         0  1   |2  3     4     5  6     7|  8
        expected = {
            'tst': [(Token(start=0,
                           start_line=0,
                           start_char=0,
                           end_line=0,
                           end_char=28,
                           end=5),
                     Token(start=2,
                           start_line=0,
                           start_char=6,
                           end_line=0,
                           end_char=34,
                           end=7))]
        }
        matches = idx.match(query_doc)

        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
Example #11
    def match(self, location, perfect=True):
        """
        Match the file at location against the index and return a sequence of
        LicenseMatch.
        If perfect is True, only return perfect matches.
        """
        if DEBUG:
            print('LicenseIndex.match: location=%(location)r, perfect=%(perfect)r ' % locals())

        qdoc = analysis.text_lines(location)
        if DEBUG:
            qdoc = list(qdoc)
            print(' LicenseIndex.match: Query doc has %d lines.'
                      % len(qdoc))
            print('  LicenseIndex.match: Query doc:')
            print(u''.join(qdoc))
            qdoc = iter(qdoc)
        matches = self.license_index.match(qdoc, perfect)

        license_matches = []
        for rule_id, matched_pos in matches.items():
            rule = self.rules_by_id[rule_id]
            for match in matched_pos:
                index_position, query_position = match
                lmatch = LicenseMatch(rule, query_position, index_position, score=100)
                license_matches.append(lmatch)
        return filter_matches(license_matches)
Example #12
    def test_Index_match_ngrams_templates_perfect_minimalist(self):
        index_doc = [u'name is joker, {{}} name is joker']
        idx = index.Index(ngram_len=3)
        idx.index_one('tst', text_lines(index_doc), template=True)

        query_doc = [u'Hi my name is joker the joker name is joker yes.']
        expected = {
            'tst': [
                (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=33, end=5),
                 Token(start=2, start_line=0, start_char=6, end_line=0, end_char=43, end=9))
            ]
        }
        test = dict(idx.match(text_lines(query_doc), perfect=True))

        self.assertNotEqual({}, test)
        for k, val in test.items():
            assert expected[k] == val
    def test_get_tokens_count(self):
        base = self.get_test_loc('index/tokens_count', copy=True)
        docids = os.listdir(base)
        idx = index.Index(ngram_len=3)
        for docid in docids:
            doc = text_lines(location=os.path.join(base, docid))
            template = docid.startswith('tmpl')
            idx.index_one(docid, doc, template=template)
        indexes = [
            (idx.indexes[1],
             set(['all', 'redistribution', 'for', 'is'])),
            (idx.indexes[2],
             set(['is allowed', 'all and', 'redistribution is', 'allowed for'])),
            (idx.indexes[3],
             set(['for all and', 'and any thing', 'is allowed for',
                  'all and any', 'redistribution is allowed', 'allowed for all'])),
        ]

        for idxi, expected_keys in indexes:
            assert expected_keys == set(idxi.keys())

        expected = {
            'plain1': 1,
            'plain2': 2,
            'plain3': 3,
            'plain4': 4,
            'plain5': 5,
            'tmpl10': 10,
            'tmpl2': 2,
            'tmpl3': 3,
            'tmpl4': 4,
            'tmpl5': 5,
            'tmpl5_2': 5,
            'tmpl6': 6,
            'tmpl7': 7,
            'tmpl8': 8,
            'tmpl9': 9,
        }

        result = {docid: idx.get_tokens_count(docid) for docid in docids}
        assert expected == result
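
The expected ngram keys above can be cross-checked by hand. A small standalone sketch of sliding-window ngram generation over one plausible token stream; the ngrams helper is illustrative, not the Index internals:

# Standalone cross-check of the trigram keys asserted above; this
# sliding-window helper is illustrative, not the Index internals.
def ngrams(tokens, ngram_len):
    for i in range(len(tokens) - ngram_len + 1):
        yield ' '.join(tokens[i:i + ngram_len])

tokens = 'redistribution is allowed for all and any thing'.split()
assert set(ngrams(tokens, 3)) == set([
    'redistribution is allowed', 'is allowed for', 'allowed for all',
    'for all and', 'all and any', 'and any thing',
])
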
def detect_copyrights(location):
    """
    Yield tuples of:
    (copyrights list, authors list, years list, holders list, start line, end line)
    detected in file at location.
    """
    detector = CopyrightDetector()
    for numbered_lines in candidate_lines(analysis.text_lines(location)):
        detected = detector.detect(numbered_lines)
        cp, auth, yr, hold, _start, _end = detected
        if any([cp, auth, yr, hold]):
            yield detected
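
A short hedged usage sketch for detect_copyrights(); the path is made up:

# Hypothetical usage of detect_copyrights(); the path is illustrative.
for copyrights, authors, years, holders, start, end in detect_copyrights('/src/NOTICE'):
    print('lines %(start)d-%(end)d: %(copyrights)r %(holders)r' % locals())
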
Example #16
    def test_Index_exact_match_ngrams_templates_perfect_minimalist(self):
        index_doc = [u"name is joker, {{}} name is joker"]
        idx = index.Index(ngram_len=3)
        idx.index_one("tst", text_lines(index_doc), template=True)

        query_doc = [u"Hi my name is joker the joker name is joker yes."]
        #              012345678901234567890123456789012345678901234567
        #                        11111111112222222222333333333344444444
        expected = {
            "tst": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=33, end=5),
                    Token(start=2, start_line=0, start_char=6, end_line=0, end_char=43, end=9),
                )
            ]
        }

        matches = idx.match(text_lines(query_doc))

        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
    def test_get_tokens_count(self):
        base = self.get_test_loc('index/tokens_count', copy=True)
        docids = os.listdir(base)
        idx = index.Index(ngram_len=3)
        for docid in docids:
            doc = text_lines(location=os.path.join(base, docid))
            template = docid.startswith('tmpl')
            idx.index_one(docid, doc, template=template)
        indexes = [
            (idx.indexes[1], set([('all',),
                                  ('redistribution',),
                                  ('for',),
                                  ('is',)
                                 ]),),
            (idx.indexes[2], set([('is', 'allowed',),
                                  ('all', 'and',),
                                  ('redistribution', 'is',),
                                  ('allowed', 'for',),
                                 ]),),
            (idx.indexes[3], set([('for', 'all', 'and',),
                                  ('and', 'any', 'thing',),
                                  ('is', 'allowed', 'for',),
                                  ('all', 'and', 'any',),
                                  ('redistribution', 'is', 'allowed',),
                                  ('allowed', 'for', 'all',),
                                 ]),)
        ]

        for idxi, expected_keys in indexes:
            assert expected_keys == set(idxi.keys())

        expected = {
            'plain1': 1,
            'plain2': 2,
            'plain3': 3,
            'plain4': 4,
            'plain5': 5,
            'tmpl10': 10,
            'tmpl2': 2,
            'tmpl3': 3,
            'tmpl4': 4,
            'tmpl5': 5,
            'tmpl5_2': 5,
            'tmpl6': 6,
            'tmpl7': 7,
            'tmpl8': 8,
            'tmpl9': 9
        }

        result = {docid: idx.get_tokens_count(docid) for docid in docids}
        assert expected == result
Example #18
    def text(self):
        """
        Return the rule text loaded from its file.
        """
        # used for test only
        if self._text:
            return self._text

        elif self.text_file and exists(self.text_file):
            # IMPORTANT: use the same process as query text loading for symmetry
            lines = text_lines(self.text_file, demarkup=False)
            return ''.join(lines)
        else:
            raise Exception('Inconsistent rule text for:', self.identifier)
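
The cache-or-load pattern in text() is generic. A minimal self-contained sketch with a hypothetical LazyText class (not the real Rule), assuming text_lines is importable as in the snippet above:

from os.path import exists

class LazyText(object):
    # Hypothetical illustration of the pattern used by Rule.text():
    # prefer the in-memory value, else load the file through the same
    # text_lines() pipeline used for query docs, else fail loudly.
    def __init__(self, text=None, text_file=None):
        self._text = text
        self.text_file = text_file

    def text(self):
        if self._text:
            return self._text
        if self.text_file and exists(self.text_file):
            return ''.join(text_lines(self.text_file, demarkup=False))
        raise Exception('Inconsistent rule text for:', self.text_file)
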
Example #20
    def test_Index_exact_match_to_indexed_template_with_short_tokens_around_gaps(self):
        # was failing when a gapped token (from a template) starts at a
        # beginning of an index doc and at a position less than ngram length

        # setup
        idx = index.Index(ngram_len=4)
        index_doc = text_lines(self.get_test_loc("index/templates/idx.txt"))
        idx.index_one("idx", index_doc, template=True)

        # test index
        quad_grams_index = idx._get_index_for_len(4)
        assert 205 == len(quad_grams_index)
        assert u"software without prior written" in quad_grams_index

        # test match
        query_doc = text_lines(self.get_test_loc("index/templates/query.txt"))
        matches = idx.match(query_doc)
        assert 1 == len(matches)

        # we expect a single match to the idx doc
        matched_query_doc_position = matches["idx"][0][1]
        expected = Token(start=0, start_line=0, start_char=0, end_line=39, end_char=34, end=276)
        assert expected == matched_query_doc_position
Example #21
    def test_get_tokens_count(self):
        base = self.get_test_loc("index/tokens_count", copy=True)
        docids = os.listdir(base)
        idx = index.Index(ngram_len=3)
        for docid in docids:
            doc = text_lines(location=os.path.join(base, docid))
            template = docid.startswith("tmpl")
            idx.index_one(docid, doc, template=template)
        indexes = [
            (idx.indexes[1], set(["all", "redistribution", "for", "is"])),
            (idx.indexes[2], set(["is allowed", "all and", "redistribution is", "allowed for"])),
            (
                idx.indexes[3],
                set(
                    [
                        "for all and",
                        "and any thing",
                        "is allowed for",
                        "all and any",
                        "redistribution is allowed",
                        "allowed for all",
                    ]
                ),
            ),
        ]

        for idxi, expected_keys in indexes:
            assert expected_keys == set(idxi.keys())

        expected = {
            "plain1": 1,
            "plain2": 2,
            "plain3": 3,
            "plain4": 4,
            "plain5": 5,
            "tmpl10": 10,
            "tmpl2": 2,
            "tmpl3": 3,
            "tmpl4": 4,
            "tmpl5": 5,
            "tmpl5_2": 5,
            "tmpl6": 6,
            "tmpl7": 7,
            "tmpl8": 8,
            "tmpl9": 9,
        }

        result = {docid: idx.get_tokens_count(docid) for docid in docids}
        assert expected == result
    def test_Index_match_ngrams_perfect_single_index_doc_in_index_minimal(self):
        test_docs = self.get_test_docs('index/mini')
        idx = self.get_test_index(test_docs, ngram_len=3, template=False)
        test_query_doc = self.get_test_loc('index/queryperfect-mini')

        expected = {
            'bsd-new': [
                (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=94, end=13),
                 Token(start=1, start_line=2, start_char=0, end_line=2, end_char=94, end=14))
            ]
        }
        test = dict(idx.match(text_lines(test_query_doc), perfect=True))

        assert {} != test
        for k, val in test.items():
            assert expected[k] == val
    def test_Index_exact_match_return_one_match_with_correct_offsets(self):
        index_doc = [u'A one. A two. A three.']
        idx = index.Index(ngram_len=4)
        idx.index_one('tst', text_lines(index_doc), template=False)
        query_doc = [u'some junk. A one. A two. A three.']
        #                         1111111111222222222233
        #              012345678901234567890123456789012

        matches = idx.match(query_doc)
        match = matches['tst']
        assert 1 == len(match)
        index_pos, query_pos = match[0]
        assert 11 == query_pos.start_char
        assert 32 == query_pos.end_char
        assert 0 == index_pos.start_char
        assert 21 == index_pos.end_char
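
The offsets asserted above imply that start_char/end_char behave like a Python slice over the line (end exclusive, trailing punctuation excluded since '.' is not a token). A hedged sketch of recovering the matched text from those offsets:

# Hedged sketch: recovering matched text from Token char offsets, using
# the offsets asserted in the test above. end_char appears to behave
# like an exclusive slice bound; that is inferred, not documented.
line = u'some junk. A one. A two. A three.'
start_char, end_char = 11, 32  # query_pos offsets from the assertions
assert line[start_char:end_char] == u'A one. A two. A three'  # final '.' is not a token
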
Example #24
    def test_Index_exact_match_return_one_match_with_correct_offsets(self):
        index_doc = [u"A one. A two. A three."]
        idx = index.Index(ngram_len=4)
        idx.index_one("tst", text_lines(index_doc), template=False)
        query_doc = [u"some junk. A one. A two. A three."]
        #                         1111111111222222222233
        #              012345678901234567890123456789012

        matches = idx.match(query_doc)
        match = matches["tst"]
        assert 1 == len(match)
        index_pos, query_pos = match[0]
        assert 11 == query_pos.start_char
        assert 32 == query_pos.end_char
        assert 0 == index_pos.start_char
        assert 21 == index_pos.end_char
Example #25
    def test_Index_match_ngrams_template_perfect_multi_index_doc_in_index(self):
        test_docs = self.get_test_docs('index/bsd_templates')
        idx = self.get_test_index(test_docs, ngram_len=3, template=True)
        test_query_doc = self.get_test_loc('index/queryperfect_single_template')

        expected = {
            'bsd-new': [
                 (Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=210),
                  Token(start=4, start_line=5, start_char=0, end_line=11, end_char=753, end=216))
            ]
        }

        test = dict(idx.match(text_lines(test_query_doc), perfect=True))

        self.assertNotEqual({}, test)
        for k, val in test.items():
            assert expected[k] == val
Example #26
    def match(self, location, minimum_score=100):
        """
        Match the file at location against the index and return a sequence of
        LicenseMatch.
        If minimum_score is less than 100, also include approximate matches.
        """
        if DEBUG:
            print(
                'LicenseIndex.match: location=%(location)r, minimum_score=%(minimum_score)r'
                % locals())

        qdoc = analysis.text_lines(location)
        if DEBUG:
            qdoc = list(qdoc)
            print(' LicenseIndex.match: Query doc has %d lines.' % len(qdoc))
            qdoc = iter(qdoc)

        exact_matches = self.license_index.match(qdoc,
                                                 minimum_score=minimum_score)
        if DEBUG:
            len_exact_matches = len(exact_matches)
            print(
                ' LicenseIndex.match: exact_matches#: %(len_exact_matches)r' %
                locals())

        exact_license_matches = []
        for rule_id, matched_pos in exact_matches.items():
            rule = self.rules_by_id[rule_id]
            for match in matched_pos:
                index_position, query_position = match
                lmatch = LicenseMatch(rule,
                                      query_position,
                                      index_position,
                                      score=100.00)
                exact_license_matches.append(lmatch)
        if DEBUG:
            print(
                ' LicenseIndex.match: unfiltered exact_license_matches: %(exact_license_matches)r'
                % locals())
        if DEBUG_FILTER:
            print(
                ' in EXACT: LicenseIndex.match: filtered with filter_overlapping_matches'
            )
        filtered_exact = filter_overlapping_matches(exact_license_matches,
                                                    discard_negative=False)
        return sorted(filtered_exact, key=lambda x: x.span)
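
filter_overlapping_matches itself is not shown in these examples. A minimal sketch of one plausible overlap filter, assuming each LicenseMatch carries a query-side span as a (start, end) pair and that longer matches should win; this is a guess at the intent, not the real implementation:

# Plausible sketch of an overlap filter in the spirit of
# filter_overlapping_matches; assumes match.span is a (start, end) pair
# on the query side and keeps longer matches first. Illustrative only.
def filter_overlapping(matches):
    kept = []
    for m in sorted(matches, key=lambda m: m.span[1] - m.span[0], reverse=True):
        if all(m.span[1] <= k.span[0] or m.span[0] >= k.span[1] for k in kept):
            kept.append(m)
    return kept
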
Example #27
    def test_Index_match_simple(self):
        test_docs = self.get_test_docs('index/bsd')
        idx = self.get_test_index(test_docs, ngram_len=1)
        test_query_doc = self.get_test_loc('index/querysimple')
        expected = {
            'bsd-new':
                [(Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
                  Token(start=0, start_line=4, start_char=0, end_line=12, end_char=607, end=212))
                ],
            'bsd-no-mod':
                [(Token(start=0, start_line=0, start_char=0, end_line=0, end_char=49, end=7),
                  Token(start=0, start_line=4, start_char=0, end_line=4, end_char=49, end=7))
                ],
            'bsd-original':
                [(Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=29, start_line=6, start_char=59, end_line=6, end_char=68, end=29)
                 ),
                 (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=47, start_line=7, start_char=62, end_line=7, end_char=71, end=47)
                 ),
                 (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=103, start_line=10, start_char=33, end_line=10, end_char=42, end=103)
                 ),
                 (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=137, start_line=12, start_char=117, end_line=12, end_char=126, end=137)
                 )
                ],
            'bsd-original-uc':
                [(Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=29, start_line=6, start_char=59, end_line=6, end_char=68, end=29)),
                 (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=47, start_line=7, start_char=62, end_line=7, end_char=71, end=47)),
                 (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=103, start_line=10, start_char=33, end_line=10, end_char=42, end=103)),
                 (Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                  Token(start=137, start_line=12, start_char=117, end_line=12, end_char=126, end=137))
                ],
            'bsd-simplified':
                [(Token(start=0, start_line=0, start_char=3, end_line=7, end_char=73, end=67),
                  Token(start=0, start_line=4, start_char=0, end_line=7, end_char=207, end=67))
                 ]
        }

        test = dict(idx.match(text_lines(test_query_doc), perfect=True))
        for k, val in test.items():
            assert expected[k] == val
Example #28
    def test_Index_exact_match_ngrams_template_perfect_multi_index_doc_in_index(self):
        test_docs = self.get_test_docs("index/bsd_templates")
        idx = self.get_test_index(test_docs, ngram_len=3, template=True)
        test_query_doc = self.get_test_loc("index/queryperfect_single_template")

        expected = {
            "bsd-new": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=210),
                    Token(start=4, start_line=5, start_char=0, end_line=11, end_char=753, end=216),
                )
            ]
        }
        matches = idx.match(text_lines(test_query_doc))
        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
Example #29
    def test_Index_exact_match_ngrams_perfect_single_index_doc_in_index_minimal(self):
        test_docs = self.get_test_docs("index/mini")
        idx = self.get_test_index(test_docs, ngram_len=3, template=False)
        test_query_doc = self.get_test_loc("index/queryperfect-mini")

        expected = {
            "bsd-new": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=94, end=13),
                    Token(start=1, start_line=2, start_char=0, end_line=2, end_char=94, end=14),
                )
            ]
        }
        matches = idx.match(text_lines(test_query_doc))

        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
Example #30
def find(location, patterns):
    """
    Yield matches and matched lines for patterns found in the file at
    location, as tuples of (key, found text, text line). `patterns` is a
    list of (key, compiled regex) tuples.

    Note: the location can be a list of lines for testing convenience.
    """
    if DEBUG:
        loc = pformat(location)
        print('find(location=%(loc)r,\n  patterns=%(patterns)r)' % locals())

    for line in analysis.text_lines(location):
        for key, pattern in patterns:
            for match in pattern.findall(line):

                if DEBUG:
                    print('find: yielding match: key=%(key)r, '
                          'match=%(match)r,\n    line=%(line)r' % locals())
                yield key, unicode(match), line
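
A hedged usage sketch for find(); the pattern key, regex, and path are made up:

import re

# Hypothetical usage of find(); key, regex, and path are illustrative.
patterns = [('email', re.compile(r'[\w.+-]+@[\w-]+\.[\w.]+'))]
for key, found, line in find('/some/file.txt', patterns):
    print('%(key)s: %(found)s' % locals())
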
Example #32
    def test_Index_exact_match_ngrams_perfect_minimalist(self):
        index_doc = [u"name is joker, name is joker"]
        #                 0  1     2     3  4     5
        idx = index.Index(ngram_len=3)
        idx.index_one("tst", text_lines(index_doc), template=False)

        query_doc = [u"Hi my name is joker, name is joker yes."]
        # match         0  1   |2  3     4     5  6     7|  8
        expected = {
            "tst": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=28, end=5),
                    Token(start=2, start_line=0, start_char=6, end_line=0, end_char=34, end=7),
                )
            ]
        }
        matches = idx.match(query_doc)

        assert {} != matches
        for k, val in matches.items():
            assert expected[k] == val
def query_lines(location=None, query_string=None, strip=True):
    """
    Return an iterable of text lines given a file at `location` or a
    `query_string`. Include empty lines.
    """
    # TODO: OPTIMIZE: tokenizing line by line may be rather slow
    # we could instead get lines and tokens at once in a batch?
    lines = []
    if location:
        lines = text_lines(location, demarkup=False)
    elif query_string:
        keepends = not strip
        lines = query_string.splitlines(keepends)

    for line in lines:
        if strip:
            yield line.strip()
        else:
            yield line
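
A hedged usage sketch for query_lines() on an in-memory string; the input is made up, and the expected output follows from the default strip=True stripping each line while preserving empty ones:

# Hypothetical usage of query_lines(); with the default strip=True,
# lines are stripped but empty lines are kept.
result = list(query_lines(query_string=u'  Hello \n\n world '))
assert result == [u'Hello', u'', u'world']
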
Example #35
    def test_Index_exact_match_simple(self):
        test_docs = self.get_test_docs("index/bsd")
        idx = self.get_test_index(test_docs, ngram_len=1)
        test_query_doc = self.get_test_loc("index/querysimple")
        expected = {
            "bsd-new": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=6, end_char=753, end=212),
                    Token(start=0, start_line=4, start_char=0, end_line=12, end_char=607, end=212),
                )
            ],
            "bsd-no-mod": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=49, end=7),
                    Token(start=0, start_line=4, start_char=0, end_line=4, end_char=49, end=7),
                )
            ],
            "bsd-original": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=29, start_line=6, start_char=59, end_line=6, end_char=68, end=29),
                ),
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=47, start_line=7, start_char=62, end_line=7, end_char=71, end=47),
                ),
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=103, start_line=10, start_char=33, end_line=10, end_char=42, end=103),
                ),
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=137, start_line=12, start_char=117, end_line=12, end_char=126, end=137),
                ),
            ],
            "bsd-original-uc": [
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=29, start_line=6, start_char=59, end_line=6, end_char=68, end=29),
                ),
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=47, start_line=7, start_char=62, end_line=7, end_char=71, end=47),
                ),
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=103, start_line=10, start_char=33, end_line=10, end_char=42, end=103),
                ),
                (
                    Token(start=0, start_line=0, start_char=0, end_line=0, end_char=9, end=0),
                    Token(start=137, start_line=12, start_char=117, end_line=12, end_char=126, end=137),
                ),
            ],
            "bsd-simplified": [
                (
                    Token(start=0, start_line=0, start_char=3, end_line=7, end_char=73, end=67),
                    Token(start=0, start_line=4, start_char=0, end_line=7, end_char=207, end=67),
                )
            ],
        }

        matches = idx.match(text_lines(test_query_doc))
        for k, val in matches.items():
            assert expected[k] == val
Example #36
    def get_test_docs(self, base, subset=None):
        base = self.get_test_loc(base, copy=True)
        for docid in os.listdir(base):
            if (subset and docid in subset) or not subset:
                yield docid, text_lines(location=os.path.join(base, docid))

    def test_some_media_do_yield_text_lines(self):
        test_dir = self.get_test_loc('media_with_text')
        for test_file in file_iter(test_dir):
            result = list(text_lines(test_file))
            assert result, 'Should return text lines:' + test_file
            assert any('nexb' in l for l in result)

    def test_some_media_do_not_yield_text_lines(self):
        test_dir = self.get_test_loc('media_without_text')
        for test_file in file_iter(test_dir):
            result = list(text_lines(test_file))
            assert [] == result, 'Should not return text lines:' + test_file

    def test_archives_do_not_yield_text_lines(self):
        test_file = self.get_test_loc('archive/simple.jar')
        result = list(text_lines(test_file))
        assert [] == result