Example #1
def test_length_of_vectors(new_vectors_path: Path) -> None:
    """
    Create a new database, add several files, and query their combined lengths.
    """

    examples = dict(file_a=to_source_vector(b'print("hello, world!")'),
                    file_b=to_source_vector(b'import sys; sys.exit(0)'),
                    file_c=to_source_vector(b'print(934 * 2 * 3442990 + 1)'))

    # Insert all the examples.
    vectors = Vectors.from_filename(new_vectors_path)
    for name, vector in examples.items():
        vectors[name] = vector
    vectors.disconnect()

    # Reopen it and test the length
    vectors = Vectors.from_filename(new_vectors_path)

    # Test fetching all of them.
    expected = sum(len(vec) for vec in examples.values())
    assert expected == vectors.length_of_vectors({'file_a', 'file_b', 'file_c'})

    # Check that we can query an empty set.
    assert 0 == vectors.length_of_vectors(())

    # Check that we can query a single item.
    assert len(examples['file_a']) == vectors.length_of_vectors({'file_a'})

    # Check that we can query a subset.
    expected = sum(len(examples[name]) for name in ('file_a', 'file_c'))
    assert expected == vectors.length_of_vectors({'file_a', 'file_c'})
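
A hypothetical sketch of how a store like Vectors could answer
length_of_vectors() with a single SQL query. The class, table name, schema,
and one-byte-per-token encoding below are all assumptions for illustration,
not the project's actual layout.

import sqlite3
from typing import Iterable


class ToySQLiteVectors:
    """Hypothetical: one row per file, the vector stored as a raw BLOB."""

    def __init__(self, path: str) -> None:
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS vector (name TEXT PRIMARY KEY, array BLOB)')

    def __setitem__(self, name: str, vector: bytes) -> None:
        with self.conn:  # Commit eagerly so a later reconnect sees the write.
            self.conn.execute('INSERT OR REPLACE INTO vector VALUES (?, ?)',
                              (name, vector))

    def length_of_vectors(self, names: Iterable[str]) -> int:
        names = tuple(names)
        if not names:
            return 0
        placeholders = ', '.join('?' for _ in names)
        total, = self.conn.execute(
            f'SELECT SUM(LENGTH(array)) FROM vector WHERE name IN ({placeholders})',
            names).fetchone()
        return total or 0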
Example #2
def test_source_vector_unk_conversion():
    """
    Out-of-vocabulary tokens raise OutOfVocabularyError unless oov_to_unk=True
    maps them to the <unk> token instead.
    """
    problematic_source = b'class _ { # }'
    with pytest.raises(OutOfVocabularyError):
        to_source_vector(problematic_source)

    vector = to_source_vector(problematic_source, oov_to_unk=True)
    assert 5 == len(vector)
    assert current_language.vocabulary.unk_token_index == vector[1] == vector[3]
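
A toy illustration of the oov_to_unk behaviour exercised above. The vocabulary
and names here are invented; only the raise-versus-substitute pattern mirrors
the real to_source_vector().

from typing import Dict, List


class OutOfVocabularyError(KeyError):
    """Raised when a token has no entry in the vocabulary."""


UNK_INDEX = 0
VOCAB: Dict[str, int] = {'<unk>': UNK_INDEX, 'class': 1, '{': 2, '}': 3}


def encode(tokens: List[str], oov_to_unk: bool = False) -> List[int]:
    indices = []
    for token in tokens:
        if token in VOCAB:
            indices.append(VOCAB[token])
        elif oov_to_unk:
            indices.append(UNK_INDEX)  # Silently map unknowns to <unk>.
        else:
            raise OutOfVocabularyError(token)
    return indices


# Mirrors the test: '_' and '#' (positions 1 and 3) are out of vocabulary.
assert encode(['class', '_', '{', '#', '}'], oov_to_unk=True) == [1, 0, 2, 0, 3]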
Example #3
def test(dirname: Optional[Path] = None) -> None:
    """
    Smoke test: load a trained dual LSTM model and print its per-token
    forwards/backwards predictions for a small Java file that contains a
    deliberate syntax error.
    """
    from sensibility._paths import REPOSITORY_ROOT
    from sensibility.source_vector import to_source_vector
    if dirname is None:
        dirname = REPOSITORY_ROOT / 'tests'

    language.set('java')
    model = KerasDualLSTMModel.from_directory(dirname)
    source = to_source_vector(rb'''
        package ca.ualberta.cs;

        class HelloWorld {
            public static void main(String args[] /* Syntax error, delete token[19] to fix */ ... ) {
                System.out.println("Hello, World!");
            }
        }
    ''')

    answer = model.predict_file(source)
    assert len(answer) == len(source)
    text = language.vocabulary.to_source_text
    for expected, predictions in zip(source, answer):
        actual_fw = text(predictions.forwards.argmax())  # type: ignore
        actual_bw = text(predictions.backwards.argmax())  # type: ignore
        print(f"{actual_fw:>14}\t{actual_bw:>14}\t{text(expected)}")
Example #4
def test_insert(c) -> None:
    # The annotation below is missing its closing '}'; inserting one at
    # index 7 makes the source syntactically valid again.
    source_code = to_source_vector(b"""
    @SuppressWarnings({"fake", 0x1.8p1)
    class Hello {}
    """)
    edit = Insertion(7, to_index(c('}')))
    mutant = edit.apply(source_code)
    expected = b'@ ident ( { "string" , 0.0 } ) class ident { }'
    actual = mutant.to_source_code()
    assert expected == actual
    assert language.check_syntax(actual)
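
A plausible implementation of the Insertion edit used above, assuming a source
vector behaves like an immutable sequence of vocabulary indices. This is a
hypothetical sketch, not the project's Edit class.

from dataclasses import dataclass
from typing import Tuple


@dataclass(frozen=True)
class ToyInsertion:
    index: int  # Position at which the new token is spliced in.
    token: int  # Vocabulary index of the inserted token.

    def apply(self, vector: Tuple[int, ...]) -> Tuple[int, ...]:
        return vector[:self.index] + (self.token,) + vector[self.index:]


# Inserting token 99 at position 2:
assert ToyInsertion(2, 99).apply((0, 1, 2, 3)) == (0, 1, 99, 2, 3)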
Example #5
    def fix(self, source_file: bytes) -> Sequence[Edit]:
        """
        Produces a ranked sequence of possible edits that will fix the file.
        If there are no possible fixes, the sequence will be empty.
        """
        # Get the file vector for the erroneous file.
        file_vector = to_source_vector(source_file, oov_to_unk=True)
        tokens = tuple(language.tokenize(source_file))
        predictions = self.model.predict_file(file_vector)

        # Holds the models' agreement with the token at each point in the file.
        results: List[IndexResult] = []

        for index, pred in enumerate(predictions):
            vind = file_vector[index]
            token = tokens[index]
            prefix_pred = pred.forwards
            suffix_pred = pred.backwards

            # Figure out the agreement between models, and against the ground
            # truth.
            result = IndexResult(index, file_vector, prefix_pred, suffix_pred,
                                 token, vind)
            results.append(result)

        # Rank the results by the agreement metric defined by IndexResult
        # (the top-ranked result agrees LEAST, so it is the prime suspect).
        ranked_results = tuple(sorted(results, key=float))

        # For the top-k disagreements, synthesize fixes.
        # NOTE: k should be determined by the xentropy of the models!
        fixes = Fixes(file_vector)
        for disagreement in ranked_results[:self.k]:
            pos = disagreement.index

            likely_tokens = disagreement.best_suggestions()

            # Note: the order of these operations SHOULDN'T matter,
            # but typically we only report the first fix that works.
            # Because missing tokens are the most common error,
            # we try to insert tokens first, THEN delete.

            # Assume a deletion. Let's try inserting some tokens.
            for likely_token in likely_tokens:
                fixes.try_insert(pos, likely_token)

            # Assume an insertion. Let's try removing the offensive token.
            fixes.try_delete(pos)

            # Assume a substitution. Let's try swapping the token.
            for likely_token in likely_tokens:
                fixes.try_substitute(pos, likely_token)

        return tuple(fixes)
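
sorted(results, key=float) above works only if IndexResult converts to float.
A minimal sketch of that pattern, with an invented agreement score standing in
for whatever metric IndexResult actually computes:

from dataclasses import dataclass


@dataclass
class ToyIndexResult:
    index: int
    agreement: float  # Hypothetical: how much the models agree with the token.

    def __float__(self) -> float:
        return self.agreement


results = [ToyIndexResult(0, 0.9), ToyIndexResult(1, 0.1), ToyIndexResult(2, 0.5)]
# Lowest agreement sorts first: the least-similar position is the prime suspect.
assert [r.index for r in sorted(results, key=float)] == [1, 2, 0]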
Example #6
def test_delete(c) -> None:
    # The class below has one '}' too many; deleting the token at index 3
    # restores valid syntax.
    source_code = to_source_vector(b"""
    class Hello {
        }
    }
    """)
    edit = Deletion(3, to_index(c('}')))
    mutant = edit.apply(source_code)
    expected = b'class ident { }'
    actual = mutant.to_source_code()
    assert expected == actual
    assert language.check_syntax(actual)
Example #7
def test_substitution(c) -> None:
    # An annotation element name must be an identifier, not a string literal;
    # substituting the token at index 3 restores valid syntax.
    source_code = to_source_vector(b"""
    @SuppressWarnings("fake"=0x1.8p1)
    class Hello {}
    """)
    edit = Substitution(3,
                        original_token=to_index(c('"fake"')),
                        replacement=to_index(c('ident')))
    mutant = edit.apply(source_code)
    expected = b'@ ident ( ident = 0.0 ) class ident { }'
    actual = mutant.to_source_code()
    assert expected == actual
    assert language.check_syntax(actual)
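
The Deletion and Substitution edits used in the last two tests can be sketched
the same way as ToyInsertion above: pure functions over an immutable tuple of
vocabulary indices (again hypothetical, not the project's code).

from dataclasses import dataclass
from typing import Tuple


@dataclass(frozen=True)
class ToyDeletion:
    index: int
    original: int  # Vocabulary index of the token being removed.

    def apply(self, vector: Tuple[int, ...]) -> Tuple[int, ...]:
        return vector[:self.index] + vector[self.index + 1:]


@dataclass(frozen=True)
class ToySubstitution:
    index: int
    original_token: int
    replacement: int

    def apply(self, vector: Tuple[int, ...]) -> Tuple[int, ...]:
        return vector[:self.index] + (self.replacement,) + vector[self.index + 1:]


assert ToyDeletion(3, 9).apply((0, 1, 2, 9, 9)) == (0, 1, 2, 9)
assert ToySubstitution(1, 7, 8).apply((0, 7, 2)) == (0, 8, 2)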
Example #8
def test_creates_file(new_vectors_path: Path) -> None:
    """
    Create a new vector database, and test that reconnecting to it persists
    changes.
    """
    hello_vector = to_source_vector(b'print("hello, world!")')
    vectors = Vectors.from_filename(new_vectors_path)
    vectors['hello'] = hello_vector
    vectors.disconnect()

    vectors = Vectors.from_filename(new_vectors_path)
    assert hello_vector == vectors['hello']

    with pytest.raises(KeyError):
        vectors['non-existent']
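
The persistence round trip this test relies on can be demonstrated with plain
sqlite3, assuming (as in the sketch after Example #1) that the store is a
SQLite file; none of this is the project's actual schema.

import os
import sqlite3
import tempfile

path = os.path.join(tempfile.mkdtemp(), 'vectors.sqlite3')

# First connection: create the table and write one vector.
conn = sqlite3.connect(path)
conn.execute('CREATE TABLE vector (name TEXT PRIMARY KEY, array BLOB)')
with conn:  # Commit before disconnecting.
    conn.execute('INSERT INTO vector VALUES (?, ?)', ('hello', b'\x01\x02\x03'))
conn.close()  # Analogous to vectors.disconnect().

# Second connection: the write persisted across the reconnect.
conn = sqlite3.connect(path)
row = conn.execute('SELECT array FROM vector WHERE name = ?',
                   ('hello',)).fetchone()
assert row is not None and row[0] == b'\x01\x02\x03'

# A missing name comes back as None; a mapping wrapper like Vectors would
# turn that into the KeyError the test expects.
assert conn.execute('SELECT array FROM vector WHERE name = ?',
                    ('non-existent',)).fetchone() is None
conn.close()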