Exemple #1
0
def test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF():
    """Exercise the "dependencyPathEdgesNearEntities" feature with TF-IDF off.

    The vectorizer is fitted on the first generated corpus and applied to
    the second. Matrix shapes, feature names and per-column sums are then
    compared with hard-coded expectations.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    builder = kindred.CandidateBuilder()
    builder.fit_transform(trainCorpus)
    builder.transform(testCorpus)

    vectorizer = kindred.Vectorizer(
        featureChoice=["dependencyPathEdgesNearEntities"], tfidf=False)

    trainMatrix = vectorizer.fit_transform(trainCorpus)
    testMatrix = vectorizer.transform(testCorpus)

    assert trainMatrix.shape == (8, 6)
    assert testMatrix.shape == (18, 6)

    expectedNames = [
        u'dependencypathnearselectedtoken_0_dobj',
        u'dependencypathnearselectedtoken_0_nsubj',
        u'dependencypathnearselectedtoken_0_nsubjpass',
        u'dependencypathnearselectedtoken_1_dobj',
        u'dependencypathnearselectedtoken_1_nsubj',
        u'dependencypathnearselectedtoken_1_nsubjpass',
    ]
    assert vectorizer.getFeatureNames() == expectedNames

    # The quick sanity check compares column totals (np.sum), not means.
    trainTotals = np.sum(trainMatrix, axis=0)
    assert trainTotals.tolist() == [[1.0, 2.0, 1.0, 1.0, 2.0, 1.0]]

    testTotals = np.sum(testMatrix, axis=0)
    assert testTotals.tolist() == [[0.0, 5.0, 1.0, 0.0, 5.0, 1.0]]
Exemple #2
0
def test_vectorizer_defaults():
    """Run a default-configured Vectorizer and pass its column sums to check().

    Both generated corpora are parsed and turned into candidate relations.
    The vectorizer is fitted on the first set and applied to the second.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    for corpus in (trainCorpus, testCorpus):
        parser.parse(corpus)

    builder = kindred.CandidateBuilder()
    trainCandidates = builder.build(trainCorpus)
    testCandidates = builder.build(testCorpus)

    vectorizer = kindred.Vectorizer()
    trainMatrix = vectorizer.fit_transform(trainCandidates)
    testMatrix = vectorizer.transform(testCandidates)

    featureNames = vectorizer.getFeatureNames()

    def summedColumns(matrix):
        # Map each feature name to its column total, rounded to 8 places.
        totals = np.sum(matrix, axis=0).tolist()[0]
        return {name: round(total, 8)
                for name, total in zip(featureNames, totals)}

    check('test_vectorizer_defaults_1', summedColumns(trainMatrix))
    check('test_vectorizer_defaults_2', summedColumns(testMatrix))
Exemple #3
0
def test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF():
    """Check the "dependencyPathEdgesNearEntities" feature with TF-IDF off.

    Both generated corpora are parsed and turned into candidate relations.
    The vectorizer is fitted on the first set and applied to the second,
    and the per-feature column sums of each matrix are passed to check().
    """
    corpus1, _ = generateTestData(positiveCount=5, negativeCount=5)
    corpus2, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder()
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    chosenFeatures = ["dependencyPathEdgesNearEntities"]
    vectorizer = kindred.Vectorizer(featureChoice=chosenFeatures, tfidf=False)

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    colnames = vectorizer.getFeatureNames()

    # As a quick check, confirm the column sums of the fitted matrix.
    colsums1 = np.sum(matrix1, axis=0).tolist()[0]
    namedCols1 = {col: round(v, 8) for col, v in zip(colnames, colsums1)}
    check('test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_1', namedCols1)

    # The second check must look at the transformed matrix (matrix2); it
    # previously summed matrix1 again, leaving matrix2 unverified.
    # NOTE(review): check() is defined elsewhere; if it compares against
    # recorded values, the "_2" record may need refreshing - confirm.
    colsums2 = np.sum(matrix2, axis=0).tolist()[0]
    namedCols2 = {col: round(v, 8) for col, v in zip(colnames, colsums2)}
    check('test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_2', namedCols2)
Exemple #4
0
def test_simpleVectorizer_triple():
    """Vectorize three-entity candidates using only the "entityTypes" feature.

    Verifies the matrix shape, the exact set of non-zero cells and that each
    of those cells holds 1.0.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer> which targets <gene id="3">EGFR</gene>. <relation type="druginfo" drug="1" disease="2" gene="3" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)
    kindred.Parser().parse(corpus)

    builder = kindred.CandidateBuilder(entityCount=3)
    candidates = builder.build(corpus)

    # Only the entity-type features are requested.
    vectorizer = kindred.Vectorizer(entityCount=3,
                                    featureChoice=["entityTypes"])
    vectors = vectorizer.fit_transform(candidates)

    assert vectors.shape == (6, 9)

    expectedCells = [
        (0, 1), (0, 3), (0, 8),
        (1, 1), (1, 5), (1, 6),
        (2, 0), (2, 4), (2, 8),
        (3, 0), (3, 5), (3, 7),
        (4, 2), (4, 4), (4, 6),
        (5, 2), (5, 3), (5, 7),
    ]

    nonzeroRows, nonzeroCols = vectors.nonzero()
    observedCells = sorted(zip(nonzeroRows.tolist(), nonzeroCols.tolist()))
    assert sorted(expectedCells) == observedCells

    csr = vectors.tocsr()
    for row, col in expectedCells:
        assert csr[row, col] == 1.0
Exemple #5
0
def test_vectorizer_bigrams_noTFIDF():
    """Check the "bigrams" feature with TF-IDF disabled.

    The vectorizer is fitted on the first generated corpus and applied to
    the second. Shapes, feature names and column sums are compared with
    hard-coded expectations.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    builder = kindred.CandidateBuilder()
    builder.fit_transform(trainCorpus)
    builder.transform(testCorpus)

    vectorizer = kindred.Vectorizer(featureChoice=["bigrams"], tfidf=False)

    trainMatrix = vectorizer.fit_transform(trainCorpus)
    testMatrix = vectorizer.transform(testCorpus)

    assert trainMatrix.shape == (8, 27)
    assert testMatrix.shape == (18, 27)

    expectedNames = [
        u'bigrams_ _gnorcyvmer', u'bigrams_a_common', u'bigrams_a_known',
        u'bigrams_be_treated', u'bigrams_bmzvpvwbpw_failed',
        u'bigrams_can_be', u'bigrams_clinical_trials',
        u'bigrams_common_treatment', u'bigrams_effect_of',
        u'bigrams_failed_clinical', u'bigrams_for_kyekjnkrfo',
        u'bigrams_for_zgwivlcmly', u'bigrams_gnorcyvmer_is',
        u'bigrams_is_a', u'bigrams_known_side', u'bigrams_kyekjnkrfo_.',
        u'bigrams_of_ruswdgzajr', u'bigrams_ootopaoxbg_can',
        u'bigrams_pehhjnlvvewbjccovflf_is', u'bigrams_ruswdgzajr_.',
        u'bigrams_side_effect', u'bigrams_treated_with',
        u'bigrams_treatment_for', u'bigrams_trials_for',
        u'bigrams_vgypkemhjr_.', u'bigrams_with_vgypkemhjr',
        u'bigrams_zgwivlcmly_.',
    ]
    assert vectorizer.getFeatureNames() == expectedNames

    def assertColumnSums(matrix, expected):
        # Compare rounded values to sidestep floating point noise.
        totals = np.sum(matrix, axis=0).tolist()[0]
        assert len(expected) == len(totals)
        for got, want in zip(totals, expected):
            assert round(got, 8) == round(want, 8)

    expectedTrain = [4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 8.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]
    assertColumnSums(trainMatrix, expectedTrain)

    expectedTest = [0.0, 4.0, 0.0, 4.0, 4.0, 4.0, 8.0, 4.0, 0.0, 8.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.0, 0.0, 4.0, 4.0, 0.0, 0.0, 4.0, 4.0, 8.0, 4.0, 4.0, 0.0]
    assertColumnSums(testMatrix, expectedTest)
Exemple #6
0
def test_vectorizer_defaults():
    """Check a default-configured Vectorizer against hard-coded expectations.

    The vectorizer is fitted on the first generated corpus and applied to
    the second. Shapes, feature names and column sums are verified.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    builder = kindred.CandidateBuilder()
    builder.fit_transform(trainCorpus)
    builder.transform(testCorpus)

    # NOTE(review): the returned candidates are not used anywhere below.
    trainCorpus.getCandidateRelations()

    vectorizer = kindred.Vectorizer()

    trainMatrix = vectorizer.fit_transform(trainCorpus)
    testMatrix = vectorizer.transform(testCorpus)

    assert trainMatrix.shape == (8, 61)
    assert testMatrix.shape == (18, 61)

    expectedNames = [u'selectedtokentypes_0_disease', u'selectedtokentypes_0_disease2', u'selectedtokentypes_0_drug', u'selectedtokentypes_1_disease', u'selectedtokentypes_1_disease2', u'selectedtokentypes_1_drug', u'ngrams_betweenentities_a', u'ngrams_betweenentities_be', u'ngrams_betweenentities_can', u'ngrams_betweenentities_clinical', u'ngrams_betweenentities_common', u'ngrams_betweenentities_effect', u'ngrams_betweenentities_failed', u'ngrams_betweenentities_for', u'ngrams_betweenentities_is', u'ngrams_betweenentities_known', u'ngrams_betweenentities_of', u'ngrams_betweenentities_side', u'ngrams_betweenentities_treated', u'ngrams_betweenentities_treatment', u'ngrams_betweenentities_trials', u'ngrams_betweenentities_with', u'bigrams_ _gnorcyvmer', u'bigrams_a_common', u'bigrams_a_known', u'bigrams_be_treated', u'bigrams_bmzvpvwbpw_failed', u'bigrams_can_be', u'bigrams_clinical_trials', u'bigrams_common_treatment', u'bigrams_effect_of', u'bigrams_failed_clinical', u'bigrams_for_kyekjnkrfo', u'bigrams_for_zgwivlcmly', u'bigrams_gnorcyvmer_is', u'bigrams_is_a', u'bigrams_known_side', u'bigrams_kyekjnkrfo_.', u'bigrams_of_ruswdgzajr', u'bigrams_ootopaoxbg_can', u'bigrams_pehhjnlvvewbjccovflf_is', u'bigrams_ruswdgzajr_.', u'bigrams_side_effect', u'bigrams_treated_with', u'bigrams_treatment_for', u'bigrams_trials_for', u'bigrams_vgypkemhjr_.', u'bigrams_with_vgypkemhjr', u'bigrams_zgwivlcmly_.', u'dependencypathelements_attr', u'dependencypathelements_dobj', u'dependencypathelements_nsubj', u'dependencypathelements_nsubjpass', u'dependencypathelements_pobj', u'dependencypathelements_prep', u'dependencypathnearselectedtoken_0_dobj', u'dependencypathnearselectedtoken_0_nsubj', u'dependencypathnearselectedtoken_0_nsubjpass', u'dependencypathnearselectedtoken_1_dobj', u'dependencypathnearselectedtoken_1_nsubj', u'dependencypathnearselectedtoken_1_nsubjpass']
    assert vectorizer.getFeatureNames() == expectedNames

    def assertColumnSums(matrix, expected):
        # Compare rounded values to sidestep floating point noise.
        totals = np.sum(matrix, axis=0).tolist()[0]
        assert len(expected) == len(totals)
        for got, want in zip(totals, expected):
            assert round(got, 8) == round(want, 8)

    expectedTrain = [2.0, 2.0, 4.0, 2.0, 2.0, 4.0, 1.4519522547520485, 1.0, 1.0, 1.0581526716744893, 1.037330992404908, 0.8817459917627732, 1.0581526716744893, 1.5854195824122916, 1.4519522547520485, 0.8817459917627732, 0.8817459917627732, 0.8817459917627732, 1.0, 1.037330992404908, 1.0581526716744893, 1.0, 0.6830902801437798, 0.7801302536256829, 0.6830902801437798, 0.8164965809277259, 0.8164965809277259, 0.8164965809277259, 0.8164965809277259, 0.7801302536256829, 0.6830902801437798, 0.8164965809277259, 0.8164965809277259, 0.7801302536256829, 0.6830902801437798, 1.1070563456981333, 0.6830902801437798, 0.8164965809277259, 0.6830902801437798, 0.8164965809277259, 0.7801302536256829, 0.6830902801437798, 0.6830902801437798, 0.8164965809277259, 0.7801302536256829, 0.8164965809277259, 0.8164965809277259, 0.8164965809277259, 0.7801302536256829, 4.0, 2.0, 4.0, 2.0, 8.0, 8.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0]
    assertColumnSums(trainMatrix, expectedTrain)

    expectedTest = [5.0, 4.0, 9.0, 5.0, 4.0, 9.0, 0.7848330659854781, 1.0, 1.0, 2.1163053433489787, 1.037330992404908, 0.0, 2.1163053433489787, 2.386006098839105, 1.99154811934689, 0.0, 1.594941622753311, 0.0, 1.0, 1.037330992404908, 2.1163053433489787, 1.0, 0.0, 1.0581526716744893, 0.0, 0.8164965809277259, 1.0, 0.8164965809277259, 2.1547005383792515, 1.0581526716744893, 0.0, 2.1547005383792515, 0.0, 0.0, 0.0, 0.8005865164268136, 0.0, 2.0, 0.0, 0.8164965809277259, 2.0, 0.0, 0.0, 0.8164965809277259, 1.0581526716744893, 2.1547005383792515, 0.8164965809277259, 0.8164965809277259, 0.0, 4.0, 4.0, 16.0, 2.0, 10.0, 10.0, 0.0, 5.0, 1.0, 0.0, 5.0, 1.0]
    assertColumnSums(testMatrix, expectedTest)
def test_candidatebuilder_simple():
    """Build two-entity candidates for a two-sentence corpus and verify them.

    Checks the relation types recorded on the corpus, the candidate classes,
    and the entity ID ordering of each of the four candidates.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer>. <drug id="3">Aspirin</drug> is the main cause of <disease id="4">boneitis</disease>. <relation type="treats" subj="1" obj="2" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    builder = kindred.CandidateBuilder()
    builder.fit_transform(corpus)

    assert corpus.relationTypes == [('treats', 'obj', 'subj')]
    candidates = corpus.getCandidateRelations(2)
    classes = corpus.getCandidateClasses(2)

    assert classes == [[0], [1], [0], [0]]
    assert len(candidates) == 4

    # Translates the ids used in the tagged text into the corpus's entity IDs.
    idLookup = corpus.documents[0].getSourceEntityIDsToEntityIDs()

    expectedPairs = [('1', '2'), ('2', '1'), ('3', '4'), ('4', '3')]
    for candidate, sourceIDs in zip(candidates, expectedPairs):
        assert candidate.entityIDs == [idLookup[sid] for sid in sourceIDs]
Exemple #8
0
def test_vectorizer_unigramsBetweenEntities_noTFIDF():
    """Check the "unigramsBetweenEntities" feature with TF-IDF disabled.

    The vectorizer is fitted on the first generated corpus and applied to
    the second. Shapes, feature names and column sums are compared with
    hard-coded expectations.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    builder = kindred.CandidateBuilder()
    builder.fit_transform(trainCorpus)
    builder.transform(testCorpus)

    vectorizer = kindred.Vectorizer(featureChoice=["unigramsBetweenEntities"],
                                    tfidf=False)

    matrix1 = vectorizer.fit_transform(trainCorpus)
    matrix2 = vectorizer.transform(testCorpus)

    assert matrix1.shape == (8, 16)
    assert matrix2.shape == (18, 16)

    expectedNames = [
        u'ngrams_betweenentities_a', u'ngrams_betweenentities_be',
        u'ngrams_betweenentities_can', u'ngrams_betweenentities_clinical',
        u'ngrams_betweenentities_common', u'ngrams_betweenentities_effect',
        u'ngrams_betweenentities_failed', u'ngrams_betweenentities_for',
        u'ngrams_betweenentities_is', u'ngrams_betweenentities_known',
        u'ngrams_betweenentities_of', u'ngrams_betweenentities_side',
        u'ngrams_betweenentities_treated', u'ngrams_betweenentities_treatment',
        u'ngrams_betweenentities_trials', u'ngrams_betweenentities_with',
    ]
    assert vectorizer.getFeatureNames() == expectedNames

    def assertColumnSums(matrix, expected):
        # Compare rounded values to sidestep floating point noise.
        totals = np.sum(matrix, axis=0).tolist()[0]
        assert len(expected) == len(totals)
        for got, want in zip(totals, expected):
            assert round(got, 8) == round(want, 8)

    expected1 = [4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]
    assertColumnSums(matrix1, expected1)

    # NOTE(review): the second comparison sums matrix1 again, so only the
    # shape of matrix2 is ever verified. Pointing it at matrix2 would need
    # fresh expected values (expected2 currently duplicates expected1).
    expected2 = [4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]
    assertColumnSums(matrix1, expected2)
Exemple #9
0
def test_vectorizer_entityTypes_noTFIDF():
    """Check the "entityTypes" feature with TF-IDF disabled.

    Both generated corpora are parsed and turned into candidate relations.
    The vectorizer is fitted on the first set and applied to the second.
    Shapes, feature names and column sums are verified.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    for corpus in (trainCorpus, testCorpus):
        parser.parse(corpus)

    builder = kindred.CandidateBuilder()
    trainCandidates = builder.build(trainCorpus)
    testCandidates = builder.build(testCorpus)

    vectorizer = kindred.Vectorizer(featureChoice=["entityTypes"], tfidf=False)

    trainMatrix = vectorizer.fit_transform(trainCandidates)
    testMatrix = vectorizer.transform(testCandidates)

    assert trainMatrix.shape == (8, 6)
    assert testMatrix.shape == (18, 6)

    expectedNames = [
        'selectedtokentypes_0_disease',
        'selectedtokentypes_0_disease2',
        'selectedtokentypes_0_drug',
        'selectedtokentypes_1_disease',
        'selectedtokentypes_1_disease2',
        'selectedtokentypes_1_drug',
    ]
    assert vectorizer.getFeatureNames() == expectedNames

    # The quick sanity check compares column totals (np.sum), not means.
    trainTotals = np.sum(trainMatrix, axis=0)
    assert trainTotals.tolist() == [[2, 2, 4, 2, 2, 4]]

    testTotals = np.sum(testMatrix, axis=0)
    assert testTotals.tolist() == [[5, 4, 9, 5, 4, 9]]
Exemple #10
0
def test_simpleVectorizer_binary():
    """Vectorize two-entity candidates using only the "entityTypes" feature.

    Verifies the matrix shape, the exact set of non-zero cells and that each
    of those cells holds 1.0.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer>. <drug id="3">Aspirin</drug> is the main cause of <disease id="4">boneitis</disease> . <relation type="treats" subj="1" obj="2" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)
    kindred.Parser().parse(corpus)

    builder = kindred.CandidateBuilder()
    candidates = builder.build(corpus)

    # Only the entity-type features are requested.
    vectorizer = kindred.Vectorizer(featureChoice=["entityTypes"])
    vectors = vectorizer.fit_transform(candidates)

    assert vectors.shape == (4, 6)

    expectedCells = [
        (0, 2), (1, 0), (2, 2), (3, 1),
        (0, 3), (1, 5), (2, 4), (3, 5),
    ]

    nonzeroRows, nonzeroCols = vectors.nonzero()
    observedCells = sorted(zip(nonzeroRows.tolist(), nonzeroCols.tolist()))
    assert sorted(expectedCells) == observedCells

    csr = vectors.tocsr()
    for row, col in expectedCells:
        assert csr[row, col] == 1.0
Exemple #11
0
def test_vectorizer_unigramsBetweenEntities():
    """Check the "unigramsBetweenEntities" feature with TF-IDF enabled.

    Both generated corpora are parsed and turned into candidate relations.
    The vectorizer is fitted on the first set and applied to the second.
    Shapes, feature names and column sums are verified.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    for corpus in (trainCorpus, testCorpus):
        parser.parse(corpus)

    builder = kindred.CandidateBuilder()
    trainCandidates = builder.build(trainCorpus)
    testCandidates = builder.build(testCorpus)

    vectorizer = kindred.Vectorizer(featureChoice=["unigramsBetweenEntities"],
                                    tfidf=True)

    matrix1 = vectorizer.fit_transform(trainCandidates)
    matrix2 = vectorizer.transform(testCandidates)

    assert matrix1.shape == (8, 16)
    assert matrix2.shape == (18, 16)

    expectedNames = [
        u'ngrams_betweenentities_a',
        u'ngrams_betweenentities_be',
        u'ngrams_betweenentities_can',
        u'ngrams_betweenentities_clinical',
        u'ngrams_betweenentities_common',
        u'ngrams_betweenentities_effect',
        u'ngrams_betweenentities_failed',
        u'ngrams_betweenentities_for',
        u'ngrams_betweenentities_is',
        u'ngrams_betweenentities_known',
        u'ngrams_betweenentities_of',
        u'ngrams_betweenentities_side',
        u'ngrams_betweenentities_treated',
        u'ngrams_betweenentities_treatment',
        u'ngrams_betweenentities_trials',
        u'ngrams_betweenentities_with',
    ]
    assert vectorizer.getFeatureNames() == expectedNames

    def assertColumnSums(matrix, expected):
        # Compare rounded values to sidestep floating point noise.
        totals = np.sum(matrix, axis=0).tolist()[0]
        assert len(expected) == len(totals)
        for got, want in zip(totals, expected):
            assert round(got, 8) == round(want, 8)

    expected1 = [
        1.4519522547520485, 1.0, 1.0, 1.0581526716744893, 1.037330992404908,
        0.8817459917627732, 1.0581526716744893, 1.5854195824122916,
        1.4519522547520485, 0.8817459917627732, 0.8817459917627732,
        0.8817459917627732, 1.0, 1.037330992404908, 1.0581526716744893, 1.0
    ]
    assertColumnSums(matrix1, expected1)

    # NOTE(review): the second comparison sums matrix1 again, so only the
    # shape of matrix2 is ever verified. Pointing it at matrix2 would need
    # fresh expected values (expected2 currently duplicates expected1).
    expected2 = [
        1.4519522547520485, 1.0, 1.0, 1.0581526716744893, 1.037330992404908,
        0.8817459917627732, 1.0581526716744893, 1.5854195824122916,
        1.4519522547520485, 0.8817459917627732, 0.8817459917627732,
        0.8817459917627732, 1.0, 1.037330992404908, 1.0581526716744893, 1.0
    ]
    assertColumnSums(matrix1, expected2)
def test_candidatebuilder_simple():
    """Build two-entity candidates from a parsed corpus and verify them.

    Checks the candidate count and type, the source entity IDs of each
    candidate in order, and which candidate carries the known "treats"
    relation.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer>. <drug id="3">Aspirin</drug> is the main cause of <disease id="4">boneitis</disease>. <relation type="treats" subj="1" obj="2" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)
    kindred.Parser().parse(corpus)

    builder = kindred.CandidateBuilder()
    candidates = builder.build(corpus)

    assert len(candidates) == 4

    for candidate in candidates:
        assert isinstance(candidate, kindred.CandidateRelation)
        assert len(candidate.entities) == 2

    expectedSourceIDs = [['1', '2'], ['2', '1'], ['3', '4'], ['4', '3']]
    for index, sourceIDs in enumerate(expectedSourceIDs):
        entities = candidates[index].entities
        assert [entity.sourceEntityID for entity in entities] == sourceIDs

    # Only the second candidate is expected to carry the annotated relation.
    expectedKnown = [[], [('treats', ['obj', 'subj'])], [], []]
    for index, known in enumerate(expectedKnown):
        assert candidates[index].knownTypesAndArgNames == known
def test_candidatebuilder_triple():
    """Build three-entity candidates for a one-sentence corpus and verify them.

    Checks the relation types recorded on the corpus, the candidate classes,
    and the entity ID ordering of each of the six candidates.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer> which targets <gene id="3">EGFR</gene>. <relation type="druginfo" drug="1" disease="2" gene="3" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    builder = kindred.CandidateBuilder(entityCount=3)
    builder.fit_transform(corpus)

    assert corpus.relationTypes == [('druginfo', 'disease', 'drug', 'gene')]
    candidates = corpus.getCandidateRelations(3)
    classes = corpus.getCandidateClasses(3)

    assert classes == [[0], [0], [1], [0], [0], [0]]
    assert len(candidates) == 6

    # Translates the ids used in the tagged text into the corpus's entity IDs.
    idLookup = corpus.documents[0].getSourceEntityIDsToEntityIDs()

    expectedOrders = [
        ('1', '2', '3'),
        ('1', '3', '2'),
        ('2', '1', '3'),
        ('2', '3', '1'),
        ('3', '1', '2'),
        ('3', '2', '1'),
    ]
    for candidate, sourceIDs in zip(candidates, expectedOrders):
        assert candidate.entityIDs == [idLookup[sid] for sid in sourceIDs]
def test_candidatebuilder_triple():
    """Build three-entity candidates from a parsed corpus and verify them.

    Checks the candidate type and arity, the source entity IDs of the first
    six candidates in order, and which candidate carries the known
    "druginfo" relation.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer> which targets <gene id="3">EGFR</gene>. <relation type="druginfo" drug="1" disease="2" gene="3" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)
    kindred.Parser().parse(corpus)

    builder = kindred.CandidateBuilder(entityCount=3)
    candidates = builder.build(corpus)

    for candidate in candidates:
        assert isinstance(candidate, kindred.CandidateRelation)
        assert len(candidate.entities) == 3

    expectedSourceIDs = [
        ['1', '2', '3'],
        ['1', '3', '2'],
        ['2', '1', '3'],
        ['2', '3', '1'],
        ['3', '1', '2'],
        ['3', '2', '1'],
    ]
    # Index explicitly so that a short candidate list still raises.
    for index, sourceIDs in enumerate(expectedSourceIDs):
        entities = candidates[index].entities
        assert [entity.sourceEntityID for entity in entities] == sourceIDs

    # Only the third candidate is expected to carry the annotated relation.
    expectedKnown = [
        [],
        [],
        [('druginfo', ['disease', 'drug', 'gene'])],
        [],
        [],
        [],
    ]
    for index, known in enumerate(expectedKnown):
        assert candidates[index].knownTypesAndArgNames == known
Exemple #15
0
def test_simpleVectorizer_triple():
    """Vectorize three-entity candidates using only the "entityTypes" feature.

    The non-zero cells of the resulting matrix are collected into a dict
    keyed by the string form of (row, col) and passed to check(), which is
    defined elsewhere.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer> which targets <gene id="3">EGFR</gene>. <relation type="druginfo" drug="1" disease="2" gene="3" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    parser = kindred.Parser()
    parser.parse(corpus)

    candidateBuilder = kindred.CandidateBuilder(entityCount=3)
    candidateRelations = candidateBuilder.build(corpus)

    # We'll just get the vectors for the entityTypes
    vectorizer = kindred.Vectorizer(entityCount=3, featureChoice=["entityTypes"])
    vectors = vectorizer.fit_transform(candidateRelations)
    vectorsCSR = vectors.tocsr()
    rows, cols = vectors.nonzero()

    # The former local `expected` dict was never compared against anything,
    # so it has been dropped; check() is the only verification performed.
    namedCols = {str((r, c)): vectorsCSR[r, c]
                 for r, c in zip(rows.tolist(), cols.tolist())}

    check('test_simpleVectorizer_triple', namedCols)
Exemple #16
0
def test_simpleVectorizer_binary():
    """Vectorize two-entity candidates using only the "entityTypes" feature.

    The non-zero cells of the resulting matrix are collected into a dict
    keyed by the string form of (row, col) and passed to check(), which is
    defined elsewhere.
    """
    text = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer>. <drug id="3">Aspirin</drug> is the main cause of <disease id="4">boneitis</disease> . <relation type="treats" subj="1" obj="2" />'

    corpus = kindred.Corpus(text, loadFromSimpleTag=True)

    parser = kindred.Parser()
    parser.parse(corpus)

    candidateBuilder = kindred.CandidateBuilder()
    candidateRelations = candidateBuilder.build(corpus)

    # We'll just get the vectors for the entityTypes
    vectorizer = kindred.Vectorizer(featureChoice=["entityTypes"])
    vectors = vectorizer.fit_transform(candidateRelations)
    vectorsCSR = vectors.tocsr()
    rows, cols = vectors.nonzero()

    # The former local `expected` dict was never compared against anything,
    # so it has been dropped; check() is the only verification performed.
    namedCols = {str((r, c)): vectorsCSR[r, c]
                 for r, c in zip(rows.tolist(), cols.tolist())}

    check('test_simpleVectorizer_binary', namedCols)
Exemple #17
0
def test_vectorizer_dependencyPathEdges_noTFIDF():
    """Check the "dependencyPathEdges" feature with TF-IDF disabled.

    Both generated corpora are parsed and turned into candidate relations.
    The vectorizer is fitted on the first set and applied to the second.
    Shapes, feature names and column sums are verified.
    """
    trainCorpus, _ = generateTestData(positiveCount=5, negativeCount=5)
    testCorpus, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    for corpus in (trainCorpus, testCorpus):
        parser.parse(corpus)

    builder = kindred.CandidateBuilder()
    trainCandidates = builder.build(trainCorpus)
    testCandidates = builder.build(testCorpus)

    vectorizer = kindred.Vectorizer(featureChoice=["dependencyPathEdges"],
                                    tfidf=False)

    trainMatrix = vectorizer.fit_transform(trainCandidates)
    testMatrix = vectorizer.transform(testCandidates)

    assert trainMatrix.shape == (8, 5)
    assert testMatrix.shape == (18, 5)

    expectedNames = [
        'dependencypathelements_attr',
        'dependencypathelements_nsubj',
        'dependencypathelements_nsubjpass',
        'dependencypathelements_pobj',
        'dependencypathelements_prep',
    ]
    assert vectorizer.getFeatureNames() == expectedNames

    # The quick sanity check compares column totals (np.sum), not means.
    trainTotals = np.sum(trainMatrix, axis=0)
    assert trainTotals.tolist() == [[4.0, 4.0, 2.0, 10.0, 8.0]]

    testTotals = np.sum(testMatrix, axis=0)
    assert testTotals.tolist() == [[4.0, 10.0, 2.0, 10.0, 10.0]]
Exemple #18
0
def test_vectorizer_defaults_triple():
    """Check the default Vectorizer features for three-entity candidates.

    A vectorizer with entityCount=3 is fitted on candidates from the smaller
    generated corpus and applied to candidates from the larger one. Matrix
    shapes, feature names and per-column sums are compared to fixed values.

    Column sums are floating point, so they are compared with an absolute
    tolerance of 1e-8. Comparing round(x, 8) values instead can fail
    spuriously when two nearly identical numbers fall on either side of a
    rounding boundary.
    """
    corpus1, _ = generateTestData(entityCount=3,
                                  positiveCount=5,
                                  negativeCount=5)
    corpus2, _ = generateTestData(entityCount=3,
                                  positiveCount=10,
                                  negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder(entityCount=3)
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    vectorizer = kindred.Vectorizer(entityCount=3)

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    assert matrix1.shape == (18, 101)
    assert matrix2.shape == (60, 101)

    colnames = vectorizer.getFeatureNames()
    expectedNames = [
        'selectedtokentypes_0_disease', 'selectedtokentypes_0_drug',
        'selectedtokentypes_0_gene', 'selectedtokentypes_1_disease',
        'selectedtokentypes_1_drug', 'selectedtokentypes_1_gene',
        'selectedtokentypes_2_disease', 'selectedtokentypes_2_drug',
        'selectedtokentypes_2_gene', 'ngrams_betweenentities_0_1_and',
        'ngrams_betweenentities_0_1_be', 'ngrams_betweenentities_0_1_by',
        'ngrams_betweenentities_0_1_can',
        'ngrams_betweenentities_0_1_fvdxdietdx',
        'ngrams_betweenentities_0_1_inhibition',
        'ngrams_betweenentities_0_1_knetvjnjun',
        'ngrams_betweenentities_0_1_targets',
        'ngrams_betweenentities_0_1_treated',
        'ngrams_betweenentities_0_1_treats',
        'ngrams_betweenentities_0_1_using',
        'ngrams_betweenentities_0_1_zkrkzlyfef',
        'ngrams_betweenentities_0_2_and', 'ngrams_betweenentities_0_2_be',
        'ngrams_betweenentities_0_2_by', 'ngrams_betweenentities_0_2_can',
        'ngrams_betweenentities_0_2_fvdxdietdx',
        'ngrams_betweenentities_0_2_inhibition',
        'ngrams_betweenentities_0_2_knetvjnjun',
        'ngrams_betweenentities_0_2_targets',
        'ngrams_betweenentities_0_2_treated',
        'ngrams_betweenentities_0_2_treats',
        'ngrams_betweenentities_0_2_using',
        'ngrams_betweenentities_0_2_zkrkzlyfef',
        'ngrams_betweenentities_1_2_and', 'ngrams_betweenentities_1_2_be',
        'ngrams_betweenentities_1_2_by', 'ngrams_betweenentities_1_2_can',
        'ngrams_betweenentities_1_2_fvdxdietdx',
        'ngrams_betweenentities_1_2_inhibition',
        'ngrams_betweenentities_1_2_knetvjnjun',
        'ngrams_betweenentities_1_2_targets',
        'ngrams_betweenentities_1_2_treated',
        'ngrams_betweenentities_1_2_treats',
        'ngrams_betweenentities_1_2_using',
        'ngrams_betweenentities_1_2_zkrkzlyfef', 'bigrams_and_targets',
        'bigrams_be_treated', 'bigrams_by_fvdxdietdx', 'bigrams_by_zkrkzlyfef',
        'bigrams_can_be', 'bigrams_elvptnpvyc_.',
        'bigrams_fvdxdietdx_inhibition', 'bigrams_hxlfssirgk_.',
        'bigrams_inhibition_using', 'bigrams_knetvjnjun_and',
        'bigrams_kyekjnkrfo_can', 'bigrams_oxzbaapqct_treats',
        'bigrams_targets_hxlfssirgk', 'bigrams_treated_by',
        'bigrams_treats_knetvjnjun', 'bigrams_usckfljzxu_.',
        'bigrams_using_elvptnpvyc', 'bigrams_using_usckfljzxu',
        'bigrams_zgwivlcmly_can', 'bigrams_zkrkzlyfef_inhibition',
        'dependencypathelements_0_1_acl', 'dependencypathelements_0_1_advmod',
        'dependencypathelements_0_1_agent',
        'dependencypathelements_0_1_compound',
        'dependencypathelements_0_1_conj', 'dependencypathelements_0_1_dobj',
        'dependencypathelements_0_1_nsubjpass',
        'dependencypathelements_0_1_pobj', 'dependencypathelements_0_2_acl',
        'dependencypathelements_0_2_advmod',
        'dependencypathelements_0_2_agent',
        'dependencypathelements_0_2_compound',
        'dependencypathelements_0_2_conj', 'dependencypathelements_0_2_dobj',
        'dependencypathelements_0_2_nsubjpass',
        'dependencypathelements_0_2_pobj', 'dependencypathelements_1_2_acl',
        'dependencypathelements_1_2_advmod',
        'dependencypathelements_1_2_agent',
        'dependencypathelements_1_2_compound',
        'dependencypathelements_1_2_conj', 'dependencypathelements_1_2_dobj',
        'dependencypathelements_1_2_nsubjpass',
        'dependencypathelements_1_2_pobj',
        'dependencypathnearselectedtoken_0_compound',
        'dependencypathnearselectedtoken_0_conj',
        'dependencypathnearselectedtoken_0_dobj',
        'dependencypathnearselectedtoken_0_nsubjpass',
        'dependencypathnearselectedtoken_1_compound',
        'dependencypathnearselectedtoken_1_conj',
        'dependencypathnearselectedtoken_1_dobj',
        'dependencypathnearselectedtoken_1_nsubjpass',
        'dependencypathnearselectedtoken_2_compound',
        'dependencypathnearselectedtoken_2_conj',
        'dependencypathnearselectedtoken_2_dobj',
        'dependencypathnearselectedtoken_2_nsubjpass'
    ]
    assert colnames == expectedNames

    # As a quick check, we'll confirm that the column sums of the fitted
    # matrix are as expected
    expected1 = [
        6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 1.4620174403662662,
        2.089911361057128, 2.089911361057128, 2.089911361057128,
        0.8510011029330441, 2.089911361057128, 0.8909306965737043,
        1.4620174403662662, 2.089911361057128, 1.4620174403662662,
        2.089911361057128, 0.8510011029330441, 1.4620174403662662,
        2.089911361057128, 2.089911361057128, 2.089911361057128,
        0.8510011029330442, 2.089911361057128, 0.8909306965737043,
        1.4620174403662662, 2.089911361057128, 1.4620174403662662,
        2.089911361057128, 0.8510011029330441, 1.4620174403662662,
        2.089911361057128, 2.089911361057128, 2.089911361057128,
        0.8510011029330442, 2.089911361057128, 0.8909306965737043,
        1.4620174403662662, 2.089911361057128, 1.4620174403662662,
        2.089911361057128, 0.8510011029330441, 2.4494897427831783,
        3.151972689633972, 2.283202494909358, 2.283202494909358,
        3.151972689633972, 2.283202494909358, 2.283202494909358,
        2.4494897427831783, 3.151972689633972, 2.4494897427831783,
        2.283202494909358, 2.4494897427831783, 2.4494897427831783,
        3.151972689633972, 2.4494897427831783, 2.283202494909358,
        2.283202494909358, 2.283202494909358, 2.283202494909358,
        2.283202494909358, 8.0, 4.0, 8.0, 8.0, 4.0, 12.0, 8.0, 8.0, 8.0, 4.0,
        8.0, 8.0, 4.0, 12.0, 8.0, 8.0, 8.0, 4.0, 8.0, 8.0, 4.0, 12.0, 8.0, 8.0,
        4.0, 2.0, 2.0, 4.0, 4.0, 2.0, 2.0, 4.0, 4.0, 2.0, 2.0, 4.0
    ]
    colsums1 = np.sum(matrix1, axis=0).tolist()[0]
    assert len(expected1) == len(colsums1)
    # Absolute tolerance only (rtol=0), matching the 8-decimal precision the
    # expected values are meant to be checked at
    np.testing.assert_allclose(colsums1, expected1, rtol=0, atol=1e-8)

    # Same check for the matrix produced by transform()
    expected2 = [
        8.0, 20.0, 20.0, 8.0, 20.0, 20.0, 8.0, 20.0, 20.0, 5.237574707711817,
        0.0, 0.0, 0.0, 3.9169357592886755, 0.0, 0.8909306965737043,
        18.69384731218667, 0.0, 3.095010602221718, 0.0, 0.0, 5.237574707711817,
        0.0, 0.0, 0.0, 3.9169357592886755, 0.0, 0.8909306965737043,
        18.69384731218667, 0.0, 3.095010602221718, 0.0, 0.0, 5.237574707711817,
        0.0, 0.0, 0.0, 3.9169357592886755, 0.0, 0.8909306965737043,
        18.69384731218667, 0.0, 3.095010602221718, 0.0, 0.0, 8.449489742783179,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.4494897427831783, 0.0,
        2.4494897427831783, 0.0, 2.4494897427831783, 2.4494897427831783, 0.0,
        2.4494897427831783, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 12.0, 20.0,
        28.0, 8.0, 28.0, 0.0, 4.0, 0.0, 12.0, 20.0, 28.0, 8.0, 28.0, 0.0, 4.0,
        0.0, 12.0, 20.0, 28.0, 8.0, 28.0, 4.0, 10.0, 2.0, 4.0, 4.0, 10.0, 2.0,
        4.0, 4.0, 10.0, 2.0, 4.0
    ]
    colsums2 = np.sum(matrix2, axis=0).tolist()[0]
    assert len(expected2) == len(colsums2)
    np.testing.assert_allclose(colsums2, expected2, rtol=0, atol=1e-8)
			doc = kindred.Document(sentence["sentence"],metadata=metadata)
			corpus.addDocument(doc)

		#if i > 100:
		#	break
	print("%s : corpus loaded" % now())

	parser = kindred.Parser()
	parser.parse(corpus)
	print("%s : parsed" % now())

	ner = kindred.EntityRecognizer(lookup=termLookup,detectVariants=False,detectFusionGenes=False,detectMicroRNA=False,acronymDetectionForAmbiguity=True,mergeTerms=True,removePathways=True)
	ner.annotate(corpus)
	print("%s : ner" % now())

	candidateBuilder = kindred.CandidateBuilder(entityCount=2,acceptedEntityTypes=[('keyword','geneOrProtein')])
	unfilteredCandidateRelations = candidateBuilder.build(corpus)
	print("%s : candidateBuilder (%d)" % (now(),len(unfilteredCandidateRelations)))

	entityIDToDoc = {}
	for doc in corpus.documents:
		for entity in doc.entities:
			entityIDToDoc[entity.entityID] = doc

	candidateRelations,metadata = [],[]
	for cr in unfilteredCandidateRelations:
		entityIDToEntity = { entity.entityID:entity for entity,tokenIndices in cr.sentence.entityAnnotations }
		entityIDToTokenLocs = { entity.entityID:tokenIndices for entity,tokenIndices in cr.sentence.entityAnnotations }
		keyword,geneOrProtein = [ entityIDToEntity[entityID] for entityID in cr.entityIDs ]
		keywordLoc,geneOrProteinLoc = [ entityIDToTokenLocs[entityID] for entityID in cr.entityIDs ]
Exemple #20
0
    sentenceCorpus = corpus.splitIntoSentences()

    print("Looking for measurement words, e.g. voltage")
    wordlist = {
        ('voltage', ): {('measurement', 'voltage')},
        ('current', ): {('measurement', 'current')}
    }

    entityRecognizer = kindred.EntityRecognizer(wordlist)
    entityRecognizer.annotate(sentenceCorpus)

    print("Looking for numeric values")
    quantityRecognizer = QuantityRecognizer()
    quantityRecognizer.annotate(sentenceCorpus)

    print("Find every pair of a measurement word and a value")
    candidateBuilder = kindred.CandidateBuilder(
        acceptedEntityTypes=[('measurement', 'quantity')])
    candidateRelations = candidateBuilder.build(sentenceCorpus)

    print("Let's annotate a few")
    withRelations, noRelations = kindred.manuallyAnnotate(
        sentenceCorpus, candidateRelations)

    outDir = 'numericalAnnotations'
    if not os.path.isdir(outDir):
        os.makedirs(outDir)

    print("Saving results to directory...")
    kindred.save(withRelations, 'standoff', outDir)
Exemple #21
0
    assert len(
        wordlistDict
    ) == 2, "This annotation tool currently only handles two entity relations of different types"

    wordlistLookup = kindred.EntityRecognizer.loadWordlists(wordlistDict,
                                                            idColumn=0,
                                                            termsColumn=0)

    print("Annotating entities in corpus with wordlists")
    entityRecognizer = kindred.EntityRecognizer(wordlistLookup)
    entityRecognizer.annotate(sentenceCorpus)

    print("Finding all candidate relations")
    acceptedEntityTypes = wordlistDict
    candidateBuilder = kindred.CandidateBuilder(
        entityCount=len(wordlistDict),
        acceptedEntityTypes=[tuple(sorted(wordlistDict.keys()))])
    candidateRelations = candidateBuilder.build(sentenceCorpus)

    print(
        "Time to through some of the candidate relations and annotate some...")
    annotatedCorpus, unannotatedCorpus = kindred.manuallyAnnotate(
        sentenceCorpus, candidateRelations)

    print(
        "\nSaving annotated corpus of %d sentences (with relations that you have just annotated)"
        % len(annotatedCorpus.documents))
    kindred.save(annotatedCorpus, 'standoff', annotatedDir)

    print(
        "Saving unannotated corpus of %d sentences (which you did not review)"
Exemple #22
0
def test_vectorizer_bigrams():
    """Check the "bigrams" feature of the Vectorizer with TF-IDF enabled.

    The vectorizer is fitted on candidates from the smaller generated corpus
    and applied to candidates from the larger one. Matrix shapes, feature
    names and per-column sums are compared to fixed values.

    Column sums are floating point, so they are compared with an absolute
    tolerance of 1e-8. Comparing round(x, 8) values instead can fail
    spuriously when two nearly identical numbers fall on either side of a
    rounding boundary.
    """
    corpus1, _ = generateTestData(positiveCount=5, negativeCount=5)
    corpus2, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder()
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    chosenFeatures = ["bigrams"]
    vectorizer = kindred.Vectorizer(featureChoice=chosenFeatures, tfidf=True)

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    assert matrix1.shape == (8, 27)
    assert matrix2.shape == (18, 27)

    colnames = vectorizer.getFeatureNames()
    expectedNames = [
        u'bigrams_ _gnorcyvmer', u'bigrams_a_common', u'bigrams_a_known',
        u'bigrams_be_treated', u'bigrams_bmzvpvwbpw_failed', u'bigrams_can_be',
        u'bigrams_clinical_trials', u'bigrams_common_treatment',
        u'bigrams_effect_of', u'bigrams_failed_clinical',
        u'bigrams_for_kyekjnkrfo', u'bigrams_for_zgwivlcmly',
        u'bigrams_gnorcyvmer_is', u'bigrams_is_a', u'bigrams_known_side',
        u'bigrams_kyekjnkrfo_.', u'bigrams_of_ruswdgzajr',
        u'bigrams_ootopaoxbg_can', u'bigrams_pehhjnlvvewbjccovflf_is',
        u'bigrams_ruswdgzajr_.', u'bigrams_side_effect',
        u'bigrams_treated_with', u'bigrams_treatment_for',
        u'bigrams_trials_for', u'bigrams_vgypkemhjr_.',
        u'bigrams_with_vgypkemhjr', u'bigrams_zgwivlcmly_.'
    ]
    assert colnames == expectedNames

    # As a quick check, we'll confirm that the column sums of the fitted
    # matrix are as expected
    expected1 = [
        0.6830902801437798, 0.7801302536256829, 0.6830902801437798,
        0.8164965809277259, 0.8164965809277259, 0.8164965809277259,
        0.8164965809277259, 0.7801302536256829, 0.6830902801437798,
        0.8164965809277259, 0.8164965809277259, 0.7801302536256829,
        0.6830902801437798, 1.1070563456981333, 0.6830902801437798,
        0.8164965809277259, 0.6830902801437798, 0.8164965809277259,
        0.7801302536256829, 0.6830902801437798, 0.6830902801437798,
        0.8164965809277259, 0.7801302536256829, 0.8164965809277259,
        0.8164965809277259, 0.8164965809277259, 0.7801302536256829
    ]
    colsums1 = np.sum(matrix1, axis=0).tolist()[0]
    assert len(expected1) == len(colsums1)
    # Absolute tolerance only (rtol=0), matching the 8-decimal precision the
    # expected values are meant to be checked at
    np.testing.assert_allclose(colsums1, expected1, rtol=0, atol=1e-8)

    # Same check for the matrix produced by transform()
    expected2 = [
        0.0, 1.0581526716744893, 0.0, 0.8164965809277259, 1.0,
        0.8164965809277259, 2.1547005383792515, 1.0581526716744893, 0.0,
        2.1547005383792515, 0.0, 0.0, 0.0, 0.8005865164268136, 0.0, 2.0, 0.0,
        0.8164965809277259, 2.0, 0.0, 0.0, 0.8164965809277259,
        1.0581526716744893, 2.1547005383792515, 0.8164965809277259,
        0.8164965809277259, 0.0
    ]
    colsums2 = np.sum(matrix2, axis=0).tolist()[0]
    assert len(expected2) == len(colsums2)
    np.testing.assert_allclose(colsums2, expected2, rtol=0, atol=1e-8)