def test_load_option_for_titles_only(self):
        """Check ``load(..., titles_only=True)`` against a two-entry bib file.

        ``codecs.open`` is patched to hand back a stub whose ``read()``
        returns the BibTeX text below. The test expects ``X`` to hold only
        the titles, ``y`` to be 1/0 for the entries with
        ``inserir = {true}``/``{false}``, and ``years`` to be integers.
        """
        bibtex_content = '''
@ARTICLE{I[2],
  inserir = {true},
  title = {abobrinha1},
  year = {2008},
  abstract = {abobrinha abstract},
}
@INPROCEEDINGS{E[2],
  inserir = {false},
  title = {umbrela},
  year = {2020},
  abstract = {another abstract},
}
        '''
        file_stub = Mock()
        file_stub.__enter__ = Mock(return_value=file_stub)
        # __exit__ must return a falsy value. A bare Mock() returns another
        # Mock, which is truthy, and a truthy __exit__ result makes the
        # ``with`` statement suppress any exception raised in its body,
        # hiding the real failure from the test report.
        file_stub.__exit__ = Mock(return_value=False)
        file_stub.read = Mock(return_value=bibtex_content)

        with patch.object(codecs, 'open', return_value=file_stub) as cod:
            X, y, years = load(['pepino.bib'], titles_only=True)

        cod.assert_called_once_with('pepino.bib', 'r', encoding='utf-8')
        self.assertEqual(X, ['abobrinha1', 'umbrela'])
        self.assertEqual(y, [1, 0])
        self.assertEqual(years, [2008, 2020])
    def test_load_2_bibfiles(self):
        bibtex_content_1 = '''
@ARTICLE{I[2],
  inserir = {true},
  title = {abobrinha1},
  year = {2008},
  abstract = {abobrinha abstract},
}
@INPROCEEDINGS{E[2],
  inserir = {false},
  title = {umbrela},
  year = {2020},
  abstract = {another abstract},
}
        '''
        file_stub_1 = Mock()
        file_stub_1.__enter__ = Mock(return_value=file_stub_1)
        file_stub_1.__exit__ = Mock()
        file_stub_1.read = Mock(return_value=bibtex_content_1)
        bibtex_content_2 = '''
@ARTICLE{I[2],
  inserir = {false},
  title = {uva},
  year = {2020},
  abstract = {contos da uva},
}
@INPROCEEDINGS{E[2],
  inserir = {false},
  title = {note},
  year = {2021},
  abstract = {not cool},
}
        '''
        file_stub_2 = Mock()
        file_stub_2.__enter__ = Mock(return_value=file_stub_2)
        file_stub_2.__exit__ = Mock()
        file_stub_2.read = Mock(return_value=bibtex_content_2)
        files = [file_stub_1, file_stub_2]

        with patch.object(codecs, 'open', side_effect=files) as cod:
            X, y, years = load(['pepino.bib', 'abacaxi.bib'])

        cod.assert_has_calls([
            call('pepino.bib', 'r', encoding='utf-8'),
            call('abacaxi.bib', 'r', encoding='utf-8')
        ])
        self.assertEqual(X, [
            'abobrinha1\nabobrinha abstract', 'umbrela\nanother abstract',
            'uva\ncontos da uva', 'note\nnot cool'
        ])
        self.assertEqual(y, [1, 0, 0, 0])
        self.assertEqual(years, [2008, 2020, 2020, 2021])
    def test_load_bibitem(self):
        bibtex_content = '''
@ARTICLE{I[1],
  inserir = {true},
  title = {abobrinha1},
  year = {2008},
  abstract = {abobrinha abstract},
}
        '''
        file_stub = Mock()
        file_stub.__enter__ = Mock(return_value=file_stub)
        file_stub.__exit__ = Mock()
        file_stub.read = Mock(return_value=bibtex_content)

        with patch.object(codecs, 'open', return_value=file_stub) as cod:
            X, y, years = load(['abobrinha.bib'])

        cod.assert_called_once_with('abobrinha.bib', 'r', encoding='utf-8')
        self.assertEqual(X, ['abobrinha1\nabobrinha abstract'])
        self.assertEqual(y, [1])
        self.assertEqual(years, [2008])
Ejemplo n.º 4
0
    print('sixth argument missing: padding sequence (for embeddings only!)')
    sys.exit(1)

# The extractor is the last of the seven sys.argv entries (script name
# included); bail out with a usage hint when it is absent.
if len(sys.argv) < 7:
    print(
        'seventh argument missing: extrator (tfidf,embeddings_glove,embeddings_se)'
    )
    sys.exit(1)

# Positional command-line arguments, in order; the first entry (the script
# name) is discarded.
_, theme, classifier_name, ngram_range, titles, maxlen, extractor = sys.argv
# Only the exact string 'true' switches the titles-only mode on.
titles = titles == 'true'
embedding_dim = 200
# The GloVe vectors are used when either the classifier or the extractor is
# 'embeddings_glove'; any other combination uses the SO vectors file.
if 'embeddings_glove' in (classifier_name, extractor):
    embedding_file = './embeddings/glove.6B.200d.txt'
else:
    embedding_file = './embeddings/SO_vectors_200.bin'

slr_files = get_slr_files(theme)
X, y, years = load(slr_files, titles_only=titles)
# Converted to numpy arrays so they can be indexed with index arrays.
X = np.array(X)
y = np.array(y)

kfold = YearsSplit(n_split=3, years=years)
# One list per metric, created empty here.
result = {'fscore': [], 'threashold': [], 'missed': [], 'excluded': []}
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    if classifier_name[:9] != 'embedding':
        classifier, classifier_params = get_classifier(classifier_name)
        extractor_class, selector_f, selector_params = get_extractor(
            extractor, ngram_range, embedding_file)
        classifier_params.update(selector_params)