Code example #1
def index_dump(collection_name,
               filename,
               profile,
               shards,
               skip,
               solr='http://localhost:8983/solr/'):
    """
    Indexes a Wikidata dump in a new Solr collection with the given name.
    """
    tagger = TaggerFactory(solr)
    indexing_profile = IndexingProfile.load(profile)
    try:
        tagger.create_collection(collection_name,
                                 num_shards=shards,
                                 configset=indexing_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    dump = WikidataDumpReader(filename)
    tagger.index_stream(collection_name,
                        dump,
                        indexing_profile,
                        batch_size=2000,
                        commit_time=10,
                        delete_excluded=False,
                        skip_docs=skip)
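Usage note: a rough invocation sketch, not taken from the project. The collection name, dump path and profile path below are placeholders; only the signature matches the function above.

# Hypothetical call; the collection name, dump path and profile path are placeholders.
index_dump('wikidata',
           'latest-all.json.bz2',
           'profiles/all_items_profile.json',
           shards=1,
           skip=0)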
Code example #2
File: languagemodel.py Project: ziodave/opentapioca
    @classmethod
    def train_from_dump(cls, filename):
        """
        Trains a bag of words language model from either a .txt
        file (in which case it is read as plain text) or a .json.bz2
        file (in which case it is read as a wikidata dump).
        """
        bow = BOWLanguageModel()
        if filename.endswith('.txt'):
            with open(filename, 'r') as f:
                for line in f:
                    bow.ingest_phrases([line.strip()])

        elif filename.endswith('.json.bz2'):
            with WikidataDumpReader(filename) as reader:
                for idx, item in enumerate(reader):
                    if idx % 10000 == 0:
                        print(idx)

                    enlabel = item.get('labels', {}).get('en', {}).get('value')
                    endesc = item.get('descriptions', {}).get('en',
                                                              {}).get('value')
                    if enlabel:
                        # Fetch aliases
                        enaliases = [
                            alias['value']
                            for alias in item.get('aliases', {}).get('en', [])
                        ]

                        bow.ingest_phrases(enaliases + [enlabel])
        else:
            raise ValueError(
                'invalid filename provided (must end in .txt or .json.bz2)')

        return bow
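Usage note: a minimal sketch of training and persisting the model, assuming BOWLanguageModel exposes a save() method mirroring the load() call used in the test fixtures further down this page; the file names are placeholders.

# Train a bag-of-words model from a Wikidata JSON dump (placeholder path)
# and persist it. save() is an assumption, mirroring the load() seen below.
bow = BOWLanguageModel.train_from_dump('latest-all.json.bz2')
bow.save('bow.pkl')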
Code example #3
    def test_read_dump(self):
        count = 0
        entity_ids = re.compile(r'[QPL]\d+')
        with WikidataDumpReader(self.dump_fname) as reader:
            for item in reader:
                count += 1
                assert entity_ids.match(item.get('id')) is not None
        assert count == 100
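Usage note: as this test suggests, the reader yields each entity as a plain dict in the Wikidata JSON format, so fields such as 'id' or 'labels' can be read directly (compare code example #2). A small sketch with a placeholder dump path:

# Print each entity id together with its English label, if any.
with WikidataDumpReader('sample_wikidata_items.json.bz2') as reader:
    for item in reader:
        label = item.get('labels', {}).get('en', {}).get('value')
        print(item['id'], label)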
Code example #4
def test_all_items_profile(testdir):
    profile_filename = os.path.join(testdir, 'data/all_items_profile.json')
    profile = IndexingProfile.load(profile_filename)
    type_matcher = TypeMatcherStub()
    dump_filename = os.path.join(testdir,
                                 'data/sample_wikidata_items.json.bz2')
    with WikidataDumpReader(dump_filename) as reader:
        for item in reader:
            assert profile.entity_to_document(item, type_matcher) is not None
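Usage note: the same pattern also works outside the test harness. A hedged sketch with placeholder paths; TypeMatcherStub is only a test double standing in for a real type matcher.

# Print the Solr document each entity would be indexed as under the profile.
profile = IndexingProfile.load('data/all_items_profile.json')
type_matcher = TypeMatcherStub()
with WikidataDumpReader('data/sample_wikidata_items.json.bz2') as reader:
    for item in reader:
        print(profile.entity_to_document(item, type_matcher))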
Code example #5
    def test_index_dump(self):
        try:
            self.tf.create_collection('wd_test_collection')
            dump = WikidataDumpReader(os.path.join(self.testdir, 'data/sample_wikidata_items.json.bz2'))
            self.tf.index_stream('wd_test_collection',
                                  dump,
                                  self.profile,
                                  batch_size=20,
                                  commit_time=2)
            resp = self.tag_sentence("I live in Vanuatu")
            self.assertEqual(['startOffset', 10, 'endOffset', 17, 'ids', ['Q686']], resp['tags'][0])
        finally:
            self.tf.delete_collection('wd_test_collection')
Code example #6
    def test_index_stream(self):
        try:
            self.tf.create_collection('wd_test_collection')
            # We use a dump reader but this was actually obtained from a stream!
            dump = WikidataDumpReader(os.path.join(self.testdir, 'data/short_stream.json.bz2'))
            self.tf.index_stream('wd_test_collection',
                                 dump,
                                 self.profile,
                                 batch_size=50,
                                 commit_time=2,
                                 delete_excluded=True)

            resp = self.tag_sentence("Yesterday I met Ryszard Adam Bobrowski.")
            self.assertEqual(['startOffset', 16, 'endOffset', 38, 'ids', ['Q24428424']], resp['tags'][0])
        finally:
            self.tf.delete_collection('wd_test_collection')
Code example #7
    @classmethod
    def setUpClass(cls):
        cls.testdir = os.path.dirname(os.path.abspath(__file__))

        # Load dummy bow
        bow_fname = os.path.join(cls.testdir, 'data/sample_bow.pkl')
        cls.bow = BOWLanguageModel()
        cls.bow.load(bow_fname)

        # Load dummy graph
        graph_fname = os.path.join(cls.testdir,
                                   'data/sample_wikidata_items.npz')
        pagerank_fname = os.path.join(cls.testdir,
                                      'data/sample_wikidata_items.pgrank.npy')
        cls.graph = WikidataGraph()
        cls.graph.load_from_matrix(graph_fname)
        cls.graph.load_pagerank(pagerank_fname)

        # Load dummy profile
        cls.profile = IndexingProfile.load(
            os.path.join(cls.testdir, 'data/all_items_profile.json'))

        # Setup solr index (TODO delete this) and tagger
        cls.tf = TaggerFactory()
        cls.collection_name = 'wd_test_collection'
        try:
            cls.tf.create_collection(cls.collection_name)
        except CollectionAlreadyExists:
            pass
        cls.tf.index_stream(
            cls.collection_name,
            WikidataDumpReader(
                os.path.join(cls.testdir,
                             'data/sample_wikidata_items.json.bz2')),
            cls.profile)
        cls.tagger = Tagger(cls.collection_name, cls.bow, cls.graph)

        # Load NIF dataset
        cls.nif = NIFCollection.load(
            os.path.join(cls.testdir, 'data/five-affiliations.ttl'))

        cls.classifier = SimpleTagClassifier(cls.tagger,
                                             max_similarity_distance=10,
                                             similarity_smoothing=2)
Code example #8
File: test_tagger.py Project: ziodave/opentapioca
    @classmethod
    def setUpClass(cls):
        super(TaggerTest, cls).setUpClass()
        testdir = os.path.dirname(os.path.abspath(__file__))

        # Load dummy bow
        bow_fname = os.path.join(testdir, 'data/sample_bow.pkl')
        cls.bow = BOWLanguageModel()
        cls.bow.load(bow_fname)

        # Load dummy graph
        graph_fname = os.path.join(testdir, 'data/sample_wikidata_items.npz')
        pagerank_fname = os.path.join(testdir,
                                      'data/sample_wikidata_items.pgrank.npy')
        cls.graph = WikidataGraph()
        cls.graph.load_from_matrix(graph_fname)
        cls.graph.load_pagerank(pagerank_fname)

        # Load indexing profile
        cls.profile = IndexingProfile.load(
            os.path.join(testdir, 'data/all_items_profile.json'))

        # Setup solr index
        cls.tf = TaggerFactory()
        cls.collection_name = 'wd_test_collection'
        try:
            cls.tf.delete_collection('wd_test_collection')
        except requests.exceptions.RequestException:
            pass
        cls.tf.create_collection(cls.collection_name)
        cls.tf.index_stream(
            'wd_test_collection',
            WikidataDumpReader(
                os.path.join(testdir, 'data/sample_wikidata_items.json.bz2')),
            cls.profile)

        cls.sut = Tagger(cls.collection_name, cls.bow, cls.graph)