def index_dump(collection_name, filename, profile, shards, skip, solr='http://localhost:8983/solr/'):
    """
    Indexes a Wikidata dump in a new Solr collection with the given name.

    :param collection_name: name of the Solr collection to create and fill
    :param filename: path to the Wikidata dump to read
    :param profile: path to the indexing profile to load
    :param shards: number of shards for the new collection
    :param skip: number of documents to skip at the start of the dump
    :param solr: base URL of the Solr instance
    """
    factory = TaggerFactory(solr)
    loaded_profile = IndexingProfile.load(profile)
    # Reuse the collection if a previous run already created it.
    try:
        factory.create_collection(
            collection_name,
            num_shards=shards,
            configset=loaded_profile.solrconfig)
    except CollectionAlreadyExists:
        pass
    reader = WikidataDumpReader(filename)
    factory.index_stream(
        collection_name,
        reader,
        loaded_profile,
        batch_size=2000,
        commit_time=10,
        delete_excluded=False,
        skip_docs=skip)
def train_from_dump(cls, filename):
    """
    Trains a bag of words language model from either a .txt file
    (in which case it is read as plain text) or a .json.bz2 file
    (in which case it is read as a wikidata dump).

    :param filename: path to the training data; must end in
        '.txt' or '.json.bz2'
    :returns: the trained language model instance
    :raises ValueError: if the filename has an unsupported extension
    """
    # Use cls() rather than a hard-coded class name so the alternate
    # constructor also works for subclasses.
    bow = cls()
    if filename.endswith('.txt'):
        # Plain text: one phrase per line.
        with open(filename, 'r') as f:
            for line in f:
                bow.ingest_phrases([line.strip()])
    elif filename.endswith('.json.bz2'):
        # Wikidata dump: ingest the English label and aliases of each item.
        with WikidataDumpReader(filename) as reader:
            for idx, item in enumerate(reader):
                if idx % 10000 == 0:
                    print(idx)  # progress indicator for long dumps
                enlabel = item.get('labels', {}).get('en', {}).get('value')
                # NOTE(review): the original also extracted the English
                # description here but never used it; that dead code was
                # removed.
                if enlabel:
                    # Fetch aliases
                    enaliases = [
                        alias['value']
                        for alias in item.get('aliases', {}).get('en', [])
                    ]
                    bow.ingest_phrases(enaliases + [enlabel])
    else:
        raise ValueError(
            'invalid filename provided (must end in .txt or .json.bz2)')
    return bow
def test_read_dump(self):
    """Every entity in the sample dump has a Q/P/L id, and there are 100 of them."""
    id_pattern = re.compile(r'[QPL]\d+')
    n_items = 0
    with WikidataDumpReader(self.dump_fname) as reader:
        for entity in reader:
            n_items += 1
            # Each document must carry a well-formed Wikidata entity id.
            assert id_pattern.match(entity.get('id')) is not None
    assert n_items == 100
def test_all_items_profile(testdir):
    """The 'all items' profile converts every sample entity to a document."""
    profile = IndexingProfile.load(
        os.path.join(testdir, 'data/all_items_profile.json'))
    type_matcher = TypeMatcherStub()
    dump_path = os.path.join(testdir, 'data/sample_wikidata_items.json.bz2')
    with WikidataDumpReader(dump_path) as reader:
        for entity in reader:
            # No entity should be filtered out by this profile.
            assert profile.entity_to_document(entity, type_matcher) is not None
def test_index_dump(self):
    """Indexing the sample dump makes its entities taggable in a sentence."""
    try:
        self.tf.create_collection('wd_test_collection')
        reader = WikidataDumpReader(
            os.path.join(self.testdir, 'data/sample_wikidata_items.json.bz2'))
        self.tf.index_stream(
            'wd_test_collection',
            reader,
            self.profile,
            batch_size=20,
            commit_time=2)
        response = self.tag_sentence("I live in Vanuatu")
        self.assertEqual(
            ['startOffset', 10, 'endOffset', 17, 'ids', ['Q686']],
            response['tags'][0])
    finally:
        # Always drop the collection, even if the assertions fail.
        self.tf.delete_collection('wd_test_collection')
def test_index_stream(self):
    """Indexing a streamed dump (with delete_excluded) makes entities taggable."""
    try:
        self.tf.create_collection('wd_test_collection')
        # We use a dump reader but this was actually obtained from a stream!
        reader = WikidataDumpReader(
            os.path.join(self.testdir, 'data/short_stream.json.bz2'))
        self.tf.index_stream(
            'wd_test_collection',
            reader,
            self.profile,
            batch_size=50,
            commit_time=2,
            delete_excluded=True)
        response = self.tag_sentence("Yesterday I met Ryszard Adam Bobrowski.")
        self.assertEqual(
            ['startOffset', 16, 'endOffset', 38, 'ids', ['Q24428424']],
            response['tags'][0])
    finally:
        # Always drop the collection, even if the assertions fail.
        self.tf.delete_collection('wd_test_collection')
def setUpClass(cls): cls.testdir = os.path.dirname(os.path.abspath(__file__)) # Load dummy bow bow_fname = os.path.join(cls.testdir, 'data/sample_bow.pkl') cls.bow = BOWLanguageModel() cls.bow.load(bow_fname) # Load dummy graph graph_fname = os.path.join(cls.testdir, 'data/sample_wikidata_items.npz') pagerank_fname = os.path.join(cls.testdir, 'data/sample_wikidata_items.pgrank.npy') cls.graph = WikidataGraph() cls.graph.load_from_matrix(graph_fname) cls.graph.load_pagerank(pagerank_fname) # Load dummy profile cls.profile = IndexingProfile.load( os.path.join(cls.testdir, 'data/all_items_profile.json')) # Setup solr index (TODO delete this) and tagger cls.tf = TaggerFactory() cls.collection_name = 'wd_test_collection' try: cls.tf.create_collection(cls.collection_name) except CollectionAlreadyExists: pass cls.tf.index_stream( cls.collection_name, WikidataDumpReader( os.path.join(cls.testdir, 'data/sample_wikidata_items.json.bz2')), cls.profile) cls.tagger = Tagger(cls.collection_name, cls.bow, cls.graph) # Load NIF dataset cls.nif = NIFCollection.load( os.path.join(cls.testdir, 'data/five-affiliations.ttl')) cls.classifier = SimpleTagClassifier(cls.tagger, max_similarity_distance=10, similarity_smoothing=2)
def setUpClass(cls):
    """
    One-time setup: loads the sample language model, pagerank graph and
    indexing profile, indexes the sample dump into a fresh Solr
    collection, and builds the Tagger under test (cls.sut).
    """
    # Fixed: the original called super(...).tearDownClass() here, which
    # runs the parent's teardown instead of its setup during setUpClass.
    super(TaggerTest, cls).setUpClass()
    testdir = os.path.dirname(os.path.abspath(__file__))
    # Load dummy bow
    bow_fname = os.path.join(testdir, 'data/sample_bow.pkl')
    cls.bow = BOWLanguageModel()
    cls.bow.load(bow_fname)
    # Load dummy graph
    graph_fname = os.path.join(testdir, 'data/sample_wikidata_items.npz')
    pagerank_fname = os.path.join(testdir, 'data/sample_wikidata_items.pgrank.npy')
    cls.graph = WikidataGraph()
    cls.graph.load_from_matrix(graph_fname)
    cls.graph.load_pagerank(pagerank_fname)
    # Load indexing profile
    cls.profile = IndexingProfile.load(
        os.path.join(testdir, 'data/all_items_profile.json'))
    # Setup solr index
    cls.tf = TaggerFactory()
    cls.collection_name = 'wd_test_collection'
    # Drop any leftover collection from a previous (possibly crashed) run;
    # the name is referenced via cls.collection_name for consistency.
    try:
        cls.tf.delete_collection(cls.collection_name)
    except requests.exceptions.RequestException:
        pass
    cls.tf.create_collection(cls.collection_name)
    cls.tf.index_stream(
        cls.collection_name,
        WikidataDumpReader(
            os.path.join(testdir, 'data/sample_wikidata_items.json.bz2')),
        cls.profile)
    cls.sut = Tagger(cls.collection_name, cls.bow, cls.graph)