def test_list_documents(self, mocked_get):
    """Check that `PyPLN.documents()` returns objects mirroring the API
    payload.

    `mocked_get` is configured to answer with status 200 and a paginated
    payload holding the two example documents (presumably it is a patched
    HTTP GET; the patch decorator is outside this view).
    """
    response = mocked_get.return_value
    response.status_code = 200
    response.json.return_value = {
        u'count': 2,
        u'next': None,
        u'previous': None,
        u'results': [self.example_document_1, self.example_document_2],
    }

    client = PyPLN(self.base_url, (self.user, self.password))
    listing = client.documents()
    mocked_get.assert_called_with(self.base_url + "/documents/")

    first_document = listing[0]
    second_document = listing[1]
    expected_and_retrieved = (
        (self.example_document_1, first_document),
        (self.example_document_2, second_document),
    )
    for expected, retrieved in expected_and_retrieved:
        for field, value in expected.items():
            # `properties` is a method on the `Document` class, so the
            # payload value is compared against `properties_url` instead.
            attribute = 'properties_url' if field == 'properties' else field
            self.assertEqual(value, getattr(retrieved, attribute))

    # Document objects should link the `session` object from PyPLN
    for retrieved in (first_document, second_document):
        self.assertIs(retrieved.session, client.session)
def test_create_corpus(self, mocked_post):
    """Check that `add_corpus` posts the corpus data and returns an object
    whose attributes match the payload answered by the API.

    `mocked_post` is configured to answer with status 201 and
    `self.example_corpus` as its JSON body.
    """
    response = mocked_post.return_value
    response.status_code = 201
    response.json.return_value = self.example_corpus

    client = PyPLN(self.base_url, (self.user, self.password))
    created = client.add_corpus(**self.corpus_data)

    expected_url = self.base_url + "/corpora/"
    mocked_post.assert_called_with(expected_url, data=self.corpus_data)

    for field, expected in self.example_corpus.items():
        self.assertEqual(expected, getattr(created, field))

    # Corpus objects should link the `session` object from PyPLN
    self.assertIs(created.session, client.session)
def test_list_corpora(self, mocked_get):
    """Check that `PyPLN.corpora()` returns objects mirroring the API
    payload.

    `mocked_get` is configured to answer with status 200 and a paginated
    payload whose only result is `self.example_corpus`.
    """
    response = mocked_get.return_value
    response.status_code = 200
    response.json.return_value = {
        u'count': 1,
        u'next': None,
        u'previous': None,
        u'results': [self.example_corpus],
    }

    client = PyPLN(self.base_url, (self.user, self.password))
    listing = client.corpora()
    mocked_get.assert_called_with(self.base_url + "/corpora/")

    for field, expected in self.example_corpus.items():
        self.assertEqual(expected, getattr(listing[0], field))

    # Corpus objects should link the `session` object from PyPLN
    self.assertIs(listing[0].session, client.session)
def test_corpus_creation_fails_if_wrong_auth(self, mocked_post):
    """`add_corpus` must raise `RuntimeError` when the API answers 403.

    `mocked_post` is configured to answer with status 403.
    """
    mocked_post.return_value.status_code = 403
    # Build the client outside the assertion block so that only
    # `add_corpus` can satisfy the expected `RuntimeError`; an error
    # raised by the constructor must not make this test pass.
    pypln = PyPLN(self.base_url, ('wrong_user', 'my_precious'))
    with self.assertRaises(RuntimeError):
        pypln.add_corpus(**self.corpus_data)
def _upload_batch(corpus, collection, pages):
    """Upload one batch of pages to `corpus` and flag them as uploaded.

    Each page's `text` is written (UTF-8 encoded) to a temporary file that
    is sent under the name `<title>.txt`. The temporary files are always
    closed afterwards, even when the upload fails. Pages are flagged as
    uploaded in `collection` only after `add_documents` returned.

    NOTE(review): closing the files right after `add_documents` assumes the
    call has finished reading them when it returns -- confirm against
    pypln.api.
    """
    temp_files, filenames = [], []
    try:
        for page in pages:
            temp_file = TemporaryFile()
            # Register the file before writing so it is closed even if the
            # write below fails.
            temp_files.append(temp_file)
            temp_file.write(page['text'].encode('utf-8'))
            temp_file.seek(0)
            filenames.append(u'{}.txt'.format(page['title']))
        corpus.add_documents(temp_files, filenames)
    finally:
        for temp_file in temp_files:
            temp_file.close()

    for page in pages:
        collection.update({'_id': page['_id']},
                          {'$set': {'uploaded': True}})


def main():
    """Upload pages stored in a MongoDB collection to a PyPLN corpus.

    Command-line arguments: the MongoDB location (`host:port/db/collection`),
    the PyPLN URL, username, password and corpus name, plus the optional
    `--pages-per-request` (default 10) and `--max-pages`.

    Pages matching `{'uploaded': False}` are sent in batches and flagged as
    uploaded afterwards; a progress line is rewritten on stdout after each
    batch. Exits with status 1 when the MongoDB argument does not match
    `regexp_mongodb`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('mongodb',
                        help='MongoDB server/db/collection (format: '
                             'host:port/db/collection)')
    parser.add_argument('pypln',
                        help='Main URL to PyPLN installation. Example: '
                             'http://demo.pypln.org/')
    parser.add_argument('username',
                        help='Username to log-in PyPLN installtion')
    parser.add_argument('password',
                        help='Password to log-in PyPLN installtion')
    parser.add_argument('corpus',
                        help='Name of the corpus to upload documents to '
                             "(if doesn't exists, will be created)")
    parser.add_argument('--pages-per-request',
                        help='Number of pages to send in a single request')
    parser.add_argument('--max-pages',
                        help='Maximum number of pages to send')
    args = parser.parse_args()

    if args.pages_per_request:
        pages_per_request = int(args.pages_per_request)
    else:
        pages_per_request = 10

    mongo_config = regexp_mongodb.findall(args.mongodb)
    if not mongo_config:
        sys.stdout.write('Error: "mongodb" should be in format '
                         'host:port/db/collection\n')
        # `sys.exit` rather than the bare `exit`, which is only injected by
        # the `site` module and is meant for interactive use.
        sys.exit(1)

    # Single-argument `print(...)` behaves the same as the print statement
    # under Python 2 and is also valid Python 3 syntax.
    print('Connecting to MongoDB...')
    mongo = dict(zip(('host', 'port', 'db', 'collection'), mongo_config[0]))
    connection = pymongo.Connection(host=mongo['host'],
                                    port=int(mongo['port']), safe=True)
    db = connection[mongo['db']]
    collection = db[mongo['collection']]

    print('Logging into PyPLN at {}...'.format(args.pypln))
    pypln = PyPLN(args.pypln)
    pypln.login(args.username, args.password)

    print('Selecting (or creating) corpus {}...'.format(args.corpus))
    wanted_name = args.corpus.lower()
    find = [corpus for corpus in pypln.corpora()
            if corpus.name.lower() == wanted_name]
    if not find:
        corpus = pypln.add_corpus(name=args.corpus,
                                  description='Portuguese Wikipedia')
    else:
        corpus = find[0]
    # fix a bug in pypln.api:
    # NOTE(review): this concatenation relies on the `pypln` argument ending
    # with a slash, as in the help example.
    corpus.url = '{}corpora/{}'.format(args.pypln, corpus.slug)

    print('Uploading...')
    query_filter = {'uploaded': False}
    total = float(collection.count())
    if args.max_pages:
        max_pages = int(args.max_pages)
    else:
        max_pages = total
    # The counter starts at the number of pages already flagged as uploaded,
    # so progress and the `max_pages` limit include earlier runs.
    counter = collection.find({'uploaded': True}).count()
    initial_counter = counter
    report = '\r {:07d} / {:07d} ({:5.2f}%), {:10.3f}s ({:9.3f}p/s). ETA: {}'
    start_time = time.time()

    cursor = collection.find(query_filter, timeout=False)
    try:
        for pages in partition(cursor, pages_per_request):
            _upload_batch(corpus, collection, pages)

            counter += len(pages)
            percentual = 100 * (counter / total)
            delta_time = time.time() - start_time
            # Guard both divisions: a zero elapsed time or a zero rate must
            # not abort the upload just to print a progress line.
            if delta_time:
                rate = (counter - initial_counter) / delta_time
            else:
                rate = 0.0
            if rate:
                eta = timedelta(seconds=(max_pages - counter) / rate)
            else:
                eta = 'unknown'
            sys.stdout.write(report.format(counter, int(total), percentual,
                                           delta_time, rate, eta))
            sys.stdout.flush()

            if max_pages and counter >= max_pages:
                break
        sys.stdout.write('\n')
    finally:
        # The cursor was opened with `timeout=False`, so it must be closed
        # explicitly, including when an upload fails.
        cursor.close()
def test_is_sending_pyplnapi_version_as_user_agent(self):
    """The session's `User-Agent` header must contain
    `pypln.api/<__version__>`."""
    client = PyPLN(self.base_url, (self.user, self.password))
    expected_product = 'pypln.api/{}'.format(__version__)
    user_agent = client.session.headers['User-Agent']
    self.assertIn(expected_product, user_agent)
def test_raise_an_error_if_auth_is_not_str_or_tuple(self):
    """Constructing `PyPLN` with an `auth` value that is neither a tuple
    (basic auth) nor a string (token auth) must raise `TypeError`."""
    invalid_auth = 1
    self.assertRaises(TypeError, PyPLN, self.base_url, invalid_auth)
def test_token_auth_is_correctly_set(self):
    """A string `auth` value must end up in the session's `Authorization`
    header as `Token <value>`."""
    token = 'ea92019a4bdf5d1c122c58b53de3e8d36fe9ae6a'
    client = PyPLN(self.base_url, token)
    authorization = client.session.headers['Authorization']
    expected_header = 'Token {}'.format(token)
    self.assertEqual(authorization, expected_header)
def test_basic_auth_is_correctly_set(self):
    """A `(user, password)` tuple passed as `auth` must be stored unchanged
    on `session.auth`."""
    user_and_password = (self.user, self.password)
    client = PyPLN(self.base_url, user_and_password)
    self.assertEqual(client.session.auth, user_and_password)
def test_listing_documents_fails_if_wrong_auth(self, mocked_get):
    """`documents()` must raise `RuntimeError` when the API answers 403.

    `mocked_get` is configured to answer with status 403.
    """
    mocked_get.return_value.status_code = 403
    client = PyPLN(self.base_url, ('wrong_user', 'my_precious'))
    with self.assertRaises(RuntimeError):
        client.documents()