def test_compare_equal_documents(self):
    """Documents built from the same JSON are equal whatever their session."""
    without_session = Document(session=None, **self.example_json)
    with_session = Document(session=self.session, **self.example_json)
    # The `session` object only holds authentication information, so it
    # must not take part in the equality of `Document` objects.
    self.assertEqual(without_session, with_session)
def test_getting_specific_property_returns_an_error(self, mocked_get):
    """`get_property` raises RuntimeError when the mocked GET answers 403."""
    mocked_get.return_value.status_code = 403

    bad_session = requests.Session()
    bad_session.auth = ('wrong_user', 'my_precious')
    document = Document(session=bad_session, **self.example_json)

    # Callable form of assertRaises: the property lookup itself must fail.
    self.assertRaises(RuntimeError, document.get_property, 'text')
def test_compare_documents_with_different_corpora(self):
    """Documents whose `corpus` values differ must not compare equal."""
    original = Document(session=None, **self.example_json)
    # Same payload as the fixture, with only the corpus swapped out.
    altered_json = dict(self.example_json,
                        corpus="http://pypln.example.com/corpora/2/")
    altered = Document(session=None, **altered_json)
    self.assertNotEqual(original, altered)
def test_compare_documents_with_different_owners(self):
    """Documents whose `owner` values differ must not compare equal."""
    original = Document(session=None, **self.example_json)
    # Same payload as the fixture, with only the owner swapped out.
    altered_json = dict(self.example_json, owner="user_2")
    altered = Document(session=None, **altered_json)
    self.assertNotEqual(original, altered)
def test_compare_documents_with_different_upload_dates(self):
    """Documents whose `uploaded_at` values differ must not compare equal."""
    original = Document(session=None, **self.example_json)
    # Same payload as the fixture, with only the upload date swapped out.
    altered_json = dict(self.example_json,
                        uploaded_at='2013-10-29T17:00:00.000Z')
    altered = Document(session=None, **altered_json)
    self.assertNotEqual(original, altered)
def test_compare_documents_with_different_sizes(self):
    """Documents whose `size` values differ must not compare equal."""
    original = Document(session=None, **self.example_json)
    # Same payload as the fixture, with only the size swapped out.
    altered_json = dict(self.example_json, size=1)
    altered = Document(session=None, **altered_json)
    self.assertNotEqual(original, altered)
def test_compare_documents_with_different_urls(self):
    """Documents whose `url` values differ must not compare equal."""
    original = Document(session=None, **self.example_json)
    # Same payload as the fixture, with only the url swapped out.
    altered_json = dict(self.example_json,
                        url='http://pypln.example2.com/documents/1/')
    altered = Document(session=None, **altered_json)
    self.assertNotEqual(original, altered)
def test_get_specific_property(self, mocked_get):
    """`get_property` returns the `value` served at the property's URL."""
    expected_text = "This is a test file with some test text."
    response = mocked_get.return_value
    response.status_code = 200
    response.json.return_value = {'value': expected_text}

    document = Document(session=self.session, **self.example_json)
    fetched = document.get_property('text')

    self.assertEqual(fetched, expected_text)
    # The request must target the properties URL followed by the name.
    mocked_get.assert_called_with(self.example_json['properties'] + 'text')
def test_instantiating_document_from_url_fails(self, mocked_get):
    """`Document.from_url` raises RuntimeError when the mocked GET answers 403."""
    mocked_get.return_value.status_code = 403
    mocked_get.return_value.json.return_value = self.example_json
    url = self.example_json['url']

    # The call is expected to raise, so its result is deliberately not bound
    # to a name (the original assigned it to an unused local).
    with self.assertRaises(RuntimeError):
        Document.from_url(url, ('wrong_user', 'my_precious'))
def test_instantiate_document_from_json(self):
    """A Document built from JSON exposes each field as an attribute."""
    document = Document(session=self.session, **self.example_json)

    for key, expected in self.example_json.items():
        # "properties" is checked separately below, via `properties_url`.
        if key == "properties":
            continue
        self.assertEqual(getattr(document, key), expected)

    self.assertIs(document.session, self.session)
    self.assertEqual(document.properties_url,
                     self.example_json['properties'])
def main():
    """Drain `pypln_temp`, moving finished PyPLN analyses into Mongo.

    The function keeps polling until `pypln_temp` is empty. For each pending
    entry the PyPLN document at its `pypln_url` is loaded, then:

    * if loading raises RuntimeError, the error is logged and the entry is
      left in `pypln_temp`, so it is retried on a later pass;
    * if the document lists an `_exception` property, the article is set to
      status 2 and the entry is removed;
    * if fewer than 29 properties are listed, the entry is timestamped the
      first time it is seen; once more than 5 minutes have elapsed since
      that timestamp, the article is set to status 2 and the entry removed;
    * otherwise every property is copied into `articles_analysis`, the
      article is set to status 1 and the entry is removed.

    NOTE(review): there is no pause between passes, so entries that keep
    failing are retried in a tight loop.
    """
    cursor = pypln_temp.find()
    while pypln_temp.count() > 0:
        for article in cursor:
            try:
                url = article['pypln_url']
                my_doc = Document.from_url(url, settings.PYPLN_CREDENTIALS)
                _id = article['articles_id']
                _id_temp = article['_id']
            except RuntimeError as e:
                logger.error(
                    "The document {} could not be found on the PyPLN collection: {}"
                    .format(url, e))
                continue

            # Read the property list once so every check below works on the
            # same snapshot instead of re-reading the attribute each time.
            properties = my_doc.properties

            if '_exception' in properties:
                logger.warning("PyPLN found an error {}".format(
                    article['pypln_url']))
                articles.update({'_id': _id}, {'$set': {'status': 2}})
                pypln_temp.remove({'_id': _id_temp})
                continue

            # 29 is presumably the number of properties a complete analysis
            # exposes -- TODO confirm against the PyPLN pipeline.
            if len(properties) < 29:
                if 'time' not in article:
                    # First sighting: record when we started waiting.
                    pypln_temp.update(
                        {'_id': _id_temp},
                        {'$set': {'time': datetime.datetime.now()}})
                    continue

                waited = datetime.datetime.now() - article['time']
                # total_seconds() covers the whole interval; `.seconds` is
                # only the sub-day component and wraps around every 24h.
                if waited.total_seconds() / 60 > 5:
                    logger.warning(
                        "PyPLN could not finish the analysis {}".format(
                            article['pypln_url']))
                    articles.update({'_id': _id}, {'$set': {'status': 2}})
                    pypln_temp.remove({'_id': _id_temp})
                continue

            # Analysis is complete: copy every property value over.
            analysis = {'articles_id': _id}
            for name in properties:
                analysis[name] = my_doc.get_property(name)
            articles_analysis.insert(analysis)
            articles.update({'_id': _id}, {'$set': {'status': 1}})
            pypln_temp.remove({'_id': _id_temp})

        # Start a fresh pass over whatever is still pending.
        cursor = pypln_temp.find()
    cursor.close()
def test_download_wordcloud(self, mocked_get):
    """`download_wordcloud` writes the decoded `wordcloud` value to a file.

    The mocked GET serves a base64-encoded payload; the test checks that the
    target file is opened with mode 'w', that the decoded payload is written
    to it, and that the `wordcloud` property URL was requested.
    """
    import sys

    png = "This is not really a png.\n".encode('ascii')
    encoded_png = base64.b64encode(png)
    mocked_get.return_value.status_code = 200
    mocked_get.return_value.json.return_value = {'value': encoded_png}
    document = Document(session=self.session, **self.example_json)

    # Compare the numeric major version: `sys.version` is a free-form
    # string, and comparing it with '<' is lexicographic.
    if sys.version_info[0] < 3:
        builtins_module = '__builtin__'
    else:
        builtins_module = 'builtins'

    m = mock_open()
    with patch('{}.open'.format(builtins_module), m, create=True):
        document.download_wordcloud('test.png')

    m.assert_called_once_with('test.png', 'w')
    # Calling the mock again returns the shared file handle it handed out.
    handle = m()
    handle.write.assert_called_once_with(png.decode('ascii'))
    mocked_get.assert_called_with(
        self.example_json['properties'] + 'wordcloud')
def test_instantiate_document_from_url(self, mocked_get):
    """`Document.from_url` builds a Document from the JSON served at a URL."""
    response = mocked_get.return_value
    response.status_code = 200
    response.json.return_value = self.example_json
    document_url = self.example_json['url']

    document = Document.from_url(document_url, self.auth)

    mocked_get.assert_called_with(document_url)
    self.assertIsInstance(document, Document)
    for key, expected in self.example_json.items():
        # "properties" is checked separately below, via `properties_url`.
        if key == "properties":
            continue
        self.assertEqual(getattr(document, key), expected)
    self.assertEqual(document.properties_url,
                     self.example_json['properties'])
    self.assertEqual(document.session.auth, self.auth)
def test_instantiate_document_from_url(self, mocked_get):
    """A Document fetched by URL mirrors the served JSON and keeps the auth."""
    mocked_response = mocked_get.return_value
    mocked_response.status_code = 200
    mocked_response.json.return_value = self.example_json
    target_url = self.example_json['url']

    fetched = Document.from_url(target_url, self.auth)

    mocked_get.assert_called_with(target_url)
    self.assertIsInstance(fetched, Document)
    # Every JSON field except "properties" must be exposed as an attribute;
    # "properties" is checked through `properties_url` instead.
    for field, value in self.example_json.items():
        if field == "properties":
            continue
        self.assertEqual(getattr(fetched, field), value)
    self.assertEqual(fetched.properties_url,
                     self.example_json['properties'])
    self.assertEqual(fetched.session.auth, self.auth)
def main():
    """Drain `pypln_temp`, moving finished PyPLN analyses into Mongo.

    The function keeps polling until `pypln_temp` is empty. For each pending
    entry the PyPLN document at its `pypln_url` is loaded, then:

    * if loading raises RuntimeError, the error is logged and the entry is
      left in `pypln_temp`, so it is retried on a later pass;
    * if the document lists an `_exception` property, the article is set to
      status 2 and the entry is removed;
    * if fewer than 29 properties are listed, the entry is timestamped the
      first time it is seen; once more than 5 minutes have elapsed since
      that timestamp, the article is set to status 2 and the entry removed;
    * otherwise every property is copied into `articles_analysis`, the
      article is set to status 1 and the entry is removed.

    NOTE(review): there is no pause between passes, so entries that keep
    failing are retried in a tight loop.
    """
    cursor = pypln_temp.find()
    while pypln_temp.count() > 0:
        for article in cursor:
            try:
                url = article['pypln_url']
                my_doc = Document.from_url(url, settings.PYPLN_CREDENTIALS)
                _id = article['articles_id']
                _id_temp = article['_id']
            except RuntimeError as e:
                logger.error("The document {} could not be found on the PyPLN collection: {}".format(url, e))
                continue

            # Read the property list once so every check below works on the
            # same snapshot instead of re-reading the attribute each time.
            properties = my_doc.properties

            if '_exception' in properties:
                logger.warning("PyPLN found an error {}".format(article['pypln_url']))
                articles.update({'_id': _id}, {'$set': {'status': 2}})
                pypln_temp.remove({'_id': _id_temp})
                continue

            # 29 is presumably the number of properties a complete analysis
            # exposes -- TODO confirm against the PyPLN pipeline.
            if len(properties) < 29:
                if 'time' not in article:
                    # First sighting: record when we started waiting.
                    pypln_temp.update({'_id': _id_temp}, {'$set': {'time': datetime.datetime.now()}})
                    continue

                waited = datetime.datetime.now() - article['time']
                # total_seconds() covers the whole interval; `.seconds` is
                # only the sub-day component and wraps around every 24h.
                if waited.total_seconds() / 60 > 5:
                    logger.warning("PyPLN could not finish the analysis {}".format(article['pypln_url']))
                    articles.update({'_id': _id}, {'$set': {'status': 2}})
                    pypln_temp.remove({'_id': _id_temp})
                continue

            # Analysis is complete: copy every property value over.
            analysis = {'articles_id': _id}
            for name in properties:
                analysis[name] = my_doc.get_property(name)
            articles_analysis.insert(analysis)
            articles.update({'_id': _id}, {'$set': {'status': 1}})
            pypln_temp.remove({'_id': _id_temp})

        # Start a fresh pass over whatever is still pending.
        cursor = pypln_temp.find()
    cursor.close()
def test_properties_is_a_list_of_properties(self, mocked_get):
    """`document.properties` is a list of property names, not a URL.

    When accessing `document.properties` the user should get a list of
    properties, not a url for the resource.
    """
    property_names = [
        "mimetype",
        "freqdist",
        "average_sentence_repertoire",
        "language",
        "average_sentence_length",
        "sentences",
        "momentum_1",
        "pos",
        "momentum_3",
        "file_metadata",
        "tokens",
        "repertoire",
        "text",
        "tagset",
        "momentum_4",
        "momentum_2",
    ]
    properties_url = self.example_json['properties']

    # The mocked response lists one URL per property, each ending in '/'.
    response = mocked_get.return_value
    response.status_code = 200
    response.json.return_value = {
        'properties': [properties_url + name + '/'
                       for name in property_names],
    }

    document = Document(session=self.session, **self.example_json)

    self.assertEqual(document.properties, property_names)
    mocked_get.assert_called_with(properties_url)