コード例 #1
0
ファイル: pipelines.py プロジェクト: jacklcz/scrapy
class SingleMongodbPipeline(object):
    u"""
    Scrapy item pipeline that stores ``GoodsItem`` instances in MongoDB.

    One ``MongoClient`` is opened when the spider starts and closed when it
    finishes.  The class-level ``MONGODB_*`` values are only defaults;
    ``open_spider`` overwrites them with the ``SingleMONGODB_*`` settings.
    """
    MONGODB_SERVER = "localhost"
    MONGODB_PORT = 27017
    MONGODB_DB = "books_fs"

    def __init__(self):
        u"""
        Start without a connection; the client is created in ``open_spider``.

        Original author's note: the only async framework that PyMongo fully
        supports is Gevent, and there is no great way to use PyMongo with
        Tornado or Twisted.  PyMongo provides built-in connection pooling, so
        some of the benefits of those frameworks can be achieved by sharing
        one ``MongoClient`` between threads.
        """
        self.client = None
        self.db = None

    def open_spider(self, spider):
        u'''
        Called when the spider is opened: connect to MongoDB.

        Connection errors are logged and their traceback printed, but they
        are not re-raised, so ``self.client`` / ``self.db`` may be left unset.
        '''
        self.MONGODB_SERVER = SingleMONGODB_SERVER
        self.MONGODB_PORT = SingleMONGODB_PORT
        self.MONGODB_DB = SingleMONGODB_DB

        try:
            self.client = MongoClient(self.MONGODB_SERVER, self.MONGODB_PORT)
            self.db = self.client[self.MONGODB_DB]
        except Exception as e:
            # NOTE(review): assumes ``log`` is ``scrapy.log``, where ``ERROR``
            # is a level constant rather than a callable, so the message is
            # sent through ``log.msg`` instead of calling ``log.ERROR``.
            log.msg("ERROR(SingleMongodbPipeline): %s" % (str(e),),
                    level=log.ERROR, spider=spider)
            traceback.print_exc()

    def process_item(self, item, spider):
        u"""
        Insert ``GoodsItem`` instances into the ``goodsitems`` collection.

        Items of any other type are passed through untouched.  The item is
        always returned so that later pipelines still receive it.
        """
        if not isinstance(item, GoodsItem):
            return item

        self.db['goodsitems'].insert(dict(item))
        return item

    def close_spider(self, spider):
        u'''
        Called when the spider is closed: release the MongoDB client, if any.
        '''
        if self.client:
            self.client.close()
コード例 #2
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--last_id", type=int, default=1)
    parser.add_argument("--index", type=str, default='sunrise3')
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--cached", action="store_true")
    parser.add_argument("--stathat", action="store_true")

    args = parser.parse_args()
    print args

    db = MongoClient()['test']
    es = Elasticsearch()
    stats = StatHat('hq08Ng2ujA8o3VPe')

    lastfm_url = "http://ws.audioscrobbler.com/2.0/?api_key=048cd62229f507d6d577393a6d7ac972&format=json"
    factory = Factory(db, lastfm_url)
    factory.cached = args.cached

    last_id = args.last_id

    while True:
        where = {'_id': {'$gt': last_id}, 'end_ts': {'$gt': 0}}
        #where = {'_id': {'$gt': last_id}}
        if not args.force:
            where['pub_ts'] = 0

        print where

        oid = last_id
        for air in db.air.find(where).sort('ts').limit(100):
            oid = air['_id']
            audio = factory.build_audio_from_air(air)
            es.index(index=args.index, doc_type='audio', id=oid, body=audio)
            if not args.force:
                db.air.update({'_id': oid},
                              {'$set': {
                                  'pub_ts': int(time.time())
                              }})
            print '---' * 10, oid
            pp(audio)

            if args.stathat:
                stats.count('index.audio', 1)
                if audio.get('is_track'):
                    stats.count('index.track', 1)

        if oid == last_id:
            if args.force:
                continue
            else:
                print 'wait for new tracks...'
                time.sleep(10)
        else:
            last_id = oid
コード例 #3
0
 def __init__(self):
     """
         The only async framework that PyMongo fully supports is Gevent.
         
         Currently there is no great way to use PyMongo in conjunction with Tornado or Twisted. PyMongo provides built-in connection pooling, so some of the benefits of those frameworks can be achieved just by writing multi-threaded code that shares a MongoClient.
     """
     
     self.style = color.color_style()
     try:
         client = MongoClient(self.MONGODB_SERVER,self.MONGODB_PORT) 
         self.db = client[self.MONGODB_DB]
     except Exception as e:
         print self.style.ERROR("ERROR(ShardMongodbPipeline): %s"%(str(e),))
         traceback.print_exc()
コード例 #4
0
ファイル: pipelines.py プロジェクト: jacklcz/scrapy
    def open_spider(self, spider):
        u'''
        Called when the spider is opened: connect to MongoDB.

        The ``SingleMONGODB_*`` settings are copied onto the instance and a
        ``MongoClient`` is created from them.  Connection errors are logged
        and their traceback printed, but they are not re-raised, so
        ``self.client`` / ``self.db`` may be left unset.
        '''
        self.MONGODB_SERVER = SingleMONGODB_SERVER
        self.MONGODB_PORT = SingleMONGODB_PORT
        self.MONGODB_DB = SingleMONGODB_DB

        try:
            self.client = MongoClient(self.MONGODB_SERVER, self.MONGODB_PORT)
            self.db = self.client[self.MONGODB_DB]
        except Exception as e:
            # NOTE(review): assumes ``log`` is ``scrapy.log``, where ``ERROR``
            # is a level constant rather than a callable, so the message is
            # sent through ``log.msg`` instead of calling ``log.ERROR``.
            log.msg("ERROR(SingleMongodbPipeline): %s" % (str(e),),
                    level=log.ERROR, spider=spider)
            traceback.print_exc()
コード例 #5
0
ファイル: mongodb.py プロジェクト: zixie1991/fetchman
 def __init__(self, host='localhost', port=27017, db='taskdb', user='', password=None, coll='task', pool=100):
     """Connect to MongoDB and keep handles to one database and collection.

     ``self.db`` is the database named ``db`` and ``self.coll`` the
     collection named ``coll`` inside it.  Authentication is attempted only
     when ``user`` is non-empty.  Any exception is printed and swallowed, in
     which case the attributes may be left unset.

     Original author's note: the only async framework that PyMongo fully
     supports is Gevent; PyMongo's built-in connection pooling is relied on
     instead by sharing a MongoClient between threads.
     """
     try:
         connection = MongoClient(host, port, max_pool_size=pool)
         if user:
             # NOTE(review): ``the_database`` looks like a placeholder name;
             # presumably ``source=db`` selects the database the credentials
             # are checked against -- confirm.
             connection.the_database.authenticate(user, password, source=db)
         database = connection[db]
         self.db = database
         self.coll = self.db[coll]
     except Exception as err:
         print('connect to mongodb error.', err)
コード例 #6
0
  'book_detail':\
   {\
   (('book_name',ASCENDING),('author',ASCENDING)):{'name':'book_name_author','unique':True},
   'book_name':{'name':'book_name'},
   'author':{'name':'author'},
   'alias_name':{'name':'alias_name'},
   }\
}


def drop_database(name_or_database):
    """Drop ``name_or_database`` through the module-level ``client``.

    Nothing happens when the argument is falsy or the client is falsy.
    """
    if not (name_or_database and client):
        return
    client.drop_database(name_or_database)


def create_index():
    """
    Create every index described by ``INDEX`` inside ``DATABASE_NAME``.

    ``INDEX`` maps a collection name to a mapping of ``key -> options``.
    A key is either a single field name or a tuple of ``(field, direction)``
    pairs describing a compound index; the options dict is passed to
    ``ensure_index`` as keyword arguments.
    """
    for collection_name, indexes in INDEX.items():
        collection = client[DATABASE_NAME][collection_name]
        for key, kwargs in indexes.items():
            # Compound keys are stored as tuples so they can be dict keys;
            # they are handed to ``ensure_index`` as a list of pairs.
            collection.ensure_index(
                list(key) if isinstance(key, tuple) else key, **kwargs)


if __name__ == '__main__':
    # Bind the module-level ``client`` that drop_database() and
    # create_index() read.
    client = MongoClient(DATABASE_HOST, DATABASE_PORT)
    # Destructive: the whole database is dropped before its indexes are
    # created again.
    drop_database(DATABASE_NAME)
    create_index()
コード例 #7
0
#!/usr/bin/env python
from pymongo.connection import MongoClient
import csv
import json
""" Script para re-importar los posts a la base de datos """

reader = csv.DictReader(open('../data/redit.csv'),
                        fieldnames=('image_id', 'unixtime', 'rawtime', 'title',
                                    'total_votes', 'reddit_id',
                                    'number_of_upvotes', 'subreddit',
                                    'number_of_downvotes', 'localtime',
                                    'score', 'number_of_comments', 'username'))

conn = MongoClient()
db = conn.reddit
print "Cleaning DB collections %s.%s" % ("reddit", "posts")
db.posts.remove()

for it in reader:
    try:
        it['total_votes'] = int(it['total_votes'])
        it['number_of_upvotes'] = int(it['number_of_upvotes'])
        it['number_of_downvotes'] = int(it['number_of_downvotes'])
        it['score'] = int(it['score'])
        it['number_of_comments'] = int(it['number_of_comments'])
        db.posts.insert(it)
    except Exception as e:
        print e, "while inserting", it

print "Inserted %d records" % db.posts.count()
assert db.posts.count(
コード例 #8
0
 def setUp(self):
     """Connect to the ``reddit`` database and expose a query helper.

     ``self.ej_result(res_name, mask)`` calls ``find(mask)`` on the
     collection named ``res_name`` and returns whatever ``find`` returns.
     """
     db = MongoClient().reddit

     def ej_result(res_name, mask):
         return getattr(db, res_name).find(mask)

     self.ej_result = ej_result
コード例 #9
0
 def __init__(self):
     """Open the ``settings.MONGO_DB`` database on ``settings.MONGO_URI``."""
     self._db = MongoClient(settings.MONGO_URI)[settings.MONGO_DB]
コード例 #10
0
    def setUp(self):
        """
        Prepare a scratch MongoDB database for a test.

        Three ``simplemongo.Document`` subclasses are declared against the
        ``_simplemongo_testsuite`` database and exposed on the test case,
        then seed documents are written directly through pymongo.

        """
        self.mongo = MongoClient()
        self.database = self.mongo['_simplemongo_testsuite']
        database = self.database

        class TestDocument(simplemongo.Document):
            __database__ = database
            index_by = [{
                'fields': [('slug', pymongo.ASCENDING)],
                'options': {'unique': True, 'sparse': True},
            }]

        class NamedCollectionDocument(simplemongo.Document):
            __database__ = database
            __collection__ = 'namedcol'
            slug_field = 'slugfield'
            slug_source_fields = 'title'

        class StructuredDocument(simplemongo.Document):
            __database__ = database
            __attribute_access__ = True
            structure = {
                '_id': unicode,
                'text_field': unicode,
                'int_field': int,
                'float_field': float,
                'bool_field': bool,
                'test_doc': TestDocument,
            }

        self.testdoc = TestDocument
        self.namedcoldoc = NamedCollectionDocument
        self.structureddoc = StructuredDocument

        # Seed documents covering slug lookups and str / int ids.
        testcol = database['TestDocument']
        seed_documents = (
            {'slug': 'test-document-instance',
             'title': 'Test Document Instance'},
            {'slug': 'test-document-instance-2',
             'title': 'Test Document Instance 2'},
            {'_id': 'ManualId'},
            {'_id': 500},
        )
        for seed in seed_documents:
            testcol.insert(seed)

        # An explicit ObjectId so tests can look the document up again.
        self.testobjectid = ObjectId()
        testcol.insert({'_id': self.testobjectid})

        database['namedcol'].insert({'slugfield': 'named-slug-field'})

        database['StructuredDocument'].insert({
            '_id': '1234',
            'text_field': u'A string',
            'int_field': 124,
            'float_field': 124.01,
            'bool_field': True,
        })
コード例 #11
0
class SimpleMongoTestCase(unittest.TestCase):
    """
    Tests for ``simplemongo`` run against a default ``MongoClient()``.

    Every test gets a freshly seeded ``_simplemongo_testsuite`` database,
    which is dropped again in ``tearDown``.
    """

    def setUp(self):
        """
        Setup new MongoDB database, subclasses of simplemongo documents and
        store some test data.

        """
        self.mongo = MongoClient()
        self.database = self.mongo['_simplemongo_testsuite']

        class TestDocument(simplemongo.Document):
            __database__ = self.database
            index_by = [
                {
                    'fields': [
                        ('slug', pymongo.ASCENDING),
                    ],
                    'options': {
                        'unique': True,
                        'sparse': True,
                    },
                }
            ]

        class NamedCollectionDocument(simplemongo.Document):
            __database__ = self.database
            __collection__ = 'namedcol'
            slug_field = 'slugfield'
            slug_source_fields = 'title'

        class StructuredDocument(simplemongo.Document):
            __database__ = self.database
            __attribute_access__ = True
            structure = {
                '_id': unicode,
                'text_field': unicode,
                'int_field': int,
                'float_field': float,
                'bool_field': bool,
                'test_doc': TestDocument,
            }

        self.testdoc = TestDocument
        self.namedcoldoc = NamedCollectionDocument
        self.structureddoc = StructuredDocument

        # Seed data is written through pymongo directly, bypassing simplemongo.
        testcol = self.database['TestDocument']
        testcol.insert({
            'slug': 'test-document-instance',
            'title': 'Test Document Instance',
        })
        testcol.insert({
            'slug': 'test-document-instance-2',
            'title': 'Test Document Instance 2',
        })
        testcol.insert({
            '_id': 'ManualId',
        })
        testcol.insert({
            '_id': 500,
        })
        self.testobjectid = ObjectId()
        testcol.insert({
            '_id': self.testobjectid,
        })

        namedcol = self.database['namedcol']
        namedcol.insert({
            'slugfield': 'named-slug-field',
        })

        structuredcol = self.database['StructuredDocument']
        structuredcol.insert({
            '_id': '1234',
            'text_field': u'A string',
            'int_field': 124,
            'float_field': 124.01,
            'bool_field': True,
        })

    def tearDown(self):
        """ Drop the test database and close the client opened in setUp """
        try:
            self.mongo.drop_database(self.database)
        finally:
            # A new MongoClient is created for every test; close it so the
            # connections do not pile up over the run.
            self.mongo.close()

    def test_collectionproxy(self):
        """
        can we access the pymongo collection via our metaclass setup? do
        accessess to methods not defined in a Document get passed up to the
        pymongo collection?

        """
        self.assertIsInstance(
            self.testdoc.collection, pymongo.collection.Collection)
        self.assertEqual(
            self.testdoc.insert, self.database['TestDocument'].insert)

    def test_collectionnaming(self):
        """ does automatic and manual collection naming work? """
        self.assertEqual('TestDocument', self.testdoc.collection.name)
        self.assertEqual('namedcol', self.namedcoldoc.collection.name)

    def test_find(self):
        """ does a find return instances of our TestDocument? """
        for doc in self.testdoc.find():
            self.assertIsInstance(doc, self.testdoc)

    def test_find_one(self):
        """ can we find one of our test objects, is it a TestDocument? """
        doc = self.testdoc.find_one({'slug': 'test-document-instance'})
        self.assertIsInstance(doc, self.testdoc)

    def test_create_indexes(self):
        """ TODO: can we create indexes, idempotently? """
        pass

    def test_by_id(self):
        """ can we find our test objects by int, string and objectid _id? """
        for _id in (
                'ManualId', 500, self.testobjectid, str(self.testobjectid)):
            doc = self.testdoc.by_id(_id)
            self.assertIsInstance(doc, self.testdoc)
        # type coercion on defined structure test
        doc = self.structureddoc.by_id(1234)
        self.assertIsInstance(doc, self.structureddoc)

    def test_by_slug(self):
        """ can we retrieve a document by slug in any slugfield? """
        doc = self.testdoc.by_slug('test-document-instance')
        self.assertIsInstance(doc, self.testdoc)

        doc = self.namedcoldoc.by_slug('named-slug-field')
        self.assertIsInstance(doc, self.namedcoldoc)

    def test_collection_update(self):
        """ can we update objects on the collection level? """
        self.testdoc.update(
            {'_id': self.testobjectid},
            {'$set': {'updated': True}})
        doc = self.testdoc.by_id(self.testobjectid)
        self.assertTrue(doc.get('updated'))

    def test_slugify_function(self):
        """ can we slugify correctly? """
        self.assertEqual(
            'simple-test-case',
            simplemongo.slugify('Simple Test Case'))
        self.assertEqual(
            'gtiru-tala-hgar',
            simplemongo.slugify(u'Gætirðu talað hægar'))
        self.assertEqual(
            'some-simplepunctuation',
            simplemongo.slugify(u'some simple:punctuation££!@$'))

    def test_no_database_set(self):
        """
        does subclassing a Document raise NoDatabaseError when __database__
        is not a MongoClient Database instance?

        is NoDatabaseError raised by class methods of a Document without
        a database?

        is NoDatabaseError raised by instantiating a new instance of a
        Document without a set database?

        """
        with self.assertRaises(simplemongo.NoDatabaseSet):
            class BrokenDb(simplemongo.Document):
                __database__ = 123

        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document.find()

        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document.find_one()

        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document.create_indexes()

        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document()

    def test_getattr_exception(self):
        """
        does a Document class raise AttributeError when an attrib isn't found
        and there's no MongoClient collection to reach into?

        """
        with self.assertRaises(AttributeError):
            simplemongo.Document.non_existant_attribute

    def test_index_creation(self):
        """ does create_index work? """
        self.testdoc.create_indexes()
        self.assertIn('slug_1', self.testdoc.index_information())

    def test_save(self):
        """ can we save a new record? """
        new_doc = self.testdoc()
        new_doc.save()
        self.assertIsInstance(new_doc.get('_id'), ObjectId)

    def test_update_and_reload(self):
        """ can we update? will it raise an exception for an unsaved doc? """
        new_doc = self.testdoc()

        with self.assertRaises(simplemongo.DocumentNotSaved):
            new_doc.update()

        with self.assertRaises(simplemongo.DocumentNotSaved):
            new_doc.reload()

        new_doc['state'] = 'initial'
        new_doc.save()
        fetch_doc = self.testdoc.by_id(new_doc['_id'])
        self.assertEqual(new_doc['state'], fetch_doc['state'])

        # With reload_after=False the in-memory copy is expected to lag
        # behind what is stored.
        new_doc.update({'$set': {'state': 'inplace'}}, reload_after=False)
        fetch_doc = self.testdoc.by_id(new_doc['_id'])
        self.assertNotEqual(fetch_doc['state'], new_doc['state'])

        new_doc.reload()
        self.assertEqual(fetch_doc['state'], new_doc['state'])

        new_doc.update({'$set': {'state': 'reloaded'}})
        self.assertEqual(new_doc['state'], 'reloaded')

        new_doc['state'] = 'revised'
        new_doc.update()
        fetch_doc.reload()
        self.assertEqual(new_doc['state'], fetch_doc['state'])

    def test_dict_update(self):
        """ does the original dict update method still work? """
        new_doc = self.testdoc()
        new_doc.dict_update({'dictupdated': True})
        new_doc.save()

        fetch_doc = self.testdoc.by_id(new_doc['_id'])

        self.assertTrue(fetch_doc.get('dictupdated'))

    def test_delete(self):
        """ can we delete a doc? will it except if unsaved? """
        new_doc = self.testdoc()

        with self.assertRaises(simplemongo.DocumentNotSaved):
            new_doc.delete()

        new_doc.save()
        self.assertEqual(new_doc, self.testdoc.by_id(new_doc['_id']))

        new_doc.delete()
        self.assertIsNone(self.testdoc.by_id(new_doc['_id']))

    def test_id_property(self):
        """ does id work as a property? """
        new_doc = self.testdoc()

        self.assertIsNone(new_doc.id)

        new_doc.save()
        self.assertIsInstance(new_doc.id, ObjectId)

    def test_attribute_access(self):
        """ check attribute access works as expected """
        test_doc = self.testdoc()
        test_doc['test_field'] = True

        # TestDocument does not set __attribute_access__.
        with self.assertRaises(AttributeError):
            test_doc.test_field

        struct_doc = self.structureddoc()

        with self.assertRaises(AttributeError):
            struct_doc.test_field

        struct_doc['test_field'] = True
        struct_doc['bool_field'] = True

        self.assertEqual(struct_doc['test_field'], struct_doc.test_field)
        self.assertEqual(struct_doc['bool_field'], struct_doc.bool_field)

        test_doc.save()
        struct_doc['test_doc'] = test_doc['_id']
        struct_doc.save()

        self.assertEqual(struct_doc.test_doc, test_doc)

        test_doc_2 = self.testdoc()
        test_doc_2['another_test'] = True
        test_doc_2.save()

        struct_doc_2 = self.structureddoc()
        struct_doc_2['test_doc'] = test_doc_2['_id']
        struct_doc_2.save()

        self.assertEqual(struct_doc_2.test_doc, test_doc_2)

    def test_slugify_method(self):
        """ can we slugify documents? """
        no_slug_field_doc = self.testdoc()
        slug_field_doc = self.namedcoldoc()

        self.assertEqual(
            no_slug_field_doc.slugify(from_text='Test Doc'),
            'test-doc')

        self.assertEqual(
            no_slug_field_doc.slugify(from_text='Test Doc'),
            'test-doc-2')

        no_slug_field_doc['title'] = 'Test Doc'
        self.assertEqual(
            no_slug_field_doc.slugify('title'),
            'test-doc-3')

        with self.assertRaises(ValueError):
            no_slug_field_doc.slugify()

        slug_field_doc['title'] = 'My Test'
        self.assertEqual(
            slug_field_doc.slugify(),
            'my-test')

        no_slug_field_doc['author'] = 'Me'
        self.assertEqual(
            no_slug_field_doc.slugify(['title', 'author']),
            'test-doc-me')
コード例 #12
0
def get_mongo():
    """Return the module-wide ``MongoClient``, creating it on first use.

    While the module-level ``_mongo`` is falsy, a client is built from the
    ``mongo_host`` setting and stored there; later calls reuse it.
    """
    global _mongo
    if _mongo:
        return _mongo
    _mongo = MongoClient(get_settings("mongo_host"))
    return _mongo