Exemple #1
0
 def startTest(self, test):
     """Empty every user collection before each test so tests start clean.

     The 'system.indexes' collection is left alone; documents are removed
     but the collections themselves (and their indexes) are kept.
     """
     database = MongoClient(
         'localhost',
         settings.MONGO_DATABASE_PORT)[settings.MONGO_DATABASE_NAME]
     user_collections = (name for name in database.collection_names()
                         if name != 'system.indexes')
     for name in user_collections:
         getattr(database, name).remove({})
Exemple #2
0
def main():
    """Create the 'medb' collection and bulk-load it from ../data/medb.zip.

    Refuses to run (prints an error) when the collection already exists.
    Side effects: creates the collection, unzips and then deletes medb.txt.
    """
    db = MongoClient(host=MONGO_HOST, port=int(MONGO_PORT))[MONGO_DATABASE]

    if 'medb' not in db.collection_names():
        db.create_collection('medb')
        # NOTE(review): the index is ensured on db.messages, not db.medb —
        # looks like it should be db.medb.ensure_index('id'); confirm.
        db.messages.ensure_index('id')

        print(
            "OK: Collection 'medb' and indexes were created in '{0}' database".
            format(MONGO_DATABASE))
        print("OK: Now load medb into '{0}' database".format(MONGO_DATABASE))

        # unzip
        call(['unzip', '../data/medb.zip'])

        # Context manager guarantees the file handle is closed even if a
        # line fails to parse or an insert raises (original leaked the
        # handle on any exception between open() and close()).
        with open('medb.txt', 'r') as f:
            for line in f:
                # convert string into tuple
                t = literal_eval(line)
                db.medb.insert({'id': t[0], 'm': t[1], 'e': t[2], 'a': t[3]})

        remove('medb.txt')
        print('OK: MEDB loaded.')
    else:
        print("ERROR: Collection 'medb' already exists")
Exemple #3
0
class Db:
    """Thin wrapper around one MongoDB database connection.

    Note: all errors are reported by printing the exception, matching the
    module's existing best-effort error-handling style.
    """

    def __init__(self, host, port, dbname):
        try:
            self.mongodb_host = host
            self.mongodb_port = port
            self.db_name = dbname
            self.connection = MongoClient(
                self.mongodb_host, self.mongodb_port)[self.db_name]
        except Exception as e:
            print(e)

    def getCollectionsNames(self):
        """ Returns all collections available in the database """
        return self.connection.collection_names(
            include_system_collections=False)

    def build_collection(self, collection_name):
        """ Access a particular collection given its name """
        try:
            return self.connection[collection_name]
        except Exception as e:
            print(e)

    def initialize_collections(self):
        """ Function to initialize mongodb collections. Return a dictionary w/ each collection. """
        try:
            # One entry per user collection, keyed by collection name.
            return {
                name: self.build_collection(name)
                for name in self.getCollectionsNames()
            }
        except Exception as e:
            print(e)
Exemple #4
0
class Update():
    """Pushes question documents from a MongoDB scene database into a Solr core."""

    def __init__(self, ip, db_name):
        # NOTE(review): `ip` is accepted but ignored — the client always
        # connects to 127.0.0.1; confirm whether that is intentional.
        self.db_name = db_name
        self.db = MongoClient('127.0.0.1', 27017)[db_name]
        self.core_name = SOLR_CORE_NAME
        self.solr_url = 'http://127.0.0.1:8999/solr'
        self.solr = SOLR(self.solr_url)

    def check_solr_core(self):
        # Lazily create the Solr core if it does not exist yet.
        if not self.solr.solr_core_exists(self.core_name):
            self.solr.create_solr_core(self.core_name)

    def update_data(self, collection):
        """Re-index one Mongo collection into Solr.

        Deletes everything previously indexed for this scene/topic pair,
        then inserts one Solr document per question variant.
        """
        def insert(data):
            # `data` is the raw Mongo document; `data_one` is the mutable
            # copy that becomes the Solr document. Question lists are read
            # from `data` because they are popped off `data_one` first.
            if not data:
                return
            data_one = data.copy()
            data_one['_id'] = str(data_one['_id'])
            data_one['scene'] = self.db_name
            data_one['topic'] = collection
            if 'super_intention' in data_one:
                if data_one['super_intention'] == '':
                    data_one['super_intention'] = 'null'
            # Fan out: one Solr update per equivalent question phrasing.
            if 'equal_questions' in data_one:
                data_one.pop('equal_questions')
                for q in data['equal_questions']:
                    data_one['question'] = q
                    data_one['question_ik'] = q
                    data_one['question_cn'] = q
                    self.solr.update_solr(data_one, self.core_name)
            elif 'questions' in data_one:
                data_one.pop('questions')
                for q in data['questions']:
                    data_one['question'] = q
                    data_one['question_ik'] = q
                    data_one['question_cn'] = q
                    self.solr.update_solr(data_one, self.core_name)
            else:
                self.solr.update_solr(data_one, self.core_name)

        # Clear the previously indexed documents for this scene/topic before
        # re-inserting the current contents of the collection.
        self.solr.delete_solr_by_query(
            self.core_name,
            'scene_str:' + self.db_name + ' AND topic_str:' + collection)
        data = [x for x in self.db[collection].find()]
        for d in data:
            insert(d)

    def update(self):
        """Re-index every collection except 'log'. Returns 1 on success, 0 on failure."""
        try:
            collections = self.db.collection_names()
            if 'log' in collections:
                collections.remove('log')
            for collection in collections:
                print('start ' + collection)
                self.update_data(collection)
            return 1
        except Exception:
            traceback.print_exc()
            return 0
Exemple #5
0
def get_stores(request):
    """Returns a list of available store names.

    Keyword arguments:
    request -- Django HttpRequest object
    """
    db = MongoClient()['stores']
    # On Python 3, filter() returns a lazy iterator; materialize it so the
    # function actually returns the list its docstring promises.
    return [name for name in db.collection_names() if name != "system.indexes"]
Exemple #6
0
 def __init__(self):
     """Bind the Mongo collection and, if it does not exist yet, create a
     unique ascending index on 'TradingDay'."""
     database = MongoClient()[self.DATABASE_NAME]
     self.mongo_coll = database[self.COLLECTION_NAME]
     if self.COLLECTION_NAME not in database.collection_names():
         self.mongo_coll.create_index(
             [('TradingDay', pymongo.ASCENDING)], unique=True)
     # whether to replace all conflicted data
     self.replace_all: bool = False
Exemple #7
0
def get_stores(request):
    """Returns a list of available store names.

    Keyword arguments:
    request -- Django HttpRequest object
    """
    db = MongoClient()['stores']
    # filter() is a lazy iterator on Python 3; return a real list to match
    # the documented return type.
    return [s for s in db.collection_names() if s != "system.indexes"]
def setup_db():
    """ Creates a mongodb instance and shuts it down after testing has concluded. """

    db_handle = MongoClient(mongo_addr, mongo_port)[mongo_db_name]

    # Wipe any leftover collections so the test run starts from scratch.
    if db_handle.collection_names():
        db_handle.connection.drop_database(mongo_db_name)

    # Set debug client for mongo
    if api.common.external_client is None:
        api.common.external_client = db_handle

    return db_handle
Exemple #9
0
def setup_db():
    """ Creates a mongodb instance and shuts it down after testing has concluded. """

    client = MongoClient(mongo_addr, mongo_port)[mongo_db_name]

    # Drop everything left behind by a previous run.
    leftover = client.collection_names()
    if leftover:
        client.connection.drop_database(mongo_db_name)

    # Register this client as the debug client unless one is already set.
    if api.common.external_client is None:
        api.common.external_client = client

    return client
Exemple #10
0
class LocalDataManager:
    """Reads contacts from a local MongoDB phonebook and exports them to files."""

    def __init__(self):
        self.database_conn = MongoClient(document_class=OrderedDict).phonebook
        self.contacts = self.get_contacts()
        # Template for default export file names ('txt' / 'csv' filled in).
        self.file_name = 'phone_book.{}'

    def get_contacts(self):
        """Return contacts as lists of field values, without Mongo ids."""
        cursor = self.database_conn.contacts.find({}, {'_id': False})
        return [list(record.values()) for record in cursor]

    def save_txt(self, file_path=None):
        """Write contacts as space-separated lines to a .txt file."""
        target = file_path or self.file_name.format('txt')
        with open(target, 'w') as out:
            for record in self.contacts:
                out.write('{}\n'.format(' '.join(record)))
        return 'Successfully saved to .txt file.'

    def save_csv(self, file_path=None):
        """Write contacts, with a header row, to a semicolon-delimited .csv."""
        target = file_path or self.file_name.format('csv')
        with open(target, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=';')
            writer.writerow(['First name', 'Last name', 'Phone number'])
            writer.writerows(self.contacts)
        return 'Successfully saved to .csv file.'

    def gui_saver(self, query):
        """Dispatch a save based on the file extension of `query`."""
        if not query:
            return None
        _, extension = os.path.splitext(query)
        if extension == '.txt':
            return self.save_txt(query)
        if extension == '.csv':
            return self.save_csv(query)

    def check_database(self):
        """Seed the contacts collection with defaults when it does not exist."""
        base_data = (OrderedDict([("_id", 1), ('first_name', 'Vadim'),
                                  ('last_name', 'Kuznetsov'),
                                  ('phone_number', '0101')]),
                     OrderedDict([("_id", 2), ('first_name', 'Ivan'),
                                  ('last_name', 'Petrov'),
                                  ('phone_number', '102')]),
                     OrderedDict([("_id", 8), ('first_name', 'Petr'),
                                  ('last_name', 'Ivanovich'),
                                  ('phone_number', '102')]))
        if 'contacts' in self.database_conn.collection_names():
            return 'Database is successfully loaded.'
        self.database_conn.contacts.insert(base_data)
        return 'Database is successfully created.'
Exemple #11
0
def main():
    """Create the 'messages' collection and its supporting indexes.

    Prints an error and does nothing when the collection already exists.
    """
    db = MongoClient(host=MONGO_HOST, port=int(MONGO_PORT))[MONGO_DATABASE]

    if 'messages' in db.collection_names():
        print("ERROR: Collection 'messages' already exists")
        return

    db.create_collection('messages')
    db.messages.ensure_index('h')
    # TTL-collection for 31 days
    db.messages.ensure_index([('d', DESCENDING)],
                             expireAfterSeconds=60 * 60 * 24 * 31)
    for field in ('f', 'a', 'p'):
        db.messages.ensure_index(field)

    db.users.ensure_index('username', unique=True)
    db.charts.ensure_index('name', unique=True)
    db.cache.ensure_index('type', unique=True)
    print("OK: Collections and indexes were created in '{0}' database".format(MONGO_DATABASE))
Exemple #12
0
class DB_vectors():
    """Gathers every question sentence from the scene and common databases
    and mirrors them into a 'data_vectors' database with placeholder vectors."""

    def __init__(self, ip, port, db_name):
        self.db_name = db_name
        # Three logical databases on the same server: scene data, the
        # vector store, and data shared across scenes.
        self.data_db = MongoClient(ip, port)[db_name]
        self.vector_db = MongoClient(ip, port)['data_vectors']
        self.common_db = MongoClient(ip, port)['common']
        # De-duplicated set of every sentence to be vectorised.
        self.data = set()

    def load_data(self):
        """Collect question sentences from all relevant collections into self.data."""
        for collection in self.data_db.collection_names():
            if collection == 'refuse2chat':
                continue
            for x in self.data_db[collection].find({}, {'equal_questions':1}):
                for q in x['equal_questions']:
                    self.data.add(q)

        for x in self.common_db['interaction'].find({}, {'equal_questions':1}):
            for q in x['equal_questions']:
                self.data.add(q)
        for x in self.common_db['repeat_guest'].find():
            self.data.add(x['question'])
        for x in self.common_db['repeat_machine'].find():
            self.data.add(x['question'])

        # Dialogue questions are prefixed with their parent intention so the
        # same wording under different intentions stays distinct.
        for x in self.data_db['dialogue'].find():
            for q in x['equal_questions']:
                self.data.add(x['super_intention']+q)

    def write_data(self):
        """Rebuild the 'vectors' collection with zeroed placeholder vectors."""
        self.vector_db['vectors'].drop()
        for q in self.data:
            data = {}
            data['sentence'] = q
            #data['vector'] = [0,0,0]
            data['vector'] = [0,0,0]
            self.vector_db['vectors'].insert(data)
        # Index by sentence so get_vector lookups are fast.
        self.vector_db['vectors'].create_index('sentence')

    def get_vector(self, s):
        """Return the stored vector for sentence `s`, or None when absent."""
        try:
            result = self.vector_db['vectors'].find_one({'sentence':s})
            return result['vector']
        except Exception:
            return None

    # NOTE(review): this setUp looks like a unittest method from a different
    # class that was merged in here — `app` and `webapp` are not defined
    # anywhere in this class; confirm where it belongs.
    def setUp(self):
        self.app = app.test_client()
        self.app.TESTING = True

        # Inject test database into application
        db = MongoClient('localhost', 27017).TestDB

        # Drop collection (faster than dropping entire db)
        if 'FirstCollection' in db.collection_names():
            db.drop_collection('FirstCollection')
        webapp.db = db

        new_posters = [{
            "name": "James",
            "url": "my_url"
        }, {
            "name": "James",
            "url": "my_url"
        }]
        webapp.db.FirstCollection.insert_many(new_posters)
Exemple #14
0
def main():
    """Create the 'messages' collection and all supporting indexes in
    MONGO_DATABASE; refuse to run when the collection already exists."""
    db = MongoClient(host=MONGO_HOST, port=int(MONGO_PORT))[MONGO_DATABASE]

    if 'messages' in db.collection_names():
        print("ERROR: Collection 'messages' already exists")
        return

    db.create_collection('messages')
    db.messages.ensure_index('h')
    # TTL-collection for 31 days
    db.messages.ensure_index([('d', DESCENDING)],
                             expireAfterSeconds=60 * 60 * 24 * 31)
    for single_field in ('f', 'a', 'p'):
        db.messages.ensure_index(single_field)

    db.users.ensure_index('username', unique=True)
    db.charts.ensure_index('name', unique=True)
    db.cache.ensure_index('type', unique=True)
    print("OK: Collections and indexes were created in '{0}' database".
          format(MONGO_DATABASE))
Exemple #15
0
    def showCollection(MongoClient, dummyarg):
        """Print a summary (record count plus one sample document) of every
        collection in the given database.

        MongoClient -- the pymongo Database object to inspect (the parameter
                       name shadows the pymongo class; kept for compatibility)
        dummyarg -- unused placeholder argument
        """
        Collections = MongoClient.collection_names()
        print("Found {0} collection(s)".format(len(Collections)))
        if len(Collections) > 3:
            # Get a confirmation (y) from console
            flag = input(
                "Found more than one Collections in the database, input [y] if you want to show them all, otherwise I will exit....")
            if flag != 'y':
                print("{0} entered, bye".format(flag))
                raise SystemExit
        elif len(Collections) < 1:
            # BUG FIX: the format index was {1} with only one argument, so
            # this line raised IndexError instead of printing the message.
            print("No collections in {0}".format(MongoClient.full_name))

        # To traversal all collections in a db
        for my_collection in Collections:
            print("+--------------------------+")
            print("Print one record in collection [{0}]".format(MongoClient[my_collection].full_name))
            print("Total records in collection <{0}>".format(MongoClient[my_collection].count()))
            # print(dbclient[my_collection].explain())

            pprint.pprint(MongoClient[my_collection].find_one())
            print("+--------------------------+")
Exemple #16
0
    def test_reset(self):
        """
        Check if all normalized collections get dropped and that normalized is set to False in
        all hpfeed entries.
        """

        db = MongoClient('localhost', 27017)[self.dbname]

        #prepare and insert dummy values directly into the hpfeed collection
        insert_items = [
            {'channel': 'channel1', 'ident': 'ident1', 'payload': 'payload1', 'timestamp': datetime.utcnow(),
             'normalized': True},
            {'channel': 'channel2', 'ident': 'ident2', 'payload': 'payload2', 'timestamp': datetime.utcnow(),
             'normalized': True},
            {'channel': 'channel3', 'ident': 'ident3', 'payload': 'payload3', 'timestamp': datetime.utcnow(),
             'normalized': True, 'last_error': "Some error", 'last_error_timestamp': datetime.now()}
        ]

        for item in insert_items:
            db['hpfeed'].insert(item)
            #create a few dummy collection that we expect to get dropped
        db['somecollection1'].insert({'something': 'something'})
        db['somecollection2'].insert({'something': 'something'})

        sut = mnemodb.MnemoDB(self.dbname)
        #This is the function we are testing
        sut.reset_normalized()

        #has normalized collections been removed
        self.assertNotIn('somecollection1', db.collection_names())
        self.assertNotIn('somecollection2', db.collection_names())

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use the canonical assertEqual instead.
        #has all normalized been set to True
        self.assertEqual(0, db['hpfeed'].find({'normalized': True}).count())
        #has last_error attribute been removed
        self.assertEqual(0, db['hpfeed'].find({'last_error': {'$exists': 1}}).count())
        #has last_error_timestamp attribute been removed
        self.assertEqual(0, db['hpfeed'].find({'last_error_timestamp': {'$exists': 1}}).count())
Exemple #17
0
def main():
    """Create the 'medb' collection and bulk-load it from ../data/medb.zip.

    Prints an error and does nothing when the collection already exists.
    Side effects: creates the collection, unzips and then deletes medb.txt.
    """
    db = MongoClient(host=MONGO_HOST, port=int(MONGO_PORT))[MONGO_DATABASE]

    if 'medb' not in db.collection_names():
        db.create_collection('medb')
        # NOTE(review): the index is ensured on db.messages, not db.medb —
        # this looks like it should be db.medb.ensure_index('id'); confirm.
        db.messages.ensure_index('id')

        print("OK: Collection 'medb' and indexes were created in '{0}' database".format(MONGO_DATABASE))
        print("OK: Now load medb into '{0}' database".format(MONGO_DATABASE))

        # unzip
        call(['unzip', '../data/medb.zip'])

        # Context manager guarantees the handle is closed even when parsing
        # or an insert raises part-way through the file (original leaked it).
        with open('medb.txt', 'r') as f:
            for line in f:
                # convert string into tuple
                t = literal_eval(line)
                db.medb.insert({'id': t[0], 'm': t[1], 'e': t[2], 'a': t[3]})

        remove('medb.txt')
        print('OK: MEDB loaded.')
    else:
        print("ERROR: Collection 'medb' already exists")
Exemple #18
0
class MongoOP:
    """Convenience wrapper around a MongoDB video database.

    Documents are keyed by their 'url' field; an 'update_date' field marks
    records whose details have been fully fetched.
    """

    def __init__(self, mongo_uri='mongodb://localhost:27017/test',
                 collect_name='videos_update', old_collect_name='videos'):
        self.db = MongoClient(mongo_uri).get_default_database()
        self.collect_name = collect_name
        self.old_collect_name = old_collect_name

    def update_json_list(self, json_list, collect_name=None):
        """Upsert each document keyed on 'url'; drops 'videos_new' first."""
        collect = self.get_collection(collect_name)

        if collect_name == 'videos_new':
            print("new Videos drop")
            collect.drop()

        total = len(json_list)
        for idx, doc in enumerate(json_list):
            # Progress message every 100 documents (skipping the first).
            if idx % 100 == 0 and idx > 0:
                print("update into collect {} : {} / {}".format(
                    collect_name, idx, total))
            collect.update_one({'url': doc['url']}, {'$set': doc},
                               upsert=True)

    def delete_url(self, url, collect_name=None):
        """Delete the single document with the given url."""
        self.get_collection(collect_name).delete_one({'url': url})

    def info_is_exists(self, url, collect_name=None):
        """Return True when a document with this url already has a title."""
        collect = self.get_collection(collect_name)
        found = collect.find_one({'url': url, 'title': {'$exists': True}})
        return bool(found)

    def get_unfinished_url_list(self, collect_name=None):
        """Return [{'url': ...}] docs that still lack an update_date."""
        collect = self.get_collection(collect_name)
        cursor = collect.find({'update_date': {'$exists': False}},
                              {'url': 1, '_id': 0})
        return list(cursor)

    def get_all_url_set(self, collect_name):
        """Return the set of urls whose documents carry an update_date."""
        collect = self.get_collection(collect_name)
        cursor = collect.find({'update_date': {'$exists': True}},
                              {'url': 1, '_id': 0})
        return {doc['url'] for doc in cursor}

    def get_film_info_list(self, url_list, collect_name=None):
        """Return the full documents for every url in url_list."""
        collect = self.get_collection(collect_name)
        return list(collect.find({'url': {'$in': url_list}}))

    def get_collection(self, collect_name):
        """Resolve a collection, falling back to the default name."""
        name = self.collect_name if collect_name is None else collect_name
        return self.db[name]

    def get_url_update_date(self, url, collect_name=None):
        """Return {'update_date': ...} for the url, or None when unset."""
        collect = self.get_collection(collect_name)
        query = {'url': url, 'update_date': {'$exists': True}}
        return collect.find_one(query, {'update_date': 1, '_id': 0})

    def get_logs(self, collect_name='logs'):
        """Print every document in the logs collection."""
        for entry in self.db[collect_name].find():
            print(entry)

    def rename_collection(self, old_name, new_name, drop=False):
        """Rename old_name to new_name, optionally dropping an existing target."""
        if new_name in self.db.collection_names() and drop:
            self.drop_collection(new_name)
        self.db[old_name].rename(new_name)

    def drop_collection(self, collect_name):
        """Drop the named collection."""
        self.db.drop_collection(collect_name)
Exemple #19
0
class CheapVol:
    """
        Profits from buying shitcoins in accumulation phase.
        Enters when price is still cheap and volume spikes

        Config Requirements:
            - periodsMA
            - periodsVolLong
            - periodsVolShort
            - volCoef
            - bolStd
    """
    def __init__(self, stratName, assetList, isTest=False):
        logging.debug("Initialising CheapVol()")
        pd.options.mode.chained_assignment = None
        self.assetList = assetList
        self.isTest = isTest
        # Strategy parameters come from the per-strategy YAML config file.
        with open("%s/Pipeline/resources/%s/config.yml" %
                  (Settings.BASE_PATH, stratName)) as configFile:
            params = yaml.load(configFile)
            self.enterParams = params["enter"]
            self.exchangeList = params["assetSelection"]["exchangeList"]
        self.db = MongoClient("localhost", 27017)[stratName]
        self.col = self.db["PastPriceAction"]
        # Live runs seed the price-action collection immediately.
        self.initiateCollection() if not self.isTest else None

    def _initSingle(self, asset, exchange, testData=[]):
        """Pull recent candles for one asset and insert its price/volume
        history into the collection. Returns True on success, False when
        data is insufficient or the pull fails.
        (testData=[] is a mutable default, but it is never mutated here.)
        """
        logging.debug("Starting CheapVol._initSingle(asset=%s)" % asset)
        logging.debug("1 second sleep to avoid rate limiters")
        time.sleep(1.5 if not self.isTest else 0)
        try:
            # Tests inject candles via testData; live runs pull from the API.
            pullData = (Pull(
                emailOnFailure=False if self.isTest else True).candles(
                    asset="%sBTC" % asset,
                    exchange=exchange,
                    limit=max(
                        self.enterParams["periodsMA"],
                        self.enterParams["periodsVolLong"],
                    ) + 1,
                    interval=self.enterParams["granularity"],
                ) if len(testData) == 0 else testData)
            priceList = list(
                pullData["close"])[-self.enterParams["periodsMA"]:]
            volList = list(
                pullData["volume"])[-self.enterParams["periodsVolLong"]:]
            # Only insert when a full window of both series is available.
            if (len(priceList) == self.enterParams["periodsMA"]
                    and len(volList) == self.enterParams["periodsVolLong"]):
                self.col.insert_one({
                    "asset": asset,
                    "price": priceList,
                    "vol": volList,
                    "isLive": False,
                })
                return True
            else:
                logging.info("Not enough data for asset: %s" % asset)
        except IndexError:
            logging.warning("Failure on asset: %s" % asset)
        return False

    def initiateCollection(self):
        """
            Creates mongo collection which contains the price action data required for CheapVol
        """
        failList = []
        logging.debug("Starting CheapVol.init()")
        # Rebuild the collection from scratch on every initialisation.
        if "PastPriceAction" in self.db.collection_names():
            self.db.drop_collection("PastPriceAction")
        for asset, exchange in self.assetList:
            if not self._initSingle(asset, exchange):
                failList.append(asset)
        logging.debug("No failed assets" if len(failList) ==
                      0 else "%s Failed assets: %s" %
                      (len(failList), failList))
        logging.debug("Finished CheapVol.initiateCollection()")
        return failList if self.isTest else None

    def _getPADict(self, exchange):
        """Fetch the latest price-action snapshot for one exchange, starting
        one granularity period in the past."""
        logging.debug("Starting CheapVol._getPADict()")
        startTS = int(time.time() - self.enterParams["granularity"])
        dateStart = datetime.fromtimestamp(startTS).strftime(
            "%Y-%m-%dT%H:%M:%S.000Z")
        return Pull().getPriceAction(exchange=exchange,
                                     startDate=dateStart,
                                     baseAsset="BTC")

    def before(self, testData=None):
        """
            Runs before CheapVol on each asset and updates the mongo collection
        """
        logging.debug("Starting CheapVol.before()")
        newPA = {}
        delistDict = {}
        # using reversed to keep exchange priority
        for exchange in reversed(self.exchangeList):
            newPA.update(
                self._getPADict(
                    exchange=exchange) if not self.isTest else testData)
            delistDict.update(Pull().getDepositStatus(
                exchange=exchange)) if not self.isTest else {}
        # Slide each asset's price/vol window forward by one period and
        # refresh its listing status.
        for assetDict in list(self.col.find()):
            assetDict["price"] = assetDict["price"][1:] + [
                newPA[assetDict["asset"]]["price"]
            ]
            assetDict["vol"] = assetDict["vol"][1:] + [
                newPA[assetDict["asset"]]["vol"]
            ]
            assetDict["isLive"] = (delistDict[assetDict["asset"]]
                                   if not self.isTest else True)
            assetDict.pop("_id", None)
            self.col.find_one_and_replace({"asset": assetDict["asset"]},
                                          assetDict)
        logging.debug("Finished CheapVol.before()")

    def run(self, asset):
        """Entry signal: True when short-run volume exceeds volCoef times the
        long-run mean AND price is below the lower Bollinger band AND the
        asset is live. Returns False when the asset has no stored data."""
        logging.debug("Starting CheapVol.run(asset=%s)" % asset)
        assetData = self.col.find_one({"asset": asset})
        if assetData:
            volL = np.round(
                np.nanmean(np.array(assetData["vol"]).astype(np.float)), 5)
            volS = np.round(
                np.nanmean(
                    np.array(assetData["vol"]
                             [-self.enterParams["periodsVolShort"]:]).astype(
                                 np.float)),
                5,
            )
            priceData = np.array(assetData["price"]).astype(np.float)
            bolDown = np.nanmean(
                priceData) - self.enterParams["bolStd"] * np.nanstd(priceData)
            logging.debug("volL: %s, volS: %s, price: %s, bolDown: %s" %
                          (volL, volS, priceData[-1], bolDown))
            return (volS > self.enterParams["volCoef"] * volL
                    and priceData[-1] < bolDown and assetData["isLive"])
        else:
            return False
Exemple #20
0
class MgHelper(object):
    """Helper for saving dicts / JSON files into MongoDB and reading them back."""

    def __init__(self, server='localhost', port=27017, dbname='Collections'):
        self.server = server
        self.port = port
        self.dbname = dbname
        self.mongo = MongoClient(self.server, self.port)
        self.db = self.mongo[self.dbname]

    def SaveFile(self, fname, tbname='jsontxt'):
        """Store a whole JSON file as a single document; return its id."""
        # BUG FIX: `file(...)` is a Python 2 builtin and raises NameError on
        # Python 3; use open() with a context manager so the handle closes.
        with open(fname) as f:
            j = json.loads(f.read())
        table = self.db[tbname]  # collection ("table") name
        return table.save(j)

    def SaveDictObjs(self, dictListObj, tbname='jsontxt'):
        """Insert each dict in the list; return the id of the last insert.

        Returns None for an empty list (the original returned the *builtin*
        `id` function in that case, which was clearly unintended).
        """
        table = self.db[tbname]  # collection ("table") name
        last_id = None
        for dictObj in dictListObj:
            # jsObj = json.dumps(dictObj)
            last_id = table.insert(dictObj)
        return last_id

    def SaveDictObj(self, dobj, tbname='jsontxt'):
        """Store a single dict as one document; return its id."""
        table = self.db[tbname]  # collection ("table") name
        return table.insert(dobj)

    def GetDBNames(self):
        # Example of the shape returned by a stored document:
        # ls=list(db.Blog.find({"QQ":"2273075635"})); tuple(ls[0])
        # (u'QQ', u'Comment', u'isTransfered', u'Like', u'Title', u'URL', u'Transfer', u'Blog_cont', u'Share', u'Source', u'PubTime', u'_id')
        # NOTE(review): collection_names() is called on the MongoClient, not
        # on a Database — presumably this should be
        # self.db.collection_names(); confirm against the pymongo version.
        dbnames = self.mongo.collection_names(include_system_collections=False)
        return dbnames

    def GetDictHeader(self, tbname='jsontxt', con={}):
        """Return the field names (tuple of keys) of the first matching doc.

        `con={}` is a mutable default but is never mutated here.
        """
        dictObj = self.db[tbname].find_one(con)
        return tuple(dictObj)

    def GetDictObj(self, tbname='jsontxt', con={}):
        """Return the first document matching `con`."""
        return self.db[tbname].find_one(con)

    def GetDictObjs(self, tbname='jsontxt', con={}):
        """Return all documents matching `con` as a list."""
        return list(self.db[tbname].find(con))

    def GetDictObjsCnt(self, tbname='jsontxt', con={}):
        """Return the number of documents matching `con`."""
        return self.db[tbname].find(con).count()

    def RemoveTable(self, tbname='jsontxt', con={}):
        """Remove matching documents; always returns True."""
        self.db[tbname].remove(con)
        return True

    def InsertTable(self, tbname='jsontxt', dictObjs={}):
        """Insert the given document(s); always returns True."""
        self.db[tbname].insert(dictObjs)
        return True
class Provider:
    """Base class for weather-station data providers.

    Persists stations and measures in MongoDB, caches Google API lookups
    (geocoding, elevation, time zone) in Redis, and normalizes physical
    quantities (pint) to the units stored in the database.
    """

    provider_code = ""
    provider_name = ""
    provider_url = ""

    # requests timeouts in seconds: (connect, read)
    connect_timeout = 7
    read_timeout = 30

    @property
    def usage_limit_cache_duration(self):
        """~12 hours, jittered to avoid synchronized cache expiry."""
        return (12 + randint(-2, 2)) * 3600

    @property
    def google_api_error_cache_duration(self):
        """~60 days, jittered."""
        return (60 + randint(-2, 2)) * 24 * 3600

    @property
    def google_api_cache_duration(self):
        """~120 days, jittered."""
        return (120 + randint(-2, 2)) * 24 * 3600

    def __init__(self):
        self.mongo_db = MongoClient(MONGODB_URL).get_database()
        self.__stations_collection = self.mongo_db.stations
        self.__stations_collection.create_index([
            ("loc", GEOSPHERE),
            ("status", ASCENDING),
            ("pv-code", ASCENDING),
            ("short", ASCENDING),
            ("name", ASCENDING),
        ])
        # Cached once; measures_collection() keeps it in sync when it
        # creates new per-station collections.
        self.collection_names = self.mongo_db.collection_names()
        self.redis = redis.StrictRedis.from_url(url=REDIS_URL,
                                                decode_responses=True)
        self.google_api_key = GOOGLE_API_KEY
        self.log = logging.getLogger(self.provider_code)
        sentry_sdk.set_tag("provider", self.provider_name)

    def stations_collection(self):
        """Return the shared stations collection."""
        return self.__stations_collection

    def measures_collection(self, station_id):
        """Return the capped per-station measures collection, creating it
        (and caching its name) on first use."""
        if station_id not in self.collection_names:
            self.mongo_db.create_collection(
                station_id, **{
                    "capped": True,
                    "size": 500000,
                    "max": 5000
                })
            self.collection_names.append(station_id)
        return self.mongo_db[station_id]

    def __to_wind_direction(self, value):
        # Normalize to integer degrees; pint quantities are converted first.
        if isinstance(value, ureg.Quantity):
            return to_int(value.to(ureg.degree).magnitude, mandatory=True)
        else:
            return to_int(value, mandatory=True)

    def __to_wind_speed(self, value):
        # Normalize to km/h.
        if isinstance(value, ureg.Quantity):
            return to_float(value.to(ureg.kilometer / ureg.hour).magnitude,
                            mandatory=True)
        else:
            return to_float(value, mandatory=True)

    def __to_temperature(self, value):
        # Normalize to degrees Celsius.
        if isinstance(value, ureg.Quantity):
            return to_float(value.to(ureg.degC).magnitude)
        else:
            return to_float(value)

    def __to_pressure(self, value):
        # Normalize to hPa, 4 digits.
        if isinstance(value, ureg.Quantity):
            return to_float(value.to(ureg.hPa).magnitude, ndigits=4)
        else:
            return to_float(value, ndigits=4)

    def __compute_pressures(self, p: Pressure, altitude, temperature,
                            humidity):
        """Derive missing QFE/QNH/QFF values from whichever are provided."""
        # Normalize pressure to HPa
        qfe = self.__to_pressure(p.qfe)
        qnh = self.__to_pressure(p.qnh)
        qff = self.__to_pressure(p.qff)

        if qfe and qnh is None:
            qnh = TWxUtils.StationToAltimeter(qfe, elevationM=altitude)

        if qnh and qfe is None:
            qfe = TWxUtils.AltimeterToStationPressure(qnh, elevationM=altitude)

        # QFF conversions additionally need temperature and humidity.
        if qfe and qff is None and temperature is not None and humidity is not None:
            qff = TWxUtils.StationToSeaLevelPressure(qfe,
                                                     elevationM=altitude,
                                                     currentTempC=temperature,
                                                     meanTempC=temperature,
                                                     humidity=humidity)
        if qff and qfe is None and temperature is not None and humidity is not None:
            qfe = TWxUtils.SeaLevelToStationPressure(qff,
                                                     elevationM=altitude,
                                                     currentTempC=temperature,
                                                     meanTempC=temperature,
                                                     humidity=humidity)

        return {
            "qfe": to_float(qfe),
            "qnh": to_float(qnh),
            "qff": to_float(qff)
        }

    def __to_altitude(self, value):
        # Normalize to integer meters.
        if isinstance(value, ureg.Quantity):
            return to_int(value.to(ureg.meter).magnitude)
        else:
            return to_int(value)

    def __to_rain(self, value):
        # Normalize to liters per square meter (== mm), 1 digit.
        if isinstance(value, ureg.Quantity):
            return to_float(
                value.to(ureg.liter / (ureg.meter**2)).magnitude, 1)
        else:
            return to_float(value, 1)

    def add_redis_key(self, key, values, cache_duration):
        """Atomically store a hash of `values` under `key` with a TTL."""
        pipe = self.redis.pipeline()
        pipe.hmset(key, values)
        pipe.expire(key, cache_duration)
        pipe.execute()

    def call_google_api(self, api_url, api_name):
        """GET a Google API endpoint (appending the key) and map the
        well-known error statuses to provider exceptions."""
        url = furl(api_url)
        url.args["key"] = self.google_api_key
        result = requests.get(url.url,
                              timeout=(self.connect_timeout,
                                       self.read_timeout)).json()
        if result["status"] == "OVER_QUERY_LIMIT":
            raise UsageLimitException(f"{api_name} OVER_QUERY_LIMIT")
        elif result["status"] == "INVALID_REQUEST":
            raise ProviderException(
                f'{api_name} INVALID_REQUEST: {result.get("error_message", "")}'
            )
        elif result["status"] == "ZERO_RESULTS":
            raise ProviderException(f"{api_name} ZERO_RESULTS")
        return result

    def __compute_elevation(self, lat, lon) -> Tuple[float, bool]:
        """Return (elevation, is_peak) for a location.

        Samples 6 points on a 500 m circle around the station; if any of
        them is low enough that the station-to-point glide ratio falls
        between 0 and 6, the station is considered a peak.
        """
        radius = 500
        nb = 6
        path = f"{lat},{lon}|"
        for k in range(nb):
            angle = math.pi * 2 * k / nb
            dx = radius * math.cos(angle)
            dy = radius * math.sin(angle)
            # Offset in meters converted to degrees (equirectangular approx).
            path += "{lat},{lon}".format(
                lat=str(lat + (180 / math.pi) * (dy / 6378137)),
                lon=str(lon + (180 / math.pi) *
                        (dx / 6378137) / math.cos(lat * math.pi / 180)),
            )
            if k < nb - 1:
                path += "|"

        result = self.call_google_api(
            f"https://maps.googleapis.com/maps/api/elevation/json?locations={path}",
            "Google Elevation API")
        elevation = float(result["results"][0]["elevation"])
        is_peak = False
        for point in result["results"][1:]:
            try:
                glide_ratio = radius / (elevation - float(point["elevation"]))
            except ZeroDivisionError:
                glide_ratio = float("Infinity")
            if 0 < glide_ratio < 6:
                is_peak = True
                break
        return elevation, is_peak

    def __get_place_geocoding_results(self, results):
        """Extract (lat, lon, long_name) from a Geocoding API response,
        skipping postal-code address components."""
        lat, lon, address_long_name = None, None, None

        for result in results["results"]:
            if result.get("geometry", {}).get("location"):
                lat = result["geometry"]["location"]["lat"]
                lon = result["geometry"]["location"]["lng"]
                for component in result["address_components"]:
                    if "postal_code" not in component["types"]:
                        address_long_name = component["long_name"]
                        break
                break
        return lat, lon, address_long_name

    def __get_place_autocomplete(self, name):
        """Resolve a free-text name via Places autocomplete, then geocode
        the top prediction."""
        results = self.call_google_api(
            f"https://maps.googleapis.com/maps/api/place/autocomplete/json?input={name}",
            "Google Places API")
        place_id = results["predictions"][0]["place_id"]

        results = self.call_google_api(
            f"https://maps.googleapis.com/maps/api/geocode/json?place_id={place_id}",
            "Google Geocoding API")
        return self.__get_place_geocoding_results(results)

    def __get_place_geocoding(self, name):
        """Geocode a free-text address."""
        results = self.call_google_api(
            f"https://maps.googleapis.com/maps/api/geocode/json?address={name}",
            "Google Geocoding API")
        return self.__get_place_geocoding_results(results)

    def get_station_id(self, provider_id):
        """Build the global station id: "<provider_code>-<provider_id>"."""
        return self.provider_code + "-" + str(provider_id)

    def __create_station(self, provider_id, short_name, name, latitude,
                         longitude, altitude, is_peak, status, tz, urls,
                         fixes):
        """Assemble the station document, applying manual `fixes` overrides."""
        if fixes is None:
            fixes = {}

        if any((not short_name, not name, altitude is None, latitude is None,
                longitude is None, not status, not tz)):
            raise ProviderException("A mandatory value is none!")

        station = {
            "pv-id": provider_id,
            "pv-code": self.provider_code,
            "pv-name": self.provider_name,
            "url": urls,
            "short": fixes.get("short") or short_name,
            "name": fixes.get("name") or name,
            "alt":
            self.__to_altitude(fixes["alt"] if "alt" in fixes else altitude),
            "peak": to_bool(fixes["peak"] if "peak" in fixes else is_peak),
            "loc": {
                "type":
                "Point",
                "coordinates": [
                    to_float(
                        fixes["longitude"]
                        if "longitude" in fixes else longitude, 6),
                    to_float(
                        fixes["latitude"] if "latitude" in fixes else latitude,
                        6),
                ],
            },
            "status": status,
            "tz": tz,
            "seen": arrow.utcnow().int_timestamp,
        }
        return station

    def save_station(
        self,
        provider_id,
        short_name,
        name,
        latitude,
        longitude,
        status: StationStatus,
        altitude=None,
        tz=None,
        url=None,
        default_name=None,
        lookup_name=None,
    ):
        """Upsert a station, filling missing fields (names, geolocation,
        altitude, peak flag, time zone) from Redis-cached Google API lookups.

        Raises ProviderException when a mandatory field cannot be resolved.
        """
        if provider_id is None:
            raise ProviderException("'provider id' is none!")
        station_id = self.get_station_id(provider_id)
        lat = to_float(latitude, 6)
        lon = to_float(longitude, 6)

        # Reverse-geocode missing names (cached per coordinate pair).
        address_key = f"address/{lat},{lon}"
        if (not short_name or not name) and not self.redis.exists(address_key):
            try:
                results = self.call_google_api(
                    f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}"
                    f"&result_type=airport|colloquial_area|locality|natural_feature|point_of_interest|neighborhood",
                    "Google Geocoding API",
                )

                address_short_name = None
                address_long_name = None
                for result in results["results"]:
                    for component in result["address_components"]:
                        if "postal_code" not in component["types"]:
                            address_short_name = component["short_name"]
                            address_long_name = component["long_name"]
                            break
                if not address_short_name or not address_long_name:
                    raise ProviderException(
                        "Google Geocoding API: No valid address name found")
                self.add_redis_key(
                    address_key,
                    {
                        "short": address_short_name,
                        "name": address_long_name
                    },
                    self.google_api_cache_duration,
                )
            except TimeoutError as e:
                raise e
            except UsageLimitException as e:
                self.add_redis_key(address_key, {"error": repr(e)},
                                   self.usage_limit_cache_duration)
            except Exception as e:
                if not isinstance(e, ProviderException):
                    self.log.exception("Unable to call Google Geocoding API")
                self.add_redis_key(address_key, {"error": repr(e)},
                                   self.google_api_error_cache_duration)

        # Forward-geocode a missing/zero location (cached per address).
        address = lookup_name or name or short_name
        geolocation_key = f"geolocation/{address}"
        if (lat is None or lon is None) or (lat == 0 and lon == 0):
            if not self.redis.exists(geolocation_key):
                try:
                    lat, lon, address_long_name = self.__get_place_geocoding(
                        address)
                    if not lat or not lon or not address_long_name:
                        raise ProviderException(
                            f"Google Geocoding API: No valid geolocation found {address}"
                        )
                    self.add_redis_key(
                        geolocation_key,
                        {
                            "lat": lat,
                            "lon": lon,
                            "name": address_long_name
                        },
                        self.google_api_cache_duration,
                    )
                except TimeoutError as e:
                    raise e
                except UsageLimitException as e:
                    self.add_redis_key(geolocation_key, {"error": repr(e)},
                                       self.usage_limit_cache_duration)
                except Exception as e:
                    if not isinstance(e, ProviderException):
                        self.log.exception(
                            "Unable to call Google Geocoding API")
                    self.add_redis_key(geolocation_key, {"error": repr(e)},
                                       self.google_api_error_cache_duration)
            if self.redis.exists(geolocation_key):
                if self.redis.hexists(geolocation_key, "error"):
                    raise ProviderException(
                        f'Unable to determine station geolocation: {self.redis.hget(geolocation_key, "error")}'
                    )
                lat = to_float(self.redis.hget(geolocation_key, "lat"), 6)
                lon = to_float(self.redis.hget(geolocation_key, "lon"), 6)
                if not name:
                    name = self.redis.hget(geolocation_key, "name")

        # Elevation + peak detection (cached per coordinate pair).
        alt_key = f"alt/{lat},{lon}"
        if not self.redis.exists(alt_key):
            try:
                elevation, is_peak = self.__compute_elevation(lat, lon)
                self.add_redis_key(alt_key, {
                    "alt": elevation,
                    "is_peak": str(is_peak)
                }, self.google_api_cache_duration)
            except TimeoutError as e:
                raise e
            except UsageLimitException as e:
                self.add_redis_key(alt_key, {"error": repr(e)},
                                   self.usage_limit_cache_duration)
            except Exception as e:
                if not isinstance(e, ProviderException):
                    self.log.exception("Unable to call Google Elevation API")
                self.add_redis_key(alt_key, {"error": repr(e)},
                                   self.google_api_error_cache_duration)

        # Time zone (cached per coordinate pair).
        tz_key = f"tz/{lat},{lon}"
        if not tz and not self.redis.exists(tz_key):
            try:
                now = arrow.utcnow().int_timestamp
                result = self.call_google_api(
                    f"https://maps.googleapis.com/maps/api/timezone/json?location={lat},{lon}&timestamp={now}",
                    "Google Time Zone API",
                )

                tz = result["timeZoneId"]
                gettz(tz)  # validate the returned zone name before caching
                self.add_redis_key(tz_key, {"tz": tz},
                                   self.google_api_cache_duration)
            except TimeoutError as e:
                raise e
            except UsageLimitException as e:
                self.add_redis_key(tz_key, {"error": repr(e)},
                                   self.usage_limit_cache_duration)
            except Exception as e:
                if not isinstance(e, ProviderException):
                    self.log.exception("Unable to call Google Time Zone API")
                self.add_redis_key(tz_key, {"error": repr(e)},
                                   self.google_api_error_cache_duration)

        # Resolve remaining mandatory fields from the caches.
        if not short_name:
            if self.redis.hexists(address_key, "error"):
                if default_name:
                    short_name = default_name
                else:
                    raise ProviderException(
                        f"Unable to determine station 'short': {self.redis.hget(address_key, 'error')}"
                    )
            else:
                short_name = self.redis.hget(address_key, "short")

        if not name:
            if self.redis.hexists(address_key, "error"):
                if default_name:
                    name = default_name
                else:
                    raise ProviderException(
                        f"Unable to determine station 'name': {self.redis.hget(address_key, 'error')}"
                    )
            else:
                name = self.redis.hget(address_key, "name")

        if not altitude:
            if self.redis.hexists(alt_key, "error"):
                raise ProviderException(
                    f"Unable to determine station 'alt': {self.redis.hget(alt_key, 'error')}"
                )
            altitude = self.redis.hget(alt_key, "alt")

        # BUG FIX: redis hexists() returns a bool, so the original
        # `hexists(...) == "error"` comparison was always False and this
        # check could never fire; test the boolean directly, matching the
        # 'alt'/'tz' checks above and below.
        if self.redis.hexists(alt_key, "error"):
            raise ProviderException(
                f"Unable to determine station 'peak': {self.redis.hget(alt_key, 'error')}"
            )
        is_peak = self.redis.hget(alt_key, "is_peak") == "True"

        if not tz:
            if self.redis.hexists(tz_key, "error"):
                raise ProviderException(
                    f"Unable to determine station 'tz': {self.redis.hget(tz_key, 'error')}"
                )
            tz = self.redis.hget(tz_key, "tz")

        if not url:
            urls = {"default": self.provider_url}
        elif isinstance(url, str):
            urls = {"default": url}
        elif isinstance(url, dict):
            if "default" not in url:
                raise ProviderException("No 'default' key in url")
            urls = url
        else:
            raise ProviderException("Invalid url")

        fixes = self.mongo_db.stations_fix.find_one(station_id)
        station = self.__create_station(provider_id, short_name, name, lat,
                                        lon, altitude, is_peak, status.value,
                                        tz, urls, fixes)
        self.stations_collection().update({"_id": station_id},
                                          {"$set": station},
                                          upsert=True)
        station["_id"] = station_id
        return station

    def create_measure(
        self,
        for_station,
        _id,
        wind_direction,
        wind_average,
        wind_maximum,
        temperature=None,
        humidity=None,
        pressure: Pressure = None,
        rain=None,
    ):
        """Build a normalized measure document for a station, applying any
        per-station offsets from the stations_fix collection."""

        if all((wind_direction is None, wind_average is None,
                wind_maximum is None)):
            raise ProviderException("All mandatory values are null!")

        # Mandatory keys: 0 if not present
        measure = {
            "_id": int(round(_id)),
            "w-dir": self.__to_wind_direction(wind_direction),
            "w-avg": self.__to_wind_speed(wind_average),
            "w-max": self.__to_wind_speed(wind_maximum),
        }

        # Optional keys
        if temperature is not None:
            measure["temp"] = self.__to_temperature(temperature)
        if humidity is not None:
            measure["hum"] = to_float(humidity, 1)
        if pressure is not None and (pressure.qfe is not None or pressure.qnh
                                     is not None or pressure.qff is not None):
            measure["pres"] = self.__compute_pressures(
                pressure, for_station["alt"], measure.get("temp", None),
                measure.get("hum", None))
        if rain is not None:
            measure["rain"] = self.__to_rain(rain)

        measure["time"] = arrow.now().int_timestamp

        # Apply per-station additive corrections; wind direction wraps at 360.
        fixes = self.mongo_db.stations_fix.find_one(for_station["_id"])
        if fixes and "measures" in fixes:
            for key, offset in fixes["measures"].items():
                try:
                    if key in measure:
                        fixed_value = measure[key] + offset
                        if key == "w-dir":
                            fixed_value = fixed_value % 360
                        measure[key] = fixed_value

                except Exception as e:
                    self.log.exception(
                        f"Unable to fix '{key}' with offset '{offset}': {e}")

        return measure

    def has_measure(self, measure_collection, key):
        """True when a measure with this _id already exists."""
        return measure_collection.find({"_id": key}).count() > 0

    def __add_last_measure(self, measure_collection, station_id):
        """Copy the newest measure onto the station document as 'last'."""
        last_measure = measure_collection.find_one({
            "$query": {},
            "$orderby": {
                "_id": -1
            }
        })
        if last_measure:
            self.stations_collection().update({"_id": station_id},
                                              {"$set": {
                                                  "last": last_measure
                                              }})

    def insert_new_measures(self, measure_collection, station, new_measures):
        """Insert new measures (sorted by _id), log the batch, and refresh
        the station's 'last' measure."""
        if len(new_measures) > 0:
            measure_collection.insert(
                sorted(new_measures, key=lambda m: m["_id"]))

            end_date = arrow.Arrow.fromtimestamp(new_measures[-1]["_id"],
                                                 gettz(station["tz"]))
            self.log.info(
                "⏱ {end_date} ({end_date_local}), {short}/{name} ({id}): {nb} values inserted"
                .format(
                    end_date=end_date.format("YY-MM-DD HH:mm:ssZZ"),
                    end_date_local=end_date.to("local").format(
                        "YY-MM-DD HH:mm:ssZZ"),
                    short=station["short"],
                    name=station["name"],
                    id=station["_id"],
                    nb=str(len(new_measures)),
                ))

            self.__add_last_measure(measure_collection, station["_id"])
Exemple #22
0
##
# Main
##


def main():
    """Run the full intertext matching pipeline in order.

    Relies on the module-level ``config`` being initialized (see the
    ``__main__`` guard) before this is called.
    """
    # Hashband counting is skipped when precomputed hashbands are loaded.
    if not config['load_hashbands']: count_hashbands()
    match_minhash_keys()
    validate_all_matches()
    cluster_all_matches()
    create_typeahead_collection()
    create_config_collection()
    create_metadata_collection()
    create_scatterplot_collection()


if __name__ == '__main__':

    config = get_config()
    infiles = glob.glob(config['infiles'])
    text_ids = [str(i) for i in range(len(infiles))]
    metadata = get_metadata()

    # validate inputs are present
    if not infiles: raise Exception('No input files were found!')

    # remove all extant records
    db = MongoClient()['intertext']
    # Use a plain loop: the original list comprehension was executed only
    # for its side effects and built a throwaway list of None values.
    for collection_name in db.collection_names():
        db[collection_name].drop()

    main()
Exemple #23
0
    MONGO_DB_VERSION = MONGO_CLIENT.connection.server_info()['version']
except TypeError:
    # for pymongo >= 3
    MONGO_DB_VERSION = MONGO_CLIENT.client.server_info()['version']

# TTL indexes require MongoDB >= 2.2.
# NOTE(review): comparing versions as floats is fragile ("2.10" would parse
# as 2.1 and "4.0" as 4.0 only after dropping the patch part) — a tuple
# comparison of the numeric components would be safer.
if not float('.'.join(MONGO_DB_VERSION.split('.')[:-1])) >= 2.2:
    raise ImproperlyConfigured(
        '''
        Your mongodb service doesn't support TTL
        http://docs.mongodb.org/manual/tutorial/expire-data/
        '''
    )


# create sessions collection if needed
if MONGO_SESSIONS_COLLECTION not in MONGO_CLIENT.collection_names():
    MONGO_CLIENT.create_collection(MONGO_SESSIONS_COLLECTION)

# check existing indexes: a fresh collection carries only the default _id
# index, so <= 1 means our custom indexes are still missing.
DB_COLLECTION = MONGO_CLIENT[MONGO_SESSIONS_COLLECTION]
MONGO_SESSIONS_INDEXES = DB_COLLECTION.index_information()
if len(MONGO_SESSIONS_INDEXES) <= 1:
    # unique index for session lookup by key
    DB_COLLECTION.ensure_index(
        'session_key',
        unique=True
    )

    # TTL index: MongoDB expires session documents automatically.
    # NOTE(review): ensure_index is deprecated in modern pymongo (use
    # create_index) — confirm against the pinned pymongo version.
    DB_COLLECTION.ensure_index(
        'creation_date',
        expireAfterSeconds=MONGO_SESSIONS_TTL
    )
class App:
    """Telegram vocabulary-training bot.

    Routes incoming updates to command handlers, translates words via the
    Yandex Translate API, and persists per-user state in MongoDB.
    """

    def __init__(self, settings):
        # Resolve bundled resources relative to this file, not the CWD.
        dirname = os.path.dirname(os.path.realpath(__file__)) + '/'
        self.logger = logging.getLogger("bot")
        # Static reply texts, loaded once at startup.
        # NOTE(review): these open() handles are never closed.
        self.help_text = open(dirname + 'docs/help.txt').read()
        self.changelog_text = open(dirname + 'docs/changelog.txt').read()
        self.welcome_text = open(dirname + 'docs/welcome.txt').read()
        self.about_text = open(dirname + 'docs/about.txt').read()
        self.settings = settings
        self.wnl = WordNetLemmatizer()
        remainder.configure(settings)
        logs_dir = 'logs/'
        if not os.path.exists(logs_dir):
            os.makedirs(logs_dir)

        # Logging: INFO+ to access.log, ERROR+ to error.log.
        fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        access = logging.FileHandler(logs_dir + 'access.log')
        access.setLevel(logging.INFO)
        access.setFormatter(logging.Formatter(fmt))

        error = logging.FileHandler(logs_dir + 'error.log')
        error.setLevel(logging.ERROR)
        error.setFormatter(logging.Formatter(fmt))
        self.logger.addHandler(access)
        self.logger.addHandler(error)

        # NOTE(review): the second basicConfig call is a no-op — basicConfig
        # only configures the root logger on its first effective call.
        logging.basicConfig(format=fmt)
        if env == 'debug':
            logging.basicConfig(level=logging.DEBUG)
        # NOTE(review): "Cofiguration" typo kept — it is a runtime string.
        logging.warning("Cofiguration: %s" % (env,))

        # Mongo collections; created on first run.
        self.db = MongoClient(settings.mongo['uri']).get_default_database()
        if 'users' not in self.db.collection_names():
            self.db.create_collection('users')
        self.users = self.db.users
        if 'remainders' not in self.db.collection_names():
            self.db.create_collection('remainders')
        remainder.recover_jobs()

        # NOTE(review): `params` is a class-level dict (defined below), so
        # this mutation is shared across all App instances.
        self.params['offset'] = 0
        logging.warning('Constructed')

    def listen(self):
        """Poll Telegram for updates forever."""
        logging.warning('Listening')
        while True:
            self.get_updates()
            time.sleep(0.1)

            # app.run()

    def correct(self, string):
        """Spell-check `string` via the After the Deadline service and
        return its first suggested correction, if any."""
        baseurl_correction = 'http://service.afterthedeadline.com/checkDocument'
        correction = requests.get(baseurl_correction, {'data': string}).text
        correction = BeautifulSoup(correction,  "lxml")

        if correction.find("option") is not None:
            string = correction.find("option").string
        return string

    def add_word(self, user, string):
        """Translate `string` and append it to the user's word list,
        skipping duplicates."""
        baseurl = 'https://translate.yandex.net/api/v1.5/tr.json/translate'
        # string = re.sub(r'[^A-Za-z\s]', '', string)
        # string = re.sub(r'\Wk+', ' ', string)
        string = string.lower()

        if len(string) == 0:
            telegram.send_message(user['chat_id'], "Wrong word")
            return
        # Default language pair for legacy users without preferences.
        if 'foreign' not in user:
            user['foreign'] = 'en'
            user['native'] = 'ru'

        # English words get spell-checked and lemmatized before translation.
        if user['foreign'] == 'en':
            string = self.correct(string)
            if env != 'debug':
                string = self.wnl.lemmatize(string)
        # Capitalize the first letter for display.
        string = string[0].upper() + string[1:]
        direction = '%s-%s' % (user['foreign'], user['native'])
        transtaltion = requests.get(baseurl, {
            'key': self.settings.translate_yandex['token'],
            'lang': direction,
            'text': string
        })
        out_word = transtaltion.json()['text'][0]

        # Exact (case-sensitive) duplicate check on the foreign word.
        already_has = False
        for w in user['words']:
            already_has |= w["en"] == string
        if not already_has:
            user['words'].append({"en": string, "ru": out_word,
                                  "stage": study_settings.min_stage,
                                  "expiration_date": datetime.datetime.utcnow() + study_settings.stages[1],
                                  "creation_date": datetime.datetime.utcnow()})
            self.users.save(user)
            telegram.send_message(user['chat_id'], "Word added\n%s - %s" % (string, out_word))
        else:
            telegram.send_message(user['chat_id'], "Already exist!\n%s - %s" % (string, out_word))

    # Shared polling state; see NOTE in __init__ about class-level mutation.
    params = {}

    def get_list_word(self, user, text):
        """Send the user a numbered list of their words with stages."""
        str_out = "\n".join(["%s: (%s) %s - %s" % (i + 1, w['stage'], w['en'], w['ru']) for i, w in
                             zip(range(10 ** 10), user['words'])])
        telegram.send_message(user['chat_id'], str_out)

    def start(self, user, text):
        """Handle /start: greet and ask for a language pair."""
        telegram.send_message(user['chat_id'], self.welcome_text)
        user['state'] = States.langs_asked


    def reask_langs(self, user, text):
        """Handle /reask: prompt the user for a new language pair."""
        telegram.send_message(user['chat_id'], """
Now choose your native and foreign languages.
Example: "en-ru" (en is foreign and ru is native)
        """)
        user['state'] = States.langs_asked

    def langs_ask(self, user, text):
        """Validate and store the chosen translation direction (e.g. "en-ru")."""
        ans = requests.get('https://translate.yandex.net/api/v1.5/tr.json/getLangs',
                           {'key': self.settings.translate_yandex['token']})
        lang_list = ans.json()['dirs']
        if text not in lang_list:
            telegram.send_message(user['chat_id'], "Please, choose any of this:\n" + "\n".join(lang_list))
        else:
            telegram.send_message(user['chat_id'], "\"%s\" have successfully chosen" % (text,))
            user['state'] = States.idle

            # e.g. "en-ru" -> foreign="en", native="ru"
            user['foreign'] = text[0:2]
            user['native'] = text[3:5]

    def help(self, user, text):
        """Handle /help."""
        telegram.send_message(user['chat_id'], self.help_text)

    def about(self, user, text):
        """Handle /about."""
        telegram.send_message(user['chat_id'], self.about_text)

    def start_train(self, user, text):
        """Handle /train: reset the train type and start a session."""
        user['train']['type'] = 0
        train.do_train(user, text)

    def add_remainder(self, user, text):
        """Handle /setremainder: schedule a reminder, optionally "H:M" ahead."""
        remainder.remove_job(user)
        tokens = text.split(' ')
        delta = datetime.timedelta()
        if len(tokens) >= 2:
            tokens = tokens[1].replace(' ', '').split(':')
            hours = int(tokens[0])
            minutes = int(tokens[1])
            delta = datetime.timedelta(hours=hours, minutes=minutes)
        remainder.add_job(user, datetime.datetime.utcnow() + delta)
        telegram.send_message(user['chat_id'], "Successfully set. Nearest at  %s" % (datetime.datetime.now() + delta,))

    def remove_remainder(self, user, text):
        """Unschedule the user's reminder."""
        remainder.remove_job(user)
        telegram.send_message(user['chat_id'], "Removed")

    def remove(self, user, text):
        """Handle /rm: delete the current training word, or the word at the
        1-based index given in the command (last word by default)."""
        if user['train']['type'] != 0:
            # NOTE(review): removing from user['words'] while iterating over
            # it can skip elements — iterating over a copy would be safer.
            for w in user['words']:
                if w == user['train']['word']:
                    user['words'].remove(w)
                    str_out = "%s - %s" % (w['en'], w['ru'])
                    telegram.send_message(user['chat_id'], "Deleted:\n%s" % (str_out,))

            train.do_train(user, text)
        else:
            tokens = text.split(" ")
            if len(tokens) > 1:
                cnt = int(tokens[1])
                if cnt > 0:
                    cnt -= 1  # convert the 1-based user index to 0-based
            else:
                cnt = -1  # no index given: remove the most recent word
            str_out = "%s - %s" % (user['words'][cnt]['en'], user['words'][cnt]['ru'])
            del user['words'][cnt]
            telegram.send_message(user['chat_id'], "Word with index %s removed\n%s" % (cnt, str_out))

    # Command dispatch table (command name -> unbound handler).
    # NOTE(review): the name "comands" [sic] is kept for compatibility.
    comands = {
        'list': get_list_word,
        'rm': remove,
        'train': start_train,
        'end': train.end_train,
        'start': start,
        'help': help,
        'setremainder': add_remainder,
        'reask': reask_langs,
        'about': about
    }

    def parse_action(self, chat_id, text):
        """Dispatch one incoming message: command, training answer, new word,
        or language-pair reply, depending on the user's state."""
        self.logger.warning("%s - %s" % (chat_id, text))
        user = self.users.find_one({'chat_id': chat_id})
        if user is None:
            # First contact: create a fresh user document.
            user = {'chat_id': chat_id,
                    'state': States.idle,

                    'words': [],
                    'train': {
                        'type': 0,
                        'words': 0,
                        'correct': 0,
                        'cadidacies': []
                    }}
        if 'train' not in user:
            # Migrate legacy users that predate the 'train' sub-document.
            user['train'] = {
                'type': 0,
                'words': 0,
                'correct': 0,
                'cadidacies': []
            }
        if text[0] == '/':  # Command
            cmd = text[1:].lower().split(' ')[0]
            if cmd in self.comands:
                self.comands[cmd](self, user, text)
        elif user['train']['type'] != 0:
            train.do_train(user, text)
        elif user['state'] == States.idle:
            self.add_word(user, text)
        elif user['state'] == States.langs_asked:
            self.langs_ask(user, text)
        self.users.save(user)

    def get_updates(self):
        """Fetch and process pending Telegram updates, advancing the offset
        and persisting it in the meta collection."""
        try:
            messages = telegram.get_updates(self.params['offset'])
            for u in messages:
                if 'message' in u and 'text' in u['message']:
                    if u['update_id'] < self.params['offset']:
                        print('Error')
                    else:
                        chat_id = u['message']['chat']['id']
                        text = u['message']['text']
                        self.params['offset'] = max(self.params['offset'], u['update_id'] + 1)
                        try:
                            self.parse_action(chat_id, text)
                        # NOTE(review): bare except catches even SystemExit /
                        # KeyboardInterrupt; prefer `except Exception`. Also,
                        # traceback.print_exc() prints and returns None, so
                        # logging.error receives None (format_exc intended?).
                        except:
                            logging.error('Error! (%s, %s)' % (chat_id, text))
                            logging.error(traceback.print_exc())
                            telegram.send_message(chat_id, 'An error occurred!')
        except:
            logging.error('Get updates error!')
            logging.error(traceback.print_exc())
        self.db.meta.save(self.params)
import requests
import xmltodict
import json
from pymongo import MongoClient
import time

# Fetch the ManyBooks RSS index and archive today's items into a
# date-named MongoDB collection (one collection per day, created at
# most once).
res = requests.get('http://manybooks.net/index.xml')

# Always encode XML to skip most of unicode errors
db = MongoClient('localhost')['manybooks']

encoded_xml = res.text.encode('utf-8')

od = xmltodict.parse(encoded_xml)

# Round-trip through json to turn the OrderedDicts from xmltodict into
# plain dicts/lists that pymongo can store directly.
myjson = json.loads(json.dumps(od))

today = time.strftime('%y-%m-%d', time.gmtime())

if today not in db.collection_names():
    for item in myjson['rss']['channel']['item']:
        db[today].insert(item)
else:
    # Fixed: original used the Python 2 print statement, a SyntaxError
    # under Python 3.
    print('already up to date')
Exemple #26
0
from pymongo import MongoClient
from array import array

db = MongoClient().test_database
month = {'Jan':'01','Feb':'02','Mar':'03','Apr':'04','May':'05','Jun':'06','Jul':'07','Aug':'08','Sep':'09','Oct':'10','Nov':'11','Dec':'12'}

for tweet in db.tweets.find():
	tweetDate = tweet["created_at"].split(" ")[5] + month[tweet["created_at"].split(" ")[1]] + tweet["created_at"].split(" ")[2] 
	
	for hashtag in tweet["entities"]["hashtags"]:
		hashtext = hashtag["text"].lower()
		
		nameExist = False
		for name in db.collection_names():
			if name == hashtext:
				nameExist == True
		if nameExist == False:
			db[hashtext] # create collection with hashtag as the name
			
		result = db[hashtext].find_one({"Date": tweetDate})
		if result == None:
			db[hashtext].insert_one(
				{
					"_id": tweetDate, 
					"Date": tweetDate, 
					"count": 1
				}	
			)
		else:
			db[hashtext].update_one(
				{"_id": tweetDate},
Exemple #27
0
class Provider:
    """Base class for weather-station data providers.

    Persists stations and measures in MongoDB, caches Google API lookups
    (reverse geocoding, place geocoding, elevation, time zone) in Redis,
    and normalizes measured values (pint quantities or raw numbers) to
    the canonical units used by the database.
    """
    provider_code = ''
    provider_name = ''
    provider_url = ''

    connect_timeout = 7
    read_timeout = 30

    @property
    def usage_limit_cache_duration(self):
        # ~12h with jitter so keys written after an API quota error do not
        # all expire at the same instant.
        return (12 + randint(-2, 2)) * 3600

    @property
    def location_cache_duration(self):
        # ~60 days with jitter for cached location lookups.
        return (60 + randint(-2, 2)) * 24 * 3600

    def __init__(self):
        """Open MongoDB/Redis connections, ensure station indexes and
        configure logging and Sentry for this provider."""
        self.mongo_db = MongoClient(MONGODB_URL).get_database()
        self.__stations_collection = self.mongo_db.stations
        self.__stations_collection.create_index([('loc', GEOSPHERE),
                                                 ('status', ASCENDING),
                                                 ('pv-code', ASCENDING),
                                                 ('short', ASCENDING),
                                                 ('name', ASCENDING)])
        # Snapshot of existing collections; kept up to date by
        # measures_collection() when new capped collections are created.
        self.collection_names = self.mongo_db.collection_names()
        self.redis = redis.StrictRedis.from_url(url=REDIS_URL,
                                                decode_responses=True)
        self.google_api_key = GOOGLE_API_KEY
        self.log = get_logger(self.provider_code)
        sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT)
        with sentry_sdk.configure_scope() as scope:
            scope.set_tag('provider', self.provider_name)

    def __to_wind_direction(self, value):
        """Normalize a wind direction to integer degrees (mandatory)."""
        if isinstance(value, ureg.Quantity):
            return to_int(value.to(ureg.degree).magnitude, mandatory=True)
        else:
            return to_int(value, mandatory=True)

    def __to_wind_speed(self, value):
        """Normalize a wind speed to km/h as float (mandatory)."""
        if isinstance(value, ureg.Quantity):
            return to_float(value.to(ureg.kilometer / ureg.hour).magnitude,
                            mandatory=True)
        else:
            return to_float(value, mandatory=True)

    def __to_temperature(self, value):
        """Normalize a temperature to degrees Celsius as float."""
        if isinstance(value, ureg.Quantity):
            return to_float(value.to(ureg.degC).magnitude)
        else:
            return to_float(value)

    def __to_pressure(self, value):
        """Normalize a pressure to hPa, rounded to 4 digits."""
        if isinstance(value, ureg.Quantity):
            return to_float(value.to(ureg.hPa).magnitude, ndigits=4)
        else:
            return to_float(value, ndigits=4)

    def __compute_pressures(self, p: Pressure, altitude, temperature,
                            humidity):
        """Return {'qfe', 'qnh', 'qff'} in hPa, deriving each missing
        value from the available ones via TWxUtils when possible."""
        # Normalize pressure to HPa
        qfe = self.__to_pressure(p.qfe)
        qnh = self.__to_pressure(p.qnh)
        qff = self.__to_pressure(p.qff)

        if qfe and qnh is None:
            qnh = TWxUtils.StationToAltimeter(qfe, elevationM=altitude)

        if qnh and qfe is None:
            qfe = TWxUtils.AltimeterToStationPressure(qnh, elevationM=altitude)

        # QFF conversions additionally need temperature and humidity.
        if qfe and qff is None and temperature is not None and humidity is not None:
            qff = TWxUtils.StationToSeaLevelPressure(qfe,
                                                     elevationM=altitude,
                                                     currentTempC=temperature,
                                                     meanTempC=temperature,
                                                     humidity=humidity)
        if qff and qfe is None and temperature is not None and humidity is not None:
            qfe = TWxUtils.SeaLevelToStationPressure(qff,
                                                     elevationM=altitude,
                                                     currentTempC=temperature,
                                                     meanTempC=temperature,
                                                     humidity=humidity)

        return {
            'qfe': to_float(qfe),
            'qnh': to_float(qnh),
            'qff': to_float(qff)
        }

    def __to_altitude(self, value):
        """Normalize an altitude to integer meters."""
        if isinstance(value, ureg.Quantity):
            return to_int(value.to(ureg.meter).magnitude)
        else:
            return to_int(value)

    def __to_rain(self, value):
        """Normalize a rain amount to l/m² with 1 decimal."""
        if isinstance(value, ureg.Quantity):
            return to_float(
                value.to(ureg.liter / (ureg.meter**2)).magnitude, 1)
        else:
            return to_float(value, 1)

    def stations_collection(self):
        """Return the shared 'stations' MongoDB collection."""
        return self.__stations_collection

    def measures_collection(self, station_id):
        """Return the per-station capped measures collection, creating it
        on first use (capped: 500kB / 5000 documents)."""
        if station_id not in self.collection_names:
            self.mongo_db.create_collection(
                station_id, **{
                    'capped': True,
                    'size': 500000,
                    'max': 5000
                })
            self.collection_names.append(station_id)
        return self.mongo_db[station_id]

    def add_redis_key(self, key, values, cache_duration):
        """Store a hash in Redis with an expiry, atomically."""
        pipe = self.redis.pipeline()
        pipe.hmset(key, values)
        pipe.expire(key, cache_duration)
        pipe.execute()

    def __compute_elevation(self, lat, lon):
        """Query Google Elevation API for the point plus 6 samples on a
        500 m circle around it.

        Returns (elevation, is_peak) where is_peak is True when a
        surrounding sample lies well below the center (glide ratio < 6).
        Raises UsageLimitException/ProviderException on API errors.
        """
        radius = 500
        nb = 6
        path = f'{lat},{lon}|'
        for k in range(nb):
            angle = math.pi * 2 * k / nb
            dx = radius * math.cos(angle)
            dy = radius * math.sin(angle)
            # Offset in meters converted to degrees (equatorial radius
            # 6378137 m), longitude corrected by cos(latitude).
            path += '{lat},{lon}'.format(
                lat=str(lat + (180 / math.pi) * (dy / 6378137)),
                lon=str(lon + (180 / math.pi) *
                        (dx / 6378137) / math.cos(lat * math.pi / 180)))
            if k < nb - 1:
                path += '|'

        result = requests.get(
            f'https://maps.googleapis.com/maps/api/elevation/json?locations={path}&key={self.google_api_key}',
            timeout=(self.connect_timeout, self.read_timeout)).json()
        if result['status'] == 'OVER_QUERY_LIMIT':
            raise UsageLimitException('Google Elevation API OVER_QUERY_LIMIT')
        elif result['status'] == 'INVALID_REQUEST':
            raise ProviderException(
                f'Google Elevation API INVALID_REQUEST: {result.get("error_message", "")}'
            )
        elif result['status'] == 'ZERO_RESULTS':
            raise ProviderException('Google Elevation API ZERO_RESULTS')

        elevation = float(result['results'][0]['elevation'])
        is_peak = False
        for point in result['results'][1:]:
            try:
                glide_ratio = radius / (elevation - float(point['elevation']))
            except ZeroDivisionError:
                glide_ratio = float('Infinity')
            if 0 < glide_ratio < 6:
                is_peak = True
                break
        return elevation, is_peak

    def __get_place_geocoding_results(self, results):
        """Extract (lat, lon, long_name) from a Google Geocoding response,
        skipping postal-code address components."""
        lat, lon, address_long_name = None, None, None

        for result in results['results']:
            if result.get('geometry', {}).get('location'):
                lat = result['geometry']['location']['lat']
                lon = result['geometry']['location']['lng']
                for component in result['address_components']:
                    if 'postal_code' not in component['types']:
                        address_long_name = component['long_name']
                        break
                break
        return lat, lon, address_long_name

    def __get_place_autocomplete(self, name):
        """Resolve a place name via Places Autocomplete, then geocode the
        top prediction. Raises on API errors."""
        results = requests.get(
            f'https://maps.googleapis.com/maps/api/place/autocomplete/json?input={name}&key={self.google_api_key}',
            timeout=(self.connect_timeout, self.read_timeout)).json()

        if results['status'] == 'OVER_QUERY_LIMIT':
            raise UsageLimitException('Google Places API OVER_QUERY_LIMIT')
        elif results['status'] == 'INVALID_REQUEST':
            raise ProviderException(
                f'Google Places API INVALID_REQUEST: {results.get("error_message", "")}'
            )
        elif results['status'] == 'ZERO_RESULTS':
            raise ProviderException(
                f"Google Places API ZERO_RESULTS for '{name}'")

        place_id = results['predictions'][0]['place_id']

        results = requests.get(
            f'https://maps.googleapis.com/maps/api/geocode/json?place_id={place_id}&key={self.google_api_key}',
            timeout=(self.connect_timeout, self.read_timeout)).json()

        if results['status'] == 'OVER_QUERY_LIMIT':
            raise UsageLimitException('Google Geocoding API OVER_QUERY_LIMIT')
        elif results['status'] == 'INVALID_REQUEST':
            raise ProviderException(
                f'Google Geocoding API INVALID_REQUEST: {results.get("error_message", "")}'
            )
        elif results['status'] == 'ZERO_RESULTS':
            raise ProviderException(
                f"Google Geocoding API ZERO_RESULTS for '{name}'")

        return self.__get_place_geocoding_results(results)

    def __get_place_geocoding(self, name):
        """Geocode a free-form address via Google Geocoding API.
        Raises on API errors."""
        results = requests.get(
            f'https://maps.googleapis.com/maps/api/geocode/json?address={name}&key={self.google_api_key}',
            timeout=(self.connect_timeout, self.read_timeout)).json()
        if results['status'] == 'OVER_QUERY_LIMIT':
            raise UsageLimitException('Google Geocoding API OVER_QUERY_LIMIT')
        elif results['status'] == 'INVALID_REQUEST':
            raise ProviderException(
                f'Google Geocoding API INVALID_REQUEST: {results.get("error_message", "")}'
            )
        elif results['status'] == 'ZERO_RESULTS':
            raise ProviderException(
                f"Google Geocoding API ZERO_RESULTS for '{name}'")

        return self.__get_place_geocoding_results(results)

    def get_station_id(self, provider_id):
        """Build the global station id: '<provider_code>-<provider_id>'."""
        return self.provider_code + '-' + str(provider_id)

    def __create_station(self, provider_id, short_name, name, latitude,
                         longitude, altitude, is_peak, status, tz, urls,
                         fixes):
        """Build the station document, applying manual overrides from the
        'fixes' dict. Raises ProviderException on a missing mandatory
        value."""
        if fixes is None:
            fixes = {}

        if any((not short_name, not name, altitude is None, latitude is None,
                longitude is None, not status, not tz)):
            raise ProviderException('A mandatory value is none!')

        station = {
            'pv-id': provider_id,
            'pv-code': self.provider_code,
            'pv-name': self.provider_name,
            'url': urls,
            'short': fixes.get('short') or short_name,
            'name': fixes.get('name') or name,
            'alt':
            self.__to_altitude(fixes['alt'] if 'alt' in fixes else altitude),
            'peak': to_bool(fixes['peak'] if 'peak' in fixes else is_peak),
            'loc': {
                'type':
                'Point',
                'coordinates': [
                    to_float(
                        fixes['longitude']
                        if 'longitude' in fixes else longitude, 6),
                    to_float(
                        fixes['latitude'] if 'latitude' in fixes else latitude,
                        6)
                ]
            },
            'status': status,
            'tz': tz,
            'seen': arrow.utcnow().timestamp
        }
        return station

    def save_station(self,
                     provider_id,
                     short_name,
                     name,
                     latitude,
                     longitude,
                     status,
                     altitude=None,
                     tz=None,
                     url=None,
                     default_name=None,
                     lookup_name=None):
        """Upsert a station document, filling missing names, coordinates,
        altitude, peak flag and time zone from Redis-cached Google API
        lookups.

        Returns the saved station dict (with '_id' set). Raises
        ProviderException when a mandatory attribute cannot be resolved.
        """
        if provider_id is None:
            raise ProviderException("'provider id' is none!")
        station_id = self.get_station_id(provider_id)
        lat = to_float(latitude, 6)
        lon = to_float(longitude, 6)

        # Reverse-geocode the coordinates once per location when a name is
        # missing; API errors are cached too (as {'error': ...}) so we do
        # not hammer the API.
        address_key = f'address/{lat},{lon}'
        if (not short_name or not name) and not self.redis.exists(address_key):
            try:
                results = requests.get(
                    f'https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}'
                    f'&result_type=airport|colloquial_area|locality|natural_feature|point_of_interest|neighborhood'
                    f'&key={self.google_api_key}',
                    timeout=(self.connect_timeout, self.read_timeout)).json()

                if results['status'] == 'OVER_QUERY_LIMIT':
                    raise UsageLimitException(
                        'Google Geocoding API OVER_QUERY_LIMIT')
                elif results['status'] == 'INVALID_REQUEST':
                    raise ProviderException(
                        f'Google Geocoding API INVALID_REQUEST: {results.get("error_message", "")}'
                    )
                elif results['status'] == 'ZERO_RESULTS':
                    raise ProviderException(
                        'Google Geocoding API ZERO_RESULTS')

                address_short_name = None
                address_long_name = None
                for result in results['results']:
                    for component in result['address_components']:
                        if 'postal_code' not in component['types']:
                            address_short_name = component['short_name']
                            address_long_name = component['long_name']
                            break
                if not address_short_name or not address_long_name:
                    raise ProviderException(
                        'Google Geocoding API: No valid address name found')
                self.add_redis_key(address_key, {
                    'short': address_short_name,
                    'name': address_long_name
                }, self.location_cache_duration)
            except TimeoutError as e:
                raise e
            except UsageLimitException as e:
                self.add_redis_key(address_key, {'error': repr(e)},
                                   self.usage_limit_cache_duration)
            except Exception as e:
                if not isinstance(e, ProviderException):
                    self.log.exception('Unable to call Google Geocoding API')
                self.add_redis_key(address_key, {'error': repr(e)},
                                   self.location_cache_duration)

        # When coordinates are missing or (0, 0), geocode the best
        # available name instead.
        address = lookup_name or name or short_name
        geolocation_key = f'geolocation/{address}'
        if (lat is None or lon is None) or (lat == 0 and lon == 0):
            if not self.redis.exists(geolocation_key):
                try:
                    lat, lon, address_long_name = self.__get_place_geocoding(
                        address)
                    if not lat or not lon or not address_long_name:
                        raise ProviderException(
                            f'Google Geocoding API: No valid geolocation found {address}'
                        )
                    self.add_redis_key(geolocation_key, {
                        'lat': lat,
                        'lon': lon,
                        'name': address_long_name
                    }, self.location_cache_duration)
                except TimeoutError as e:
                    raise e
                except UsageLimitException as e:
                    self.add_redis_key(geolocation_key, {'error': repr(e)},
                                       self.usage_limit_cache_duration)
                except Exception as e:
                    if not isinstance(e, ProviderException):
                        self.log.exception(
                            'Unable to call Google Geocoding API')
                    self.add_redis_key(geolocation_key, {'error': repr(e)},
                                       self.location_cache_duration)
            if self.redis.exists(geolocation_key):
                if self.redis.hexists(geolocation_key, 'error'):
                    raise ProviderException(
                        f'Unable to determine station geolocation: {self.redis.hget(geolocation_key, "error")}'
                    )
                lat = to_float(self.redis.hget(geolocation_key, 'lat'), 6)
                lon = to_float(self.redis.hget(geolocation_key, 'lon'), 6)
                if not name:
                    name = self.redis.hget(geolocation_key, 'name')

        # Elevation and peak detection, cached per location.
        alt_key = f'alt/{lat},{lon}'
        if not self.redis.exists(alt_key):
            try:
                elevation, is_peak = self.__compute_elevation(lat, lon)
                self.add_redis_key(alt_key, {
                    'alt': elevation,
                    'is_peak': is_peak
                }, self.location_cache_duration)
            except TimeoutError as e:
                raise e
            except UsageLimitException as e:
                self.add_redis_key(alt_key, {'error': repr(e)},
                                   self.usage_limit_cache_duration)
            except Exception as e:
                if not isinstance(e, ProviderException):
                    self.log.exception('Unable to call Google Elevation API')
                self.add_redis_key(alt_key, {'error': repr(e)},
                                   self.location_cache_duration)

        # Time zone lookup, cached per location.
        tz_key = f'tz/{lat},{lon}'
        if not tz and not self.redis.exists(tz_key):
            try:
                now = arrow.utcnow().timestamp
                result = requests.get(
                    f'https://maps.googleapis.com/maps/api/timezone/json?location={lat},{lon}'
                    f'&timestamp={now}&key={self.google_api_key}',
                    timeout=(self.connect_timeout, self.read_timeout)).json()

                if result['status'] == 'OVER_QUERY_LIMIT':
                    raise UsageLimitException(
                        'Google Time Zone API OVER_QUERY_LIMIT')
                elif result['status'] == 'INVALID_REQUEST':
                    raise ProviderException(
                        f'Google Time Zone API INVALID_REQUEST: {result.get("error_message", "")}'
                    )
                elif result['status'] == 'ZERO_RESULTS':
                    raise ProviderException(
                        'Google Time Zone API ZERO_RESULTS')

                tz = result['timeZoneId']
                # Validate the returned zone id before caching it.
                dateutil.tz.gettz(tz)
                self.add_redis_key(tz_key, {'tz': tz},
                                   self.location_cache_duration)
            except TimeoutError as e:
                raise e
            except UsageLimitException as e:
                self.add_redis_key(tz_key, {'error': repr(e)},
                                   self.usage_limit_cache_duration)
            except Exception as e:
                if not isinstance(e, ProviderException):
                    self.log.exception('Unable to call Google Time Zone API')
                self.add_redis_key(tz_key, {'error': repr(e)},
                                   self.location_cache_duration)

        # Resolve each still-missing attribute from the caches, falling
        # back to default_name or failing loudly.
        if not short_name:
            if self.redis.hexists(address_key, 'error'):
                if default_name:
                    short_name = default_name
                else:
                    raise ProviderException(
                        f"Unable to determine station 'short': {self.redis.hget(address_key, 'error')}"
                    )
            else:
                short_name = self.redis.hget(address_key, 'short')

        if not name:
            if self.redis.hexists(address_key, 'error'):
                if default_name:
                    name = default_name
                else:
                    raise ProviderException(
                        f"Unable to determine station 'name': {self.redis.hget(address_key, 'error')}"
                    )
            else:
                name = self.redis.hget(address_key, 'name')

        if not altitude:
            if self.redis.hexists(alt_key, 'error'):
                raise ProviderException(
                    f"Unable to determine station 'alt': {self.redis.hget(alt_key, 'error')}"
                )
            altitude = self.redis.hget(alt_key, 'alt')

        # Fixed: hexists() returns a bool, and the original compared it to
        # the string 'error' — always False, so this error check was dead
        # code and a cached elevation failure never surfaced here.
        if self.redis.hexists(alt_key, 'error'):
            raise ProviderException(
                f"Unable to determine station 'peak': {self.redis.hget(alt_key, 'error')}"
            )
        is_peak = self.redis.hget(alt_key, 'is_peak')

        if not tz:
            if self.redis.hexists(tz_key, 'error'):
                raise ProviderException(
                    f"Unable to determine station 'tz': {self.redis.hget(tz_key, 'error')}"
                )
            tz = self.redis.hget(tz_key, 'tz')

        # Normalize 'url' into a dict with at least a 'default' entry.
        if not url:
            urls = {'default': self.provider_url}
        elif isinstance(url, str):
            urls = {'default': url}
        elif isinstance(url, dict):
            if 'default' not in url:
                raise ProviderException("No 'default' key in url")
            urls = url
        else:
            raise ProviderException('Invalid url')

        fixes = self.mongo_db.stations_fix.find_one(station_id)
        station = self.__create_station(provider_id, short_name, name, lat,
                                        lon, altitude, is_peak, status, tz,
                                        urls, fixes)
        self.stations_collection().update({'_id': station_id},
                                          {'$set': station},
                                          upsert=True)
        station['_id'] = station_id
        return station

    def create_measure(self,
                       for_station,
                       _id,
                       wind_direction,
                       wind_average,
                       wind_maximum,
                       temperature=None,
                       humidity=None,
                       pressure: Pressure = None,
                       rain=None):
        """Build a normalized measure document keyed by the (rounded)
        timestamp '_id', applying per-station offsets from 'stations_fix'.

        Raises ProviderException when all wind values are None.
        """
        if all((wind_direction is None, wind_average is None,
                wind_maximum is None)):
            raise ProviderException('All mandatory values are null!')

        # Mandatory keys: json 'null' if not present
        measure = {
            '_id': int(round(_id)),
            'w-dir': self.__to_wind_direction(wind_direction),
            'w-avg': self.__to_wind_speed(wind_average),
            'w-max': self.__to_wind_speed(wind_maximum)
        }

        # Optional keys
        if temperature is not None:
            measure['temp'] = self.__to_temperature(temperature)
        if humidity is not None:
            measure['hum'] = to_float(humidity, 1)
        if pressure is not None and (pressure.qfe is not None or pressure.qnh
                                     is not None or pressure.qff is not None):
            measure['pres'] = self.__compute_pressures(
                pressure, for_station['alt'], measure.get('temp', None),
                measure.get('hum', None))
        if rain is not None:
            measure['rain'] = self.__to_rain(rain)

        measure['time'] = arrow.now().timestamp

        fixes = self.mongo_db.stations_fix.find_one(for_station['_id'])
        if fixes and 'measures' in fixes:
            for key, offset in fixes['measures'].items():
                try:
                    if key in measure:
                        fixed_value = measure[key] + offset
                        if key == 'w-dir':
                            # Keep directions within [0, 360).
                            fixed_value = fixed_value % 360
                        measure[key] = fixed_value

                except Exception:
                    # Fixed: the original called .format(...) on an
                    # f-string that had already interpolated its
                    # placeholders — a misleading no-op.
                    self.log.exception(
                        f"Unable to fix '{key}' with offset '{offset}'")

        return measure

    def has_measure(self, measure_collection, key):
        """Return True when a measure with this '_id' already exists."""
        return measure_collection.find({'_id': key}).count() > 0

    def insert_new_measures(self, measure_collection, station, new_measures):
        """Insert measures sorted by '_id', log the batch and refresh the
        station's cached 'last' measure. No-op for an empty list."""
        if len(new_measures) > 0:
            measure_collection.insert(
                sorted(new_measures, key=lambda m: m['_id']))

            end_date = arrow.Arrow.fromtimestamp(
                new_measures[-1]['_id'], dateutil.tz.gettz(station['tz']))
            self.log.info(
                '--> {end_date} ({end_date_local}), {short}/{name} ({id}): {nb} values inserted'
                .format(end_date=end_date.format('YY-MM-DD HH:mm:ssZZ'),
                        end_date_local=end_date.to('local').format(
                            'YY-MM-DD HH:mm:ssZZ'),
                        short=station['short'],
                        name=station['name'],
                        id=station['_id'],
                        nb=str(len(new_measures))))

            self.__add_last_measure(measure_collection, station['_id'])

    def __add_last_measure(self, measure_collection, station_id):
        """Copy the most recent measure into the station document's
        'last' field."""
        last_measure = measure_collection.find_one({
            '$query': {},
            '$orderby': {
                '_id': -1
            }
        })
        if last_measure:
            self.stations_collection().update({'_id': station_id},
                                              {'$set': {
                                                  'last': last_measure
                                              }})
class MongoDBConverter:
    """Loads line-delimited JSON dataset files (businesses, users,
    reviews) into MongoDB, embedding reviews inside their business
    documents. Idempotent: existing data is detected and skipped."""

    def __init__(self):
        self.db = MongoClient(
            Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE]
        self.progess_bar = ProgressBar()

    def create_review_db(self):
        """Embed each review from the dataset file into its business
        document, skipping review_ids that are already present."""
        print_header("Creating Reviews")
        done = 0
        dataset_file = Settings.REVIEW_DATASET_FILE
        business_collection = self.db[Settings.BUSINESS_COLLECTION]

        # Find all the businesses and their reviews and add the review_id to a dict.
        review_id_hashes = set()

        businesses = business_collection.find()
        for business in businesses:
            if 'reviews' in business:
                for review in business['reviews']:
                    review_id_hashes.add(review['review_id'])

        self.progess_bar.start()
        # First pass only counts lines for the progress bar.
        with open(dataset_file, 'r') as dataset:
            count = sum(1 for _ in dataset)

        with open(dataset_file, 'r') as dataset:
            next(dataset)
            for line in dataset:
                try:
                    # 'encoding' kwarg removed: json.loads ignored it and
                    # it raises TypeError on Python >= 3.9.
                    data = json.loads(line)
                except ValueError:
                    print('Oops!')
                    # Fixed: without this 'continue' the loop fell through
                    # and reused 'data' from the previous iteration (or hit
                    # NameError on the very first line).
                    continue

                # Insert into DB
                if data["type"] == "review":
                    business_id = data['business_id']
                    business = business_collection.find_one(
                        {'business_id': business_id})

                    assert (business is not None)

                    add_review = True
                    if data['review_id'] in review_id_hashes:
                        add_review = False

                    business['reviews'] = business.get('reviews', [])
                    if add_review:
                        business['reviews'].append(data)
                        business_collection.update_one(
                            {'business_id': business_id}, {"$set": business},
                            upsert=True)

                done += 1
                self.progess_bar.print_progress(done, count)

    def add_business_data_collection(self):
        """Load business records into their collection (skipped when data
        is already present) and index them by business_id."""
        print_header("Adding Business Data")

        dataset_file = Settings.BUSINESS_DATASET_FILE

        add_businesses = True

        if Settings.BUSINESS_COLLECTION in self.db.collection_names():
            business_collection = self.db[Settings.BUSINESS_COLLECTION]
            if business_collection.count() > 0:
                add_businesses = False
                print("Data already present.... Skipping")

        if add_businesses:
            self.progess_bar.start()
            with open(dataset_file, 'r') as dataset:
                count = sum(1 for _ in dataset)

            business_collection = self.db[Settings.BUSINESS_COLLECTION]
            with open(dataset_file, 'r') as dataset:
                done = 0
                for line in dataset:
                    try:
                        data = json.loads(line)
                    except ValueError:
                        print("Error in Business json file")
                        # Fixed: skip the malformed line instead of falling
                        # through with stale/undefined 'data'.
                        continue

                    # Insert into DB
                    assert (data['type'] == 'business')
                    business_collection.insert(data)

                    done += 1
                    self.progess_bar.print_progress(done,
                                                    count,
                                                    prefix='Progress:',
                                                    suffix='Complete')

            business_collection.create_index('business_id')

    def add_user_data_collection(self):
        """Load user records into their collection (skipped when data is
        already present) and index them by user_id."""
        print_header("Adding User Data")
        dataset_file = Settings.USER_DATASET_FILE

        add_users = True

        if Settings.USER_COLLECTION in self.db.collection_names():
            user_collection = self.db[Settings.USER_COLLECTION]
            if user_collection.count() > 0:
                add_users = False
                print("Data already present.... Skipping")

        if add_users:
            self.progess_bar.start()
            with open(dataset_file, 'r') as dataset:
                count = sum(1 for _ in dataset)

            user_collection = self.db[Settings.USER_COLLECTION]
            with open(dataset_file, 'r') as dataset:
                done = 0
                for line in dataset:
                    try:
                        data = json.loads(line)
                    except ValueError:
                        # Fixed: the original printed the Business message
                        # here; this loop parses the user dataset.
                        print("Error in User json file")
                        continue

                    # Insert into DB
                    assert (data['type'] == 'user')
                    user_collection.insert(data)

                    done += 1
                    self.progess_bar.print_progress(done,
                                                    count,
                                                    prefix='Progress:',
                                                    suffix='Complete')

            user_collection.create_index('user_id')
Exemple #29
0
class Monitor(object):
    """
    Watchdog service: watches launch reports, heartbeats and warning logs
    stored in MongoDB and sends email notifications when something is wrong.
    """
    name = 'slavem'

    # Minimum interval between two scans of the log collections.
    WARNING_LOG_INTERVAL = datetime.timedelta(minutes=2)

    def __init__(self, email, host='localhost', port=27017, dbn='slavem', username=None, password=None, serverChan=None,
                 loggingconf=None, ):
        """
        :param email: kwargs forwarded to the EMail notifier
        :param host: MongoDB host
        :param port: MongoDB port
        :param dbn: MongoDB database name
        :param username:
        :param password:
        :param serverChan:
        :param loggingconf: logging configuration Dict()
        """
        now = arrow.now()
        self.mongoSetting = {
            'host': host,
            'port': port,
            'dbn': dbn,
            'username': username,
            'password': password,
        }

        self.log = logging.getLogger()
        self.initLog(loggingconf)

        # serverChan report URLs (feature currently disabled)
        # self.serverChan = serverChan or {}
        # if self.serverChan:
        #     for account, url in self.serverChan.items():
        #         serverChanUrl = requests.get(url).text
        #         self.serverChan[account] = serverChanUrl
        # else:
        #     self.log.warning(u'没有配置 serverChan 的 url')

        self.email = EMail(serverChan=serverChan, **email)

        self.mongourl = 'mongodb://{username}:{password}@{host}:{port}/{dbn}?authMechanism=SCRAM-SHA-1'.format(
            **self.mongoSetting)

        self.__active = False
        self._inited = False

        # Next time to check whether tasks have reported in.
        self.nextWatchTime = now

        # Next time to check heartbeats.
        self.nextCheckHeartBeatTime = now
        self.nextRemoveOutdateReportTime = now

        # Install shutdown handlers for termination signals.
        for sig in [signal.SIGINT,  # Ctrl-C from the keyboard
                    signal.SIGHUP,  # sent when a nohup daemon is closed (POSIX only)
                    signal.SIGTERM,  # `kill pid` from the command line
                    ]:
            signal.signal(sig, self.shutdown)

        self.authed = False

        # Background thread that reports log entries with LEVEL >= WARNING.
        self.threadWarningLog = Thread(target=self.logWarning, name='logWarning')
        self.lastWarningLogTime = now

        # NOTE(review): assumes loggingconf is not None here — confirm callers
        # always pass a config with a handlers.mongo section.
        logMongoConf = loggingconf['handlers']['mongo']
        self.logDB = MongoClient(
            logMongoConf['host'],
            logMongoConf['port'],
        )[logMongoConf['database_name']]
        self.logDB.authenticate(logMongoConf['username'], logMongoConf['password'])

        # Ensure indexes exist on the log collections.
        self.initLogCollection()

    def initLog(self, loggingconf):
        """
        Initialize logging.
        :param loggingconf: logging configuration Dict() or None
        :return:
        """
        if loggingconf:
            # A log4mongo bug makes handler creation fail for non-admin users;
            # inject a pre-built connection to skip the failing code path.
            log4mongo.handlers._connection = MongoClient(
                host=loggingconf['handlers']['mongo']['host'],
                port=loggingconf['handlers']['mongo']['port'],
            )

            logging.config.dictConfig(loggingconf)
            self.log = logging.getLogger(self.name)

        else:
            # No config given: log DEBUG+ to stdout and WARN+ to stderr.
            self.log = logging.getLogger('root')
            self.log.setLevel('DEBUG')
            fmt = "%(asctime)-15s %(levelname)s %(filename)s %(lineno)d %(process)d %(message)s"
            # datefmt = "%a-%d-%b %Y %H:%M:%S"
            datefmt = None
            formatter = logging.Formatter(fmt, datefmt)
            sh = logging.StreamHandler(sys.stdout)
            sh.setFormatter(formatter)
            sh.setLevel('DEBUG')
            self.log.addHandler(sh)

            sh = logging.StreamHandler(sys.stderr)
            sh.setFormatter(formatter)
            sh.setLevel('WARN')
            self.log.addHandler(sh)
            self.log.warning(u'未配置 loggingconfig')

    @property
    def taskCollectionName(self):
        return 'task'

    @property
    def reportCollectionName(self):
        return 'report'

    @property
    def heartBeatCollectionName(self):
        return 'heartbeat'

    def dbConnect(self):
        """
        Establish the database connection (connect lazily, reuse if alive).
        :return:
        """
        try:
            # Probe the existing connection.
            self.mongoclient.server_info()
        except AttributeError:
            # Not connected yet: connect now.
            self.mongoclient = MongoClient(
                host=self.mongoSetting['host'],
                port=self.mongoSetting['port']
            )

            db = self.mongoclient[self.mongoSetting['dbn']]
            self.db = db
            if self.mongoSetting.get('username'):
                # self.mongoclient = pymongo.MongoClient(self.mongourl)
                self.authed = db.authenticate(
                    self.mongoSetting['username'],
                    self.mongoSetting['password']
                )

            # All collections decode datetimes as timezone-aware local times.
            self.reportCollection = db[self.reportCollectionName].with_options(
                codec_options=CodecOptions(tz_aware=True, tzinfo=LOCAL_TIMEZONE))

            self.tasksCollection = db[self.taskCollectionName].with_options(
                codec_options=CodecOptions(tz_aware=True, tzinfo=LOCAL_TIMEZONE))

            self.heartBeatCollection = db[self.heartBeatCollectionName].with_options(
                codec_options=CodecOptions(tz_aware=True, tzinfo=LOCAL_TIMEZONE))

    def init(self):
        """
        Initialize the service.
        :return:
        """
        self._inited = True

        # Establish the database connection.
        self.dbConnect()

        # Load tasks from the database.
        self.loadTask()

        # Order the tasks by deadline.
        self.sortTask()

        # Refresh the next watch time.
        self.refreshWatchTime()

    def _run(self):
        """
        Main loop: poll once per second for due tasks, heartbeats and cleanup.
        :return:
        """
        # Log the next due time.
        self.reportWatchTime()

        while self.__active:
            time.sleep(1)
            # Check that tasks launched on time.
            self.doCheckTaskLanuch()

            # Check heartbeats.
            self.doCheckHeartBeat()

            # Remove outdated reports.
            self.removeOutdateReport()

    def doCheckHeartBeat(self):
        """
        Report services whose heartbeat is stale (runs every 5 minutes).
        :return:
        """
        now = arrow.now()
        if now >= self.nextCheckHeartBeatTime:
            # Heartbeats are checked once every 5 minutes.
            self.nextCheckHeartBeatTime = now + datetime.timedelta(minutes=5)

            # Check the heartbeats.

            cursor = self.heartBeatCollection.find({}, {'_id': 0})

            noHeartBeat = []
            for heartBeat in cursor:
                if now - heartBeat['datetime'] > datetime.timedelta(minutes=3):
                    # Stale heartbeat: queue it for the report.
                    noHeartBeat.append(heartBeat)
            try:
                if noHeartBeat:
                    self.noticeHeartBeat(noHeartBeat)
            except Exception as e:
                self.log.error(traceback.format_exc())

    def doCheckTaskLanuch(self):

        now = arrow.now()
        if now >= self.nextWatchTime:
            self.log.info(u'达到截止时间')

            # Check the due tasks.
            self.checkTask()

            # Re-order the tasks.
            self.sortTask()

            # Refresh the next watch time.
            self.refreshWatchTime()

            # Log the next due time.
            self.reportWatchTime()

    def reportWatchTime(self):
        """
        Log when the next task is due.
        :return:
        """
        now = arrow.now()
        if now < self.nextWatchTime:
            # The next task's watch time has not arrived yet.
            rest = self.nextWatchTime - now
            self.log.info(u'下次截止时间 {}'.format(self.nextWatchTime))
            # time.sleep(rest.total_seconds())
            # self.log.info(u'达到截止时间')

    def start(self):
        """
        Start the monitor and block in the main loop.
        :return:
        """
        try:

            self.init()

            self.__active = True
            self.threadWarningLog.start()

            self._run()
            # isAlive() is the pre-Python-3.9 spelling of is_alive().
            if self.threadWarningLog.isAlive():
                self.threadWarningLog.join()

        except Exception as e:
            err = traceback.format_exc()
            self.log.critical(err)
            title = u'slavem 异常崩溃'
            text = err
            self.sendEmail(title, text)
            self.stop()

    def stop(self):
        """
        Stop the service.
        :return:
        """
        self.__active = False
        self.log.info(u'服务即将关闭……')
        time.sleep(1)

    def shutdown(self, signalnum, frame):
        """
        Handle a termination signal by stopping the service.
        :param signalnum:
        :param frame:
        :return:
        """
        self.stop()

    def __del__(self):
        """
        Release database resources when the instance is garbage collected.
        :return:
        """
        try:
            if self.authed:
                self.db.logout()
            self.mongoclient.close()
        except:
            pass

    def loadTask(self):
        """
        Load all active tasks from the task collection.
        :return:
        """
        # Read the tasks.
        taskCol = self.tasksCollection
        taskList = []
        for t in taskCol.find():
            if not t.get('active'):
                continue
            t.pop('_id')
            taskList.append(Task(**t))

        self.tasks = taskList
        self.log.info(u'加载了 {} 个任务'.format(len(self.tasks)))
        if __debug__:
            for t in self.tasks:
                self.log.debug(str(t))

    def sortTask(self):
        """
        Sort the tasks by deadline (earliest first).
        :return:
        """
        self.tasks.sort(key=lambda x: x.deadline)

    def refreshWatchTime(self):
        """
        Point nextWatchTime at the earliest task deadline.
        :return:
        """
        try:
            t = self.tasks[0]
            self.nextWatchTime = t.deadline
        except IndexError:
            # No tasks: check again in one minute.
            self.nextWatchTime = arrow.now() + datetime.timedelta(seconds=60)
            return

    def checkTask(self):
        """
        One or more tasks reached their deadline: verify their launch reports.
        :return:
        """
        # Collect every task instance whose deadline has arrived.

        taskList = []
        firstLanuchTime = None
        now = arrow.now()
        for t in self.tasks:
            assert isinstance(t, Task)
            if now >= t.deadline:
                taskList.append(t)
                # Keep the *earliest* launch time among the due tasks.
                # Bug fix: the original kept the latest launch time
                # (`if firstLanuchTime < t.lanuchTime`), so the $gte query
                # below could miss reports of earlier-launched tasks.
                if firstLanuchTime is None or t.lanuchTime < firstLanuchTime:
                    firstLanuchTime = t.lanuchTime

        self.log.info(u'查询启动报告时间 > {}'.format(firstLanuchTime))

        # Query launch reports newer than firstLanuchTime.
        sql = {
            'datetime': {
                '$gte': firstLanuchTime,
            }
        }

        reportCol = self.reportCollection
        cursor = reportCol.find(sql)

        if __debug__:
            self.log.debug(u'查询到 {} 条报告'.format(cursor.count()))

        # Match the reports against the due tasks.
        for report in cursor:
            try:
                for t in taskList:
                    assert isinstance(t, Task)
                    if t.isReport(report):
                        # Reported: refresh the task's deadline.
                        self.log.info(u'{} 服务启动完成 {}'.format(t.name, t.lanuchTime))
                        if t.isLate:
                            # A late launch report still triggers a notice.
                            self.noticeDealyReport(t)
                        t.finishAndRefresh()
                        taskList.remove(t)
                        break
            except Exception:
                self.log.error(traceback.format_exc())

        # Services that did not launch on time.
        for t in taskList:
            if t.isTimeToNoticeDelay():
                self.noticeUnreport(t)
                t.refreshLastDelayNoticeTime()

            # Mark the launch as late.
            t.setLate()
            # Not reported yet: push the deadline one minute out.
            t.delayDeadline()

    def noticeDealyReport(self, task):
        """
        Notify that a task completed its launch late.
        :param task: tasks.Task
        :return:
        """
        # Notice: the task finished, but late.
        title = u'服务{name}启动迟到'.format(name=task.name)
        text = u'当前时间:{}'.format(arrow.now())

        for k, v in task.toNotice().items():
            text += u'\n{}\t:{}'.format(k, v)
        self.sendEmail(title, text)

    def noticeUnreport(self, task):
        """
        Notify that no launch report was received for a task.
        :param task: tasks.Task
        :return:
        """
        # Notice: no completion report received.
        title = u'服务{name}未启动'.format(name=task.name)
        text = u'当前时间\t:{}'.format(arrow.now())

        for k, v in task.toNotice().items():
            text += u'\n{}\t:{}'.format(k, v)

        self.sendEmail(title, text)

    def noticeHeartBeat(self, noHeartBeats):
        # Notice: heartbeats are missing for these services.
        title = u'心跳异常'
        text = u''
        for dic in noHeartBeats:
            text += u'=====================================\n'
            for k, v in dic.items():
                text += u'{}: {}\n'.format(k, v)
            shockSecs = arrow.now().datetime - dic['datetime']
            text += u'secs: {}\n'.format(shockSecs)
        self.sendEmail(title, text)

    def sendEmail(self, subject, text):
        """
        Send an email notification.
        :param subject:
        :param text:
        :return:
        """
        self.email.send(subject, text)

    def createTask(self, **kwargs):
        """
        Create (or upsert) a task.
        :param kwargs: forwarded to Task()
        :return:
        """
        newTask = Task(**kwargs)

        sql = newTask.toSameTaskKV()
        dic = newTask.toMongoDB()

        self.tasksCollection.update_one(sql, {'$set': dic}, upsert=True)
        # self.db.task.find_one_and_update(sql, {'$set': dic}, upsert=True)
        self.log.info(u'创建了task {}'.format(str(dic)))

    def showTask(self):
        """
        Log every loaded task.
        :return:
        """
        for t in self.tasks:
            self.log.info(u'{}'.format(t.toMongoDB()))

    def removeOutdateReport(self):
        """
        Delete launch reports older than 7 days (runs at most once per day).
        :return:
        """
        now = arrow.now()
        if now >= self.nextRemoveOutdateReportTime:
            # Run once per day.
            self.nextRemoveOutdateReportTime = now + datetime.timedelta(days=1)
            collection = self.reportCollection

            # Delete reports older than 7 days.
            deadline = now.datetime - datetime.timedelta(days=7)
            result = collection.remove({
                'datetime': {
                    '$lt': deadline
                }
            })
            num = result['n']
            self.log.info(u'清空了 {} 条启动报告'.format(num))

    def _logWarning(self):
        """
        Scan every log collection and report new entries at WARNING or above.
        :return:
        """

        # `now` becomes the previous scan time; lastWarningLogTime advances.
        now, self.lastWarningLogTime = self.lastWarningLogTime, arrow.now()

        colNames = self.logDB.collection_names()
        for colName in colNames:
            col = self.logDB[colName]
            sql = {
                'timestamp': {
                    '$gte': now.datetime,
                    '$lt': self.lastWarningLogTime.datetime,

                },
                'level': {
                    '$in': ["WARNING", "ERROR", "CRITICAL"]
                },
            }
            cursor = col.find(sql, {'_id': 0})
            count = cursor.count()
            if count == 0:
                # No WARNING-or-above log entries in this window.
                continue

            logs = u'{} 共 {} 条\n'.format(now.datetime, count)
            for l in cursor.limit(10):
                logs += u'==================================\n'
                for k, v in l.items():
                    logs += u'{}: {} \n'.format(k, v)

            text = u'{}有异常日志'.format(colName)
            desp = logs
            self.sendEmail(text, desp)
            time.sleep(5)

    def logWarning(self):
        """Background loop: scan logs every WARNING_LOG_INTERVAL while active."""
        while self.__active:
            try:
                if arrow.now() - self.lastWarningLogTime < self.WARNING_LOG_INTERVAL:
                    # Not time for the next scan yet.
                    time.sleep(1)
                    continue
                self._logWarning()
            except:
                err = traceback.format_exc()
                self.log.error(err)

    def initLogCollection(self):
        """
        Ensure timestamp/level indexes exist on every log collection.
        :return:
        """
        indexTimestamp = IndexModel([('timestamp', ASCENDING)], name='timestamp', background=True)
        indexLevel = IndexModel([('level', DESCENDING)], name='level', background=True)
        indexes = [indexTimestamp, indexLevel]

        # Initialize each log collection's indexes.
        for colName in self.logDB.collection_names():
            col = self.logDB[colName]
            self._initCollectionIndex(col, indexes)

    def _initCollectionIndex(self, col, indexes):
        """
        Create any missing index from *indexes* on the given collection.
        :return:
        """

        # Check the existing indexes first.
        try:
            indexInformation = col.index_information()
            for indexModel in indexes:
                if indexModel.document['name'] not in indexInformation:
                    col.create_indexes(
                        [
                            indexModel,
                        ],
                    )
        except OperationFailure:
            # Index lookup failed; create them all.
            col.create_indexes(indexes)
Exemple #30
0
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12'
}

# Walk every tweet, and for each hashtag keep a per-day counter in a
# collection named after the hashtag.
for tweet in db.tweets.find():
    # created_at looks like "Wed Aug 27 13:08:45 +0000 2008":
    # build YYYYMMDD from the year + month abbreviation + day fields.
    tweetDate = tweet["created_at"].split(" ")[5] + month[
        tweet["created_at"].split(" ")[1]] + tweet["created_at"].split(" ")[2]

    for hashtag in tweet["entities"]["hashtags"]:
        hashtext = hashtag["text"].lower()

        nameExist = False
        for name in db.collection_names():
            if name == hashtext:
                # Bug fix: was `nameExist == True`, a no-op comparison, so
                # the flag never changed. Assign and stop scanning.
                nameExist = True
                break
        if nameExist == False:
            # NOTE(review): accessing db[hashtext] does not actually create
            # the collection until the first write; the insert below does.
            db[hashtext]  # create collection with hashtag as the name

        result = db[hashtext].find_one({"Date": tweetDate})
        if result == None:
            db[hashtext].insert_one({
                "_id": tweetDate,
                "Date": tweetDate,
                "count": 1
            })
        else:
            db[hashtext].update_one({"_id": tweetDate}, {"$inc": {"count": 1}})
 def startTest(self, test):
     """Reset the Mongo test database: empty every non-system collection."""
     database = MongoClient(
         'localhost',
         settings.MONGO_DATABASE_PORT)[settings.MONGO_DATABASE_NAME]
     for name in database.collection_names():
         if name == 'system.indexes':
             continue
         database[name].remove({})
Exemple #32
0
#Code for setting all intial values in a proper manner to start bot
import redis
import pickle
from pymongo import MongoClient
import datetime

# Mongo database that holds the quote collections (one per category).
mydb = MongoClient('localhost')['goodread']

# Redis instance that keeps the daily counters.
red = redis.Redis()
red.set('per_day', 120)
red.set('today_balance', 0)
red.set('today_or_yesterday', datetime.datetime.today().timetuple().tm_mday)

# Cache every category name and its quote ids in redis for later use.
for category in mydb.collection_names():
    if category == 'system.indexes':
        continue
    for doc in mydb[category].find():
        red.sadd(category, str(doc['_id']))
    red.sadd('cats', category)

# Map each day of the month (1-based) to a category.
days_cats = {index + 1: cat for index, cat in enumerate(red.smembers('cats'))}

# Persist the day-to-category mapping for the bot to load later.
with open('dayscats.pickle', 'wb') as f:
    pickle.dump(days_cats, f)


Exemple #33
0
# Set up and configure app
# .................................
app = Flask(__name__, static_folder='static', static_url_path='')

# .................................
# Load configuration
# .................................
app.config.from_object(config.load())

# .................................
# MongoDB
# .................................
# configure mongo client (defaults to localhost:27017), and get 'masfotos' db
db = MongoClient()[app.config.get('MONGO_DB')]

# Seed the share-key sequence counter on first run.
if 'counters' not in db.collection_names():
    db.counters.save({'_id': 'share_key_seq', 'seq': 0})

def get_next_seq(name):
	"""Atomically increment the named counter and return its new value."""
	doc = db.counters.find_and_modify(
		query={'_id': name},
		fields={'_id': 0},
		update={'$inc': {'seq': 1}},
		new=True)
	return doc['seq']


# Wrap the default session serializer with our custom one so that _id
# values are normalized as oid inside the session.
app.session_interface.serializer = utils.SimpleObjectIdReplacingSerializer(
	app.session_interface.serializer)
Exemple #34
0
from pymongo import MongoClient
# Connect to the local 'testdb' database and start from a clean slate.
database = MongoClient().testdb
database.things.drop()

# Show what collections remain, then dump any leftover documents.
print(database.collection_names())
for document in database.things.find():
    print(document)
Exemple #35
0
class HRTDatabase:
    """Data-access layer for the transit (GTFS/checkin) Mongo database."""

    def __init__(self):
        # MONGODB_URI encodes host, credentials and the default database.
        self.database = MongoClient(os.environ['MONGODB_URI']).get_database(None)

    def removeOldGTFS(self, date):
        """Drop every dated collection whose name does not end with *date*."""
        print "Removing Old GTFS Data"
        # genCollectionName('', date) yields just the YYYYMMDD date suffix.
        collection_prefix = self.genCollectionName('', date)
        for collection in self.database.collection_names():
            # `collection` is a name string: str.find('_') != -1 means the
            # name contains an underscore, i.e. it is a dated collection.
            if collection.find('_') != -1 and (not collection.endswith(collection_prefix)):
                self.database.drop_collection(collection)

    def insertGTFS(self, data, date):
        """Replace the dated 'gtfs_' collection with *data*."""
        self.insertData(self.genCollectionName('gtfs_', date), data)

    def insertStops(self, data, date):
        """Replace the dated 'stops_' collection and index it geospatially."""
        collection_name = self.genCollectionName('stops_', date)
        self.insertData(collection_name, data)
        self.database[collection_name].ensure_index([('location', GEO2D)])

    def insertRoutes(self, data, date):
        """Replace the dated 'routes_' collection with *data*."""
        self.insertData(self.genCollectionName('routes_', date), data)

    def insertDestinations(self, data, date):
        """Replace the dated 'destinations_' collection with *data*."""
        self.insertData(self.genCollectionName('destinations_', date), data)

    def getStopName(self, stop_id, date):
        """Return the stop name for *stop_id* from the dated stops collection."""
        collection_name = self.genCollectionName('stops_', date)
        stop = self.database[collection_name].find_one({"stopId": stop_id})
        return stop['stopName']

    def getFinalStops(self, date):
        """Return the last stop (id + sequence) of each trip for *date*.

        NOTE(review): $last relies on document order within each group; there
        is no $sort stage, so this assumes insertion order is sequence order
        -- confirm against the GTFS import.
        """
        collection_name = self.genCollectionName('gtfs_', date)
        final_stops = self.database[collection_name].aggregate([
            {"$group": {
                "_id": "$trip_id",
                "stopId": {"$last": "$stop_id"},
                "sequence": {"$last": "$stop_sequence"}
            }}
        ])
        return final_stops

    def generateIndicesForGTFS(self, date):
        """Build (in the background) the query indexes used on GTFS data."""
        collection_name = self.genCollectionName('gtfs_', date)
        self.database[collection_name].create_index([
            ("block_id", ASCENDING)
        ], background=True)
        self.database[collection_name].create_index([
            ("block_id", ASCENDING),
            ("arrival_time", ASCENDING)
        ], background=True)
        self.database[collection_name].create_index([
            ("block_id", ASCENDING),
            ("actual_arrival_time", ASCENDING)
        ], background=True)
        self.database[collection_name].create_index([
            ("stop_id", ASCENDING),
            ("arrival_time", ASCENDING),
            ("actual_arrival_time", ASCENDING)
        ], background=True)
        self.database[collection_name].create_index([
            ("route_short_name", ASCENDING),
            ("stop_id", ASCENDING),
            ("direction_id", ASCENDING),
            ("arrival_time", ASCENDING)
        ], background=True)
        self.database[collection_name].create_index([
            ("route_short_name", ASCENDING),
            ("stop_id", ASCENDING),
            ("direction_id", ASCENDING),
            ("departure_time", ASCENDING)
        ], background=True)

    def genCollectionName(self, prefix, date):
        """Return *prefix* + the date formatted as YYYYMMDD."""
        return prefix + date.strftime('%Y%m%d')

    def insertData(self, collection_name, data):
        """Replace the collection's contents with *data* (no-op when empty)."""
        if len(data) > 0:
            self.database[collection_name].remove()
            self.database[collection_name].insert_many(data)


    # get bus route mappings that are not more than 30 minutes old
    def getBusRouteMappings(self):
        """Return {busId: mapping} for mappings newer than 30 minutes."""
        mappings = {}
        for mapping in self.database['busRouteMappings'].find():
            if mapping['time'] > datetime.utcnow() + timedelta(minutes=-30):
                mappings[mapping['busId']] = mapping
        return mappings

    def setBusRouteMappings(self, mappings):
        """Replace all stored bus-route mappings with *mappings*."""
        self.database['busRouteMappings'].remove()
        if len(mappings) > 0:
            self.database['busRouteMappings'].insert(mappings)

    # return the last time to the minute that a bus checked in and
    # a list of all buses that checked in during that minute
    def getLastCheckinSummary(self):
        """Return {'time', 'busIds'} for the most recent checkin minute, or None."""
        if self.database['checkins'].find().count() > 0:
            # $natural descending sort gives the most recently inserted doc.
            last_time = self.database['checkins'].find().sort("$natural", -1)[0]["time"]
            last_buses = self.database['checkins'].find({"time" : last_time}).distinct("busId")
            return {"time": last_time.replace(tzinfo=pytz.UTC), "busIds": last_buses}
        return None

    def updateCheckins(self, checkins):
        """Purge checkins older than 2 hours, then store the new batch."""
        # purge checkins that are more than 2 hours
        self.database['checkins'].remove({"time": {"$lt": datetime.utcnow() + timedelta(hours=-2)}})
        if len(checkins) > 0:
            self.database['checkins'].insert(checkins)

    def getRealTimeArrivalUpdates(self, checkin):
        """Build UpdateOne ops adjusting actual_arrival_time from a checkin.

        Returns (collection_name, [UpdateOne, ...]).
        NOTE(review): hours=-5 hard-codes the UTC-to-local offset (no DST) --
        confirm this matches the deployment timezone.
        """
        checkin_local_time = checkin.time + timedelta(hours=-5)
        collection_name = 'gtfs_' + checkin_local_time.strftime('%Y%m%d')
        # Window: stops due between 5 minutes ago and 30 minutes from now,
        # by either scheduled (adherence-shifted) or actual arrival time.
        stop_times = self.database[collection_name].find({
            'block_id': checkin.blockId,
            '$or': [
                {'arrival_time': {
                    '$gte': datetime.utcnow() + timedelta(minutes=-5-checkin.adherence),
                    '$lte': datetime.utcnow() + timedelta(minutes=30-checkin.adherence)
                }},
                {'actual_arrival_time': {
                    '$gte': datetime.utcnow() + timedelta(minutes=-5),
                    '$lte': datetime.utcnow() + timedelta(minutes=30)
                }}
            ]
        }, {'arrival_time': 1, 'actual_arrival_time': 1})
        updates = []
        for stoptime in stop_times:
            # Shift the scheduled arrival by the bus's adherence (minutes).
            new_arrival_time = stoptime['arrival_time'] - timedelta(minutes=checkin.adherence)
            if 'actual_arrival_time' not in stoptime or new_arrival_time != stoptime['actual_arrival_time']:
                updates.append(UpdateOne(
                    {'_id': stoptime['_id']},
                    {'$set': {'actual_arrival_time': new_arrival_time}}
                ))
        return (collection_name, updates)

    def updateRealTimeArrivals(self, updates):
        """Apply {collection_name: [UpdateOne, ...]} batches via bulk_write."""
        for collection_name in updates:
            if updates[collection_name]:
                result = self.database[collection_name].bulk_write(updates[collection_name])
                #print result.bulk_api_result

    def getScheduledStop(self, checkin):
        """Find the scheduled stop matching a checkin (or None, with a log).

        NOTE(review): hours=-5 hard-codes the UTC offset here too.
        """
        checkin_local_time = checkin.time + timedelta(hours=-5)
        collection_name = 'gtfs_' + checkin_local_time.strftime('%Y%m%d')
        # Match within +/-2 minutes of the adherence-adjusted checkin time.
        scheduled_stop = self.database[collection_name].find_one({
            "route_short_name" : checkin.routeShortName,
            "stop_id": checkin.stopId,
            "direction_id": {"$ne": checkin.direction},
            "$or": [
                {"arrival_time": {
                    "$gte": checkin.time + timedelta(minutes=checkin.adherence - 2),
                    "$lte": checkin.time + timedelta(minutes=checkin.adherence + 2)
                }},
                {"departure_time": {
                    "$gte": checkin.time + timedelta(minutes=checkin.adherence - 2),
                    "$lte": checkin.time + timedelta(minutes=checkin.adherence + 2)
                }}
            ]
        })
        if scheduled_stop is None:
            print "No scheduled stop found for the following checkin in {0}".format(collection_name)
            print checkin.__dict__
            return None

        # get the stop sequence that OneBusAway uses
        scheduled_stop['stop_sequence_OBA'] = self.database[collection_name].find({
            "trip_id": scheduled_stop["trip_id"],
            "stop_sequence": {"$lt": scheduled_stop["stop_sequence"]}
        }).count()

        return scheduled_stop
Exemple #36
0
    for fileName in fileNames:
        if not dirPath in ignoreDirs:
            entry = os.path.normpath(os.path.join(dirPath, fileName))
            fileList[entry] = False

##### Connect to the Database #####
db = MongoClient(connString)
for database in db.database_names():
    if database != "admin" and database != "local" and database != "notifications":
        db = MongoClient(connString)[database]
        if verbose:
            print("--database:" + database)

##### Get a model ID and find entries #####
        regex = re.compile(".+\.ref$")
        for colName in db.collection_names():
            result = regex.match(colName)
            if result:
                if verbose:
                    print("\t--collection:" + colName)
                for refEntry in db[colName].find({"type": "fs"}):
                    filePath = os.path.normpath(
                        os.path.join(localFolder, refEntry['link']))
                    inIgnoreDir = bool(
                        [x for x in ignoreDirs if filePath.find(x) + 1])
                    if not inIgnoreDir:
                        fileStatus = fileList.get(filePath)
                        if fileStatus == None:
                            refInfo = database + "." + colName + ": " + refEntry[
                                "_id"]
                            if dryRun:
Exemple #37
0
from pymongo import MongoClient
# Open the local 'testdb' database and clear the 'things' collection.
store = MongoClient().testdb
store.things.drop()

# List the remaining collections, then print any surviving documents.
print(store.collection_names())
for record in store.things.find():
    print(record)
Exemple #38
0
class Output():
    """Export dialogue/topic data from MongoDB collections to text files."""

    def __init__(self, ip, port, db_name):
        """Connect to the named database plus the shared 'common' database."""
        self.db_name = db_name
        self.db = MongoClient(ip, port)[db_name]
        self.db_common = MongoClient(ip, port)['common']
        self.collections = self.db.collection_names()

    def _write_lines(self, path, lines, mode='w'):
        """Write each item of *lines* to *path*, one per line (mode 'a' appends).

        Fix: the original opened files without a context manager, leaking the
        handle if a write raised; this also removes the duplicated
        open/write/close pattern.
        """
        with open(path, mode, encoding='utf-8') as f:
            for line in lines:
                f.write(line + '\n')

    def write_dialogue(self):
        """Dump dialogue questions grouped by intention, one file per intention."""
        dirpath = self.db_name + '_dialogue_data'
        # Recreate the output directory from scratch.
        if os.path.exists(dirpath):
            shutil.rmtree(dirpath)
        os.mkdir(dirpath)
        data = {}
        for x in self.db['dialogue'].find():
            key = x['intention']
            # Prefix every question with its super intention.
            data[key] = data.setdefault(key, []) + list(
                map(lambda q: x['super_intention'] + q, x['equal_questions']))
        for k in data.keys():
            self._write_lines(os.path.join(dirpath, k), data[k])

    def write_topic_dialogue(self, filepath):
        """Dump the deduplicated dialogue questions to *filepath*."""
        data = [x['equal_questions'] for x in self.db['dialogue'].find()]
        self._write_lines(filepath, set(sum(data, [])))

    def write_topic_common(self, dirpath):
        """Dump the shared 'common' database collections into *dirpath*."""
        data = [x['question'] for x in self.db_common['repeat_guest'].find()]
        self._write_lines(os.path.join(dirpath, 'repeat_guest'), set(data))
        data = [x['question'] for x in self.db_common['repeat_machine'].find()]
        self._write_lines(os.path.join(dirpath, 'repeat_machine'), set(data))
        data = [
            x['equal_questions'] for x in self.db_common['interaction'].find()
        ]
        # 'interaction' is opened in append mode, matching the original:
        # write_topic may already have written a file of the same name.
        self._write_lines(os.path.join(dirpath, 'interaction'),
                          set(sum(data, [])), mode='a')

    def write_topic_collection(self, filepath, doc_name):
        """Dump the deduplicated questions of collection *doc_name* to *filepath*."""
        data = [x['equal_questions'] for x in self.db[doc_name].find()]
        self._write_lines(filepath, set(sum(data, [])))

    def write_topic(self):
        """Dump per-topic data files into '<db_name>_topic_data'."""
        dirpath = self.db_name + '_topic_data'
        # Recreate the output directory from scratch.
        if os.path.exists(dirpath):
            shutil.rmtree(dirpath)
        os.mkdir(dirpath)
        for collection in self.collections:
            if collection == 'dialogue':
                self.write_topic_dialogue(os.path.join(dirpath, 'dialogue'))
            elif collection in ['greeting', 'qa', 'sale', 'interaction']:
                self.write_topic_collection(os.path.join(dirpath, collection),
                                            collection)
        self.write_topic_common(dirpath)
Exemple #39
0
class PubMedCrawler:
    """
    Search and collect data from PubMed:
      1) .search(query)       -> grab all pmids involved in a query, return a "query_name"
      2) .collect(query_name) -> save abstract+meta of pmids previously found
    """

    # Entrez paging window: at most this many records per e-utils request.
    PAGE = 100000

    def __init__(self):
        Entrez.email = "*****@*****.**"
        Entrez.tool = "tool_name"
        self.db = MongoClient()['pubmed']

    def search(self, query, reldate='', refresh=False, warning=True):
        """Search ids related to query terms and return them if:
                1) already cached OR
                2) queried to Entrez and cached
        :param query: must be a string with blank spaced words (es: 'gene protein')
        :param reldate: docs returned are no more older than this (in days), leave blank('') to not set
        :param refresh: re-query Entrez even if the ids are cached
        :param warning: ask for confirmation before large downloads
        :return: (query_name, cursor over the cached pmid documents)
        """
        query_name = '_'.join(query.split())
        ids_name = 'q.' + query_name + '.ids'
        # 0. Query cached? Skip
        if self.db[ids_name].count() == 0 or refresh:
            proceed = 'y'
            # 1. How many entries?
            result = self._esearch(query, reldate)
            # 2. Report (integer division + one page for the remainder)
            result_count = int(result['Count'])
            n_query = result_count // self.PAGE + 1
            if warning:
                # 3. Many results? Ask before proceeding
                print('{0} IDs found. It will take {1} queries to save them all.'
                      .format(result_count, n_query))
                proceed = input("Press 'y' to continue: ").lower()
            if proceed == 'y':
                # 4. Query and save entries
                print('Saving {} IDs...'.format(result_count))
                self.db[ids_name].drop()      # drop if exists (maybe a previous query)
                for n in range(n_query):      # sliding window to collect more than 100k data
                    result = self._esearch(query, reldate,
                                           retstart=n * self.PAGE,
                                           retmax=self.PAGE)
                    # The last page can be empty; insert_many rejects [].
                    if result['IdList']:
                        self.db[ids_name].insert_many(
                            [{'_id': i} for i in result['IdList']],
                            bypass_document_validation=True)
                time.sleep(1)   # 1 sec of sleep, just to be safe...
                print('...Done!')
        # return (query name, cursor to collected pmid)
        return query_name, self.db[ids_name].find()

    @staticmethod
    def _esearch(query, reldate='', retstart=0, retmax=0):
        """Envelope an ESearch query.
        :param retmax: 0 (default) returns only a result['Count'] result
        :return: result['Count'] = # record found | result['IdList'] = ['123',...]
        """
        handle = Entrez.esearch(db='pubmed', term=query, retmode='xml',
                                reldate=reldate, retstart=retstart, retmax=retmax)
        result = Entrez.read(handle)
        handle.close()
        return result

    def collect(self, query_name, from_chunk=0, refresh=False, warning=True):
        """Save abstract+meta from a previously searched query.
        :param from_chunk: resume from this chunk index (see console output)
        :return: (query_name, cursor over the collected documents) or None
        """
        ids_name = 'q.' + query_name + '.ids'
        # 0. Query exists?
        if ids_name not in self.db.collection_names():
            print("No query found. Use search() before collect.")
            return None
        # 1. Query cached? Skip
        collect_name = 'q.' + query_name + '.data'
        # Default answer: was an unbound name when warning=False (NameError).
        proceed = 'y'
        # 2. Many ids? Ask before proceeding
        if warning:
            print('{0} IDs found.'.format(self.db[ids_name].count()))
            proceed = input("Press 'y' to continue: ").lower()
        if proceed == 'y':
            # 3. Query and collect data
            if refresh:
                self.db[collect_name].drop()  # drop if exists (maybe a previous query)
            # 4. If specified, resume from the chunk requested
            ids_chunks = self._chunk(
                self.db[ids_name].find(no_cursor_timeout=True)
                                 .skip(from_chunk * 10000), 10000)
            for n, chunk in enumerate(ids_chunks, from_chunk):
                naked_ids = [d['_id'] for d in chunk]
                print('Chunk {0} > saving {1} articles: {2} ~ {3} ...'.format(
                    n, len(naked_ids), naked_ids[0], naked_ids[-1]))
                result = self._efetch(naked_ids, retmax=10000)
                # 5. If an article can't be read, skip it
                result_docs = [doc for doc in (self._article2json(r) for r in result)
                               if doc is not None]
                self.db[collect_name].insert_many(result_docs,
                                                  bypass_document_validation=True)
                time.sleep(1)   # 1 sec of sleep, just to be safe...
            print('...Done!')
        # return (query name, cursor to collected data)
        return query_name, self.db[collect_name].find()

    @staticmethod
    def _article2json(article):
        """Flatten an Entrez article record into a plain dict, or None when
        the PMID is missing. Every other field is optional and skipped when
        absent (the original 'except A or B' clauses only caught A)."""
        try:
            doc = {'_id': str(article['MedlineCitation']['PMID'])}  # no PMID -> skip article
        except KeyError:
            return None
        try:
            doc['title'] = article['MedlineCitation']['Article']['ArticleTitle']
        except KeyError:
            pass
        try:
            doc['abstract'] = ' '.join(
                article['MedlineCitation']['Article']['Abstract']['AbstractText'])
        except KeyError:
            pass
        try:
            doc['date'] = datetime.datetime(
                int(article['MedlineCitation']['DateCreated']['Year']),
                int(article['MedlineCitation']['DateCreated']['Month']),
                int(article['MedlineCitation']['DateCreated']['Day']))
        except (ValueError, KeyError):
            pass
        try:
            doc['authors'] = [a['LastName'] + ',' + a['ForeName']
                              for a in article['MedlineCitation']['Article']['AuthorList']]
        except KeyError:
            pass
        try:
            doc['journal'] = article['MedlineCitation']['Article']['Journal']['Title']
        except KeyError:
            pass
        try:
            # str() instead of .encode('utf-8'): keeps text (not bytes) on py3.
            doc['keywords'] = [str(kws)
                               for kws in article['MedlineCitation']['KeywordList'][0]]
        except (IndexError, KeyError):
            pass
        try:
            doc['mesh'] = [str(mesh['DescriptorName'])
                           for mesh in article['MedlineCitation']['MeshHeadingList']]
        except KeyError:
            pass
        return doc

    @staticmethod
    def _chunk(cursor, size):
        """Collect data into fixed-size chunks until the cursor is exhausted.
        Avoids cursor.count() and 'raise StopIteration' inside a generator
        (a RuntimeError since PEP 479); always closes the cursor."""
        chunk = []
        for doc in cursor:
            chunk.append(doc)
            if len(chunk) == size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk     # final, partial chunk
        cursor.close()

    @staticmethod
    def _efetch(ids, retstart=0, retmax=0):
        """Envelope an EFetch query.
        :param ids: must be a list (single query ['25683065'])
        :param retmax: 0 (default) returns nothing, always set to # of results required
        :return:
        result[0]['MedlineCitation'] -> get first article obj
            ...['PMID'] -> pmid: apply str()
            ...['DateCompleted'] + ['Year'] | ['Month'] | ['Day']: apply datetime.datetime()
            ...['Article']['ArticleTitle'] -> article title
                       ...['Journal']['Title'] -> journal title
                       ...['Abstract']['AbstractText'] -> list of string composing abstract
                       ...['AuthorList'] -> list of author
                                    ...[0]['LastName'] -> surname
                                    ...[0]['ForeName'] -> name
            OPT:
            ...['KeywordList'] -> list of string keywords (arguments): apply str(k)
            ...['MeshHeadingList'] -> list of l['DescriptorName']: apply str() (ignoring QualifierName)
        """
        url_ids = ','.join(ids)
        handle = Entrez.efetch(db='pubmed', id=url_ids, retmode='xml',
                               retstart=retstart, retmax=retmax)
        result = Entrez.read(handle)
        handle.close()
        return result
from pymongo import MongoClient
from datetime import datetime

# Per-collection coverage report: each collection is assumed to hold one
# document per day with a 'date' field formatted 'YYYY/MM/DD'. For every
# collection, print the first/last recorded dates and the percentage of
# days in that span that actually have a record.
DATE_FORMAT = '%Y/%m/%d'

db = MongoClient("mongodb://localhost:27017/")['test']
collections = db.collection_names()
for coll in collections:
    # Sort explicitly on 'date' (lexicographic == chronological for this
    # zero-padded format) instead of relying on insertion order, and avoid
    # the O(n) skip(count - 1) scan for the last document.
    first_doc = db[coll].find().sort('date', 1).limit(1)[0]
    last_doc = db[coll].find().sort('date', -1).limit(1)[0]
    first_date = datetime.strptime(first_doc['date'], DATE_FORMAT).date()
    last_date = datetime.strptime(last_doc['date'], DATE_FORMAT).date()
    number_of_days_between_dates = (last_date - first_date).days + 1
    number_of_days_in_record = db[coll].count()
    percent_complete = float(number_of_days_in_record) / float(number_of_days_between_dates) * 100
    print('Customer: {0}, First: {1}, Last: {2}, Percentage complete: {3:.0f}%'.format(
        coll, first_date, last_date, round(percent_complete)))
Exemple #41
0
        game = discord.Game(f"{self.config.prefix}help for help!")
        await self.change_presence(status=discord.Status.idle, activity=game)
        print("Resumed.")


# Connects to MongoDB

# Load config.json and expose it with attribute-style access: every JSON
# object becomes a mutable namedtuple-like "recordclass" via object_hook.
with open("./config.json", "r", encoding="utf8") as file:
    # NOTE(review): dumps(load(...)) only re-serialises the freshly parsed
    # JSON; json.load(file, object_hook=...) would be equivalent and skip
    # one encode/decode pass.
    data = json.dumps(json.load(file))
    config = json.loads(data,
                        object_hook=lambda d: recordclass("config", d.keys())
                        (*d.values()))

db_client = MongoClient(config.uri)[config.db]
try:
    # Cheap round-trip that forces a real server connection.
    db_client.collection_names()
except Exception:
    # Degrade gracefully: the rest of the program runs without persistence.
    db_client = None
    logger.warning(
        "MongoDB connection failed. There will be no MongoDB support.")


def _prefix_callable(bot, msg):
    base = [f"<@!{bot.user.id}> ", f"<@{bot.user.id}> "]

    try:
        db = Mongo(db_client, "guilds")
        guild_db = db.find(str(msg.guild.id))
        if not msg.guild:
            base.append(config.prefix)
        elif not guild_db:
Exemple #42
0
class Finder(object):
    """Interactive CLI front-end for a MongoDB database.

    Keeps per-session filter and sort state, and dispatches user commands
    of the form 'cmd arg1 arg2 ...' to the matching do_* method.
    """

    def __init__(self, uri, db, verbose=True, timeout=3, **kwargs):
        self.uri = uri
        self.verbose = verbose
        self.timeout = timeout  # seconds to wait before a reconnect attempt
        try:
            self.conn = MongoClient(self.uri, **kwargs)[db]
            print("Connected to {conn}".format(conn=self.conn))
        except pymongo.errors.ConnectionFailure as e:
            print("Couldn't connect to MongoDB: {e}".format(e=e))
        self.filters = defaultdict(list)  # filter key -> list of values
        self.sort_props = {}              # sort prop -> 'asc' | 'des'
        self.config = {}

    def _get_suggestions(self, cmd):
        """Return do_* command names containing *cmd* as a substring."""
        return [method[len('do_'):]
                for (method, _) in inspect.getmembers(self)
                if cmd in method and method.startswith('do_')]

    def _print_suggestions(self, cmd):
        """Print did-you-mean suggestions for an unrecognized command."""
        suggs = self._get_suggestions(cmd)
        if len(suggs) == 0:
            # Raw string: the original "\(" was an invalid escape sequence.
            print(r"I don't understand ¯\(°_o)/")
            return
        print("Do you mean?:")
        for sugg in suggs:
            print("    " + sugg)

    def query_filters(self):
        """Parse currently stored filter values into a pymongo query dict"""
        def filter_value(val):
            if len(val) == 1:
                filter_val = val[0]
                is_regex = re.match(r"/([^/]+)/", filter_val)
                if is_regex:
                    return {"$regex": is_regex.groups()[0]}
                else:
                    return filter_val
            if len(val) > 1:
                return {"$in": val}
        # Skip empty value lists: get_filter() on the defaultdict creates
        # empty entries, which would otherwise map to a bogus {key: None}.
        return {f: filter_value(val)
                for (f, val) in self.filters.items() if val}

    def parse(self, text):
        """Dispatch *text* ('cmd arg1 arg2 ...') to the matching do_* method."""
        cmd, *args = text.split()
        # Resolve the command in its own try-block: an AttributeError raised
        # *inside* a command must not be masked as "unknown command".
        try:
            method = getattr(self, "do_" + cmd)
        except AttributeError:
            self._print_suggestions(cmd)
            return
        try:
            return method(args)

        except pymongo.errors.AutoReconnect:
            print("Reconnecting... in %d seconds" % self.timeout)
            time.sleep(self.timeout)
            return
        # AutoReconnect is a subclass of ConnectionFailure
        except pymongo.errors.ConnectionFailure as e:
            print("MongoDB Exception {te}: {e}".format(te=type(e), e=e))

    def add_sort(self, prop, order='des'):
        """Record a sort criterion ('asc' or 'des') for *prop*."""
        self.sort_props[prop] = order

    def clear_sort_props(self):
        """Drop every stored sort criterion."""
        self.sort_props = {}

    def get_filter(self, filter_key):
        """Return the list of values stored for *filter_key* (empty if new).

        NOTE: indexing the defaultdict creates an empty entry as a side
        effect; do_filter relies on getting the live list object back.
        """
        return self.filters[filter_key]

    def add_to_filter(self, filter_key, filter_val):
        """Append *filter_val* to the values for *filter_key*."""
        self.filters[filter_key].append(filter_val)
        if self.verbose:
            pprint(self.query_filters())

    def reset_filter(self, filter_key, filter_val):
        """Replace the values for *filter_key* with the single *filter_val*."""
        self.filters[filter_key] = [filter_val]
        if self.verbose:
            pprint(self.query_filters())

    def clear_filters(self):
        """Drop every stored filter."""
        print("Cleared all filters")
        self.filters = defaultdict(list)

    def clear_filter(self, filter_key):
        """Drop the filter stored for *filter_key*, if any."""
        if filter_key in self.filters:
            print("Cleared filter {f}".format(f=filter_key))
            del self.filters[filter_key]
        if self.verbose:
            pprint(self.query_filters())

    """
    API functions. First docstr line is used for in-line autocompletion help.
    API-methods must start with do_ to be picked by the CLI-parser.
    Each API-method gets passed a list of 0 or more arguments.
    Argument validation must be done via one of:
       assert custom_assertion, "Informative assertion message",
         It checks for a custom assertion and displays a messsage to the user.
       raise ParseError(given_value, expected_value),
         Custom exception class. This is useful for fine-grained exception
         handling in case of methods that accept arglists (e.g. do_filter).
    API-methods can return None or a (possibly nested) generator of
    (prompt_text, input_callback), such that the prompt_text is shown to the
    user and the input_callback is called on the user's response.
    Note that input_callback can itself return another generator that will be
    called on the user response to the first prompt (thereby allowing for
    recursive workflows - see `do_filter` for an example).
    """

    def do_show(self, args):
        """
        Show the current value of a particular settings (e.g. filters).
        """
        vals = ('filters', 'sort')
        # Message fixed: exactly one value is required, not "at least" one.
        assert len(args) == 1, "Specify exactly one value"
        assert args[0] in vals, "Specify one of {vals}".format(vals=vals)
        if args[0] == 'filters':
            for f in self.filters:
                fs = ', '.join(self.filters[f])
                print('    {f} => {fs}'.format(f=f, fs=fs))
        elif args[0] == 'sort':
            for sort_prop, order in self.sort_props.items():
                print('    {s}[{o}]'.format(
                    s=sort_prop,
                    o={'asc': 'ascending', 'des': 'descending'}[order]))

    def do_filter(self, args):
        """
        Add a filter to the query.
        Multiple filters can be added using the following syntax:
          filter key1:value key2:value ...
        Input a key to clear the filter for that key:
          filter username
        Input no arguments to clear all filters:
          filter
        Possible keys are [username, corpus, query]
        """

        ALREADY_EXISTS = \
            "A value for filter {key} already exists, " + \
            "do you want to (o)verwrite or (c)oncatenate?"

        def callback(res):
            if res == 'o':
                self.reset_filter(key, val)
            elif res == 'c':
                self.add_to_filter(key, val)
            else:
                yield "Please answer (o,c)", callback

        if not args:
            self.clear_filters()
            return

        for arg in args:
            if ":" not in arg:  # assume arg is filter_key
                self.clear_filter(arg)
                return
            try:                # assume arg is key:val
                key, val = arg.split(":")
                f = self.get_filter(key)
                if not f:
                    self.add_to_filter(key, val)
                if val in f:
                    continue
                else:
                    yield ALREADY_EXISTS.format(key=key), callback
            except ValueError:
                raise(ParseError(arg, "key:value"))

    def do_sort(self, args):
        """
        Add sort criteria to the query output.
        Multiple sort criteria can be added following the pattern:
          sort field1:asc field2:des
        Specify `asc` or `des` for ascending or descending order.
        Order defaults to descending order.
        Possible value for field are [timestamp, username, corpus].
        """
        orders = ('asc', 'des')
        for arg in args:
            try:
                prop, order = arg.split(':')
                assert order in orders, "Sort order must be in " + str(orders)
                self.add_sort(prop, order=order)
            except ValueError:
                # No ':' in the argument: plain field, default order.
                self.add_sort(arg)

    def do_reconnect(self, args):
        """
        Refresh MongoClient connection.
        """
        raise pymongo.errors.AutoReconnect()

    def do_config(self, args):
        """
        Not implemented yet
        Change session configuration.
          Values:
            page_size: type int, default 10
        """
        raise NotImplementedError()

    def do_exit(self, args):
        """
        Exit the application.
        """
        print("See you soon!")
        sys.exit(0)

    def do_help(self, args):
        """
        Show help. Takes a command and display corresponding info.
        Example:
          help filter
        """
        if len(args) == 0:
            print("Please, specify a command.")
            return
        try:
            cmd, *rest = args   # ignore rest arguments
            print("   " + getattr(self, "do_" + cmd).__doc__.strip())
        except AttributeError:
            self._print_suggestions(cmd)

    def get_coll_names(self):
        """Yield project collection names (prefixed '_', excluding '_vcs')."""
        for coll_name in self.conn.collection_names():
            if coll_name.startswith('_') and coll_name != "_vcs":
                yield coll_name

    def get_project_name(self, coll_name):
        """Strip the leading '_' from a collection name."""
        return coll_name[1:]

    def get_project_names(self):
        """Yield the project name of every project collection."""
        for coll_name in self.get_coll_names():
            yield self.get_project_name(coll_name)

    def get_project(self, project):
        """Return the collection object for *project*."""
        return self.conn['_' + project]

    def get_projects(self):
        """Yield the collection object of every project."""
        for project_name in self.get_project_names():
            yield self.get_project(project_name)

    def do_projects(self, args):
        """
        Show information about projects
          projects: show existing projects
        """
        if len(args) == 0:
            for project_name in self.get_project_names():
                print(project_name)

    def groupcounts(self, project, groupkeys):
        """Count documents matching the current filters, grouped by *groupkeys*."""
        result = []
        cursor = project.aggregate(
            [{"$match": self.query_filters()},
             {"$group": {"_id": {k: "$" + k for k in groupkeys},
                         "count": {"$sum": 1}}}])
        for row in cursor:      # transform mongodb cursor output
            row_dict = {}
            row_dict['count'] = row['count']
            for key, value in row["_id"].items():
                assert key != "count", "Found count field in groupby keys"
                row_dict[key] = value
            result.append(row_dict)
        return result

    def simplecounts(self, project):
        """Count documents matching the current filters."""
        return project.find(self.query_filters()).count()

    def do_count(self, args):
        """
        Count annotations using the current filters.
        Specify a project to get only counts for that project:
          count myProject
        Multiple projects can be specified with commas:
          count myProject,projectTest,otherProject
        """
        assert args, "Specify a project (e.g. GET) or 'all' for all projects"

        if self.verbose:
            pprint(self.query_filters())

        # parse arguments
        project, *rest = args
        parsed_args = parse_rest(rest, {'groupby': ['key1,key2,etc'],
                                        'output':  ['filename']})

        # gather projects
        if project == 'all':
            projects = self.get_project_names()
        else:
            projects = project.split(',')

        by_project = {}
        for project_name in projects:
            project = self.get_project(project_name)

            # compute counts
            if 'groupby' in parsed_args:
                groupkeys = parsed_args['groupby']['key1,key2,etc'].split(',')
                counts = self.groupcounts(project, groupkeys)
                if counts:
                    by_project[project_name] = counts
                else:
                    print("Empty results for project [%s]" % project_name)
            else:
                by_project[project_name] = self.simplecounts(project)

        # display output
        if 'output' in parsed_args:
            outfile = parsed_args['output']['filename']
            ext = get_extension(outfile)
            if ext == 'csv':
                if 'groupby' in parsed_args:
                    writers.csv_count_group(by_project, outfile)
                else:
                    writers.csv_count(by_project, outfile)
            else:
                raise ValueError("Unrecognized extension [%s]" % ext)
        else:
            if 'groupby' in parsed_args:
                writers.print_count_group(by_project)
            else:
                writers.print_count(by_project)
Exemple #43
0
#
# Generates a data summary for a given table that follows the schema in schemas.txt.
#
# Arguments: A list of groups to build indexes for.
# Ex: To build a table for clean_data and all_data, the command should be:
# python generate-indexes.py all clean
#

# At least one collection-group argument is required.
if len(sys.argv) <= 1:
    print("Error: Please enter a collection to index!")
    print("Aborting.")
    sys.exit()

# Default local connection; summaries are built from the 'viraldb' database.
db = MongoClient().viraldb
collections = db.collection_names()

# Each argument names a group: its records live in '<group>_data' and the
# generated summary targets '<group>_indexes'.
for collection in sys.argv[1:]:
    collection_data = collection + '_data'
    collection_indexes = collection + '_indexes'

    # Guard against users passing the suffixed collection names directly.
    if 'data' in collection:
        print("Please do not include 'data' in the collection...")
        print("Skipping.")
        continue
    elif 'index' in collection:
        print("Please do not include 'index' in the collection...")
        print("Skipping.")
        continue
    elif not collection_data in collections:
        print("Collection specified not found!")
        # NOTE(review): listing appears truncated here - this branch prints
        # but never skips; confirm a 'continue'/'sys.exit' follows in the
        # full file.
Exemple #44
0
import configparser  # Py3 name of the Py2 'ConfigParser' module
import random
from pymongo import MongoClient

# MongoDB CRUD demo: insert/save to add, drop/remove to delete,
# update to modify, find to query.
# config = configparser.RawConfigParser()
# config.read(r'C:\Users\ieware\pyexercises\august.conf')  # raw string keeps the backslashes

# try:
#     dbhost = config.get('base', 'dbhost')
# except configparser.NoSectionError as e:
#     print(e)

# The config lookup above is commented out, so bind dbhost explicitly:
# it was previously an unbound name and raised NameError at connect time.
dbhost = 'localhost'

db = MongoClient(dbhost).august  # connect to the 'august' database
# db.authenticate  # user-authentication hook (unused here)
print(db.collection_names())  # equivalent of 'show collections'
db.user.count()  # count documents in the 'user' collection
db.user.drop()   # empty the 'user' collection
db.user.save({'id': 1, 'name': 'nana', 'age': '23'})  # insert one document

# Insert several documents with random names/ages.
# (loop variable renamed: 'id' shadowed the builtin)
for uid in range(2, 10):
    name = random.choice(['lily', 'tom', 'jack'])
    age = random.choice(['23', '34', '58'])
    db.user.insert({'id': uid, 'name': name, 'age': age})

users = db.user.find()  # query every document in 'user'
# Change the first matching age 23 -> 33; upsert would insert when absent,
# multi=True would update every match.
db.user.update({'age': '23'}, {'$set': {'age': '33'}}, upsert=False, multi=False)
db.user.remove({'id': 1})  # delete the document whose id is 1
for x in users:
    print(x['age'], x['name'], x['id'])