def get_user_commits(username, dbname='ansibot', collection='api_commits'):
    '''Retrieve all commits authored by username'''
    client = MongoClient()
    mongo_db = getattr(client, dbname)
    mongo_collection = getattr(mongo_db, collection)
    pipeline = [
        {'$match': {'author.login': username}},
        {'$project': {
            '_id': 0,
            'author.login': 1,
            'commit.committer.date': 1,
            'sha': 1
        }},
        {'$sort': {'commit.committer.date': 1}}
    ]
    cursor = mongo_collection.aggregate(pipeline)
    commits = list(cursor)
    client.close()
    return commits
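# A minimal usage sketch for get_user_commits, assuming a local mongod with the
# 'ansibot' database already populated; the username 'example-user' is a
# hypothetical placeholder, not taken from the original code.
if __name__ == '__main__':
    commits = get_user_commits('example-user')
    for c in commits:
        print(c['sha'], c['commit']['committer']['date'])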
def yearStatistics(request):
    # MongoDB setup
    client = MongoClient(ip, port)
    db = client['twitterdata']
    tweetCollection = db['tweets']

    # Tweets per month
    result = tweetCollection.map_reduce(mapCuentaMeses, reduce, "myresult6")
    for doc in result.find():
        result.update({'_id': doc['_id']}, {'$set': {'date': doc['_id']}})
    anio = result.find(limit=10).sort('date', 1)
    anioArray = []
    for a in anio:
        anioArray.append(a['value'])
    # re-run the query so the template receives an unexhausted cursor
    anio = result.find(limit=10).sort('date', 1)

    # Retweets per month
    result2 = tweetCollection.map_reduce(mapCuentaMesesRT, reduce, "myresult6")
    for doc in result2.find():
        result2.update({'_id': doc['_id']}, {'$set': {'date': doc['_id']}})
    anioRT = result2.find(limit=10).sort('date', 1)
    anioArrayRT = []
    for a in anioRT:
        anioArrayRT.append(a['value'])
    # re-run the query so the template receives an unexhausted cursor
    anioRT = result2.find(limit=10).sort('date', 1)

    client.close()
    # render the template
    return render_to_response('yearStatistics.html', locals())
def load_20_news_group():
    """ Loads the 20 news group corpus into a mongo database """
    mc = MongoClient()
    db = mc["astrology"]
    coll_name = "corpora.twenty_news_group"
    meta_coll_name = "corpora.twenty_news_group.meta"
    # Drop if already exists
    db.drop_collection(coll_name)
    db.drop_collection(meta_coll_name)
    coll = db[coll_name]
    meta_coll = db[meta_coll_name]
    labels = set()
    for batch in get_20_news_group(300, labels):
        coll.insert_many(batch)
    meta_doc = {"labels": list(labels)}
    meta_coll.insert_one(meta_doc)
    coll.create_index("label")
    mc.close()
def get_user_issues(username, dbname='ansibot', collection='api_issue'):
    '''Retrieve all issues opened by username'''
    client = MongoClient()
    mongo_db = getattr(client, dbname)
    mongo_collection = getattr(mongo_db, collection)
    pipeline = [
        {'$match': {'user.login': username}},
        {'$project': {
            '_id': 0,
            'user.login': 1,
            'created_at': 1,
            'html_url': 1
        }},
        {'$sort': {'created_at': 1}}
    ]
    cursor = mongo_collection.aggregate(pipeline)
    issues = list(cursor)
    client.close()
    return issues
def run(host, database, graphname):
    # Create an empty response object.
    response = {}
    collectionNames = []

    # This method traverses the documents in the selected graph collection and builds a JSON
    # object that represents the graph to the application. It might be faster to switch to a
    # standard networkX JSON description, but this is certainly simple and flexible for an
    # initial prototype.
    client = MongoClient(host, 27017)
    db = client[database]

    # select the graph collection
    collection = db[graphname]

    # loop through the records in the network and take the appropriate action for each type
    nodecount = collection.find({'type': 'node'}).count()
    edgecount = collection.find({'type': 'link'}).count()

    # Pack the results into the response object, and return it.
    response['result'] = {}
    response['result']['nodes'] = nodecount
    response['result']['links'] = edgecount
    client.close()

    # Return the response object.
    #tangelo.log(str(response))
    return json.dumps(response)
def lookup_phenotype_results_by_id(id_list: list):
    client = MongoClient(util.mongo_host, util.mongo_port)
    db = client[util.mongo_db]
    obj = dict()
    obj['results'] = list()
    obj['indexes'] = dict()

    try:
        # db.phenotype_results.find({"_id": { $in: [ObjectId("5b117352bcf26f020e392a9c"),
        #                                           ObjectId("5b117352bcf26f020e3926e2")]}})
        # TODO
        ids = list(map(lambda x: ObjectId(x), id_list))
        res = db.phenotype_results.find({"_id": {"$in": ids}})
        obj['results'] = list(res)

        n = 0
        for o in obj['results']:
            id = str(o['_id'])
            obj['indexes'][id] = n
            n = n + 1
    except Exception as e:
        traceback.print_exc(file=sys.stdout)
        obj['success'] = False
    finally:
        client.close()

    return obj
def phenotype_subjects(job_id: str, phenotype_final: bool):
    client = MongoClient(util.mongo_host, util.mongo_port)
    db = client[util.mongo_db]
    results = []
    # db.phenotype_results.aggregate([
    #     {"$match": {"job_id": {"$eq": 10201}, "phenotype_final": {"$eq": true}}},
    #     {"$group": {_id: "$subject", count: {$sum: 1}}}
    # ])
    try:
        q = [
            {"$match": {
                "phenotype_final": {"$eq": phenotype_final},
                "job_id": {"$eq": int(job_id)}
            }},
            {"$group": {
                "_id": "$subject",
                "count": {"$sum": 1}
            }}
        ]
        results = list(db.phenotype_results.aggregate(q))
        results = sorted(results, key=lambda r: r['count'], reverse=True)
    except Exception as e:
        traceback.print_exc(file=sys.stdout)
    finally:
        client.close()
    return results
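# A minimal usage sketch for the two phenotype helpers above, assuming the module's
# `util` settings point at a populated phenotype_results collection; the job id and
# ObjectId strings are illustrative placeholders taken from the commented example.
subject_counts = phenotype_subjects('10201', True)
print('distinct subjects:', len(subject_counts))

by_id = lookup_phenotype_results_by_id(['5b117352bcf26f020e392a9c',
                                        '5b117352bcf26f020e3926e2'])
print('documents fetched:', len(by_id['results']))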
def process(self):
    client = MongoClient('localhost', 44444)
    db_temp_train = client['vsm_all_second']
    collection1_temp_train = db_temp_train['collection1']
    collection2_temp_train = db_temp_train['collection2']
    collection3_temp_train = db_temp_train['collection3']
    collection4_temp_train = db_temp_train['collection4']
    collection5_temp_train = db_temp_train['collection5']
    lineNum = 1
    pat = r"sa(\d)(.*)"
    with open(os.path.join(self.fileroot, self.filename), "r") as fr:
        for line in fr:
            # skip review lines whose format is malformed
            if not re.findall(pat, line):
                print("\n " + str(lineNum) + " something wrong !")
                continue
            result = re.findall(pat, line)
            starNum = result[0][0]
            if starNum == '1':
                collection1_temp_train.insert(dict(content=list(jieba.cut(result[0][1].strip()))))
            elif starNum == '2':
                collection2_temp_train.insert(dict(content=list(jieba.cut(result[0][1].strip()))))
            elif starNum == '3':
                collection3_temp_train.insert(dict(content=list(jieba.cut(result[0][1].strip()))))
            elif starNum == '4':
                collection4_temp_train.insert(dict(content=list(jieba.cut(result[0][1].strip()))))
            elif starNum == '5':
                collection5_temp_train.insert(dict(content=list(jieba.cut(result[0][1].strip()))))
            print('processed {0} lines'.format(lineNum), end='\r\t')
            lineNum += 1
    client.close()
def dump_articles():
    connection = MongoClient('localhost', 27017)
    db = connection.PTEST_BACKUP
    results = db.crawling.find({}, {'_id': False})
    """
    Example document:
    {
        "_id" : ObjectId("54dd29d2b396811764a01330"),
        "url" : "http://www.nasa.gov/pdf/55395main_12%20Earth%20Science.pdf",
        "home" : "NASA",
        "abstract" : "The mission of NASA's Earth Science ... and help answer questions concerning many related aspects of ... forecasters in assessing particulate pollutio ...",
        "title" : "Earth Science - NASA",
        "keyword" : "aerosols+(pollution+aspects)",
        "stored" : true,
        "complete" : false,
        "key" : "aerosols (pollution aspects)",
        "hashed" : "aHR0cDovL3d3dy5uYXNhLmdvdi9wZGYvNTUzOTVtYWluXzEyJTIwRWFydGglMjBTY2llbmNlLnBkZg=="
    }
    """
    # upload via POST endpoint
    from scripts.remote.remote import post_curling
    import json
    for record in results:
        post_curling(_CRAWLING_POST['local'],
                     {'resource': json.dumps(record), 'pwd': _TEMP_SECRET},
                     display=True)
    # close the connection to MongoDB
    connection.close()
def insertTemplate(template):
    dbCon = MongoClient(databasePath)
    database = dbCon['allTemplates']
    collection = database['template']
    template['_id'] = ObjectId()
    collection.insert_one(template)
    dbCon.close()
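# A minimal usage sketch for insertTemplate, assuming `databasePath` is a valid
# MongoDB URI defined elsewhere in the module; the template body shown here is a
# hypothetical example.
insertTemplate({'name': 'example-template', 'fields': ['oid', 'description']})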
class MongoDBPipeline(object): collection_name = 'aqi' items = [] def __init__(self, mongo_uri, mongo_db): self.mongo_uri = mongo_uri self.mongo_db = mongo_db @classmethod def from_crawler(cls, crawler): return cls( mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DATABASE') ) def open_spider(self, spider): print '------------connect to mongodb:', self.mongo_uri self.client = MongoClient(self.mongo_uri) self.db = self.client[self.mongo_db] def close_spider(self, spider): #insert to mongodb when close spider print '------------insert data:',len(self.items) print self.items self.db[self.collection_name].insert_many(self.items) self.client.close() def process_item(self, item, spider): self.items.append(item) #self.db[self.collection_name].insert(dict(item)) return item
def sync():
    client = MongoClient(MONGO_URI)
    bi = client[MONGO_BI]
    target = bi['product_bi_final']
    mapping = bi['mapping']
    for each in mapping.find({}, {'bi_cat_id': 1, 'bi_cat_name': 1, '_id': 0, 'erp_cat_id': 1}):
        result = target.find({'category_id': each['bi_cat_id']},
                             {"updated_time": 1, '_id': 0}).sort([('updated_time', -1)]).limit(1)
        try:
            last_time = next(result)['updated_time']
        except:
            last_time = datetime(1992, 8, 24)
        with mysql_con.cursor() as cur:
            sql = '''
            SELECT updated_time, product_url, sort_num, sort_type, sale_price, sale_num,
                   rating, product_name, product_image, original_price, website_id AS dw_web_id,
                   comment_count, goods_sn
            FROM website_product
            WHERE category_id = %s AND updated_time >= "%s"
            ORDER BY updated_time ASC
            '''
            # parameterised call to guard against SQL injection
            cur.execute(sql, (each['bi_cat_id'] - cat_offset, last_time))
            for item in cur:
                data = {
                    'product_url': item[1],
                    'category_id': each['bi_cat_id'],
                    'erp_cat_id': each['erp_cat_id'],
                    'comment_count': int(item[11]),
                    'currency': "¥",
                    "original_price": 0,
                    'product_image': re.findall(r'https://.+?\.jpg|http://.+?\.jpg', item[8]),
                    'product_name': item[7],
                    'rating': item[6],
                    'sale_num': int(item[5]),
                    'sale_price': int(item[4] * 100),
                    'sort_num': item[2],
                    'sort_type': item[3],
                    'updated_time': item[0],
                    'dw_web_name': dw_web_name,
                    'goods_sn': item[12],
                }
                # goods_sn comes from the current row (it is not a module-level name)
                target.update_one({'goods_sn': data['goods_sn'], 'dw_web_id': item[10] + web_offset},
                                  {'$set': data}, upsert=True)
                print('processing {}'.format(data['product_url']))
    client.close()
def getAllModules():
    dbCon = MongoClient(databasePath)
    database = dbCon['mibModules']
    posts = database['mib']
    # materialise the cursor before closing the connection,
    # otherwise the caller cannot iterate the results
    output = list(posts.find())
    dbCon.close()
    return output
def processAdjustCube(countryThisRoundAdjust): global adjustDict global countryAdjust client = MongoClient() db = client['login_history'] col = db['col_adjust'] today = datetime.datetime.now().date() ds = datetime.datetime(*(today.timetuple()[:6])) for country,adjustList in countryThisRoundAdjust.iteritems(): intervallist = adjustList[0] numlist = adjustList[1] tmpResult = col.update({'recordtime':ds,'targetCountry':country}, {'$set':{'dateintervalList':intervallist,'numlist':numlist}}, upsert = True) print tmpResult for country,adjustList in countryThisRoundAdjust.iteritems(): countrydetail = adjustDict.get(country,dict()) intervallist = adjustList[0] numlist = adjustList[1] for index,interval in enumerate(intervallist): tmplist = countrydetail.get(interval,list()) tmplist.append(numlist[index]) countrydetail[interval] = tmplist adjustDict[country] = countrydetail getCountryAdjust() client.close()
class MyOffsiteMiddleware(OffsiteMiddleware): def __init__(self, *args, **kwargs): super(MyOffsiteMiddleware, self).__init__() self.client = None self.db = None self.link_collection = None def spider_opened(self, spider): super(MyOffsiteMiddleware, self).spider_opened(spider) dbname = settings.MONGO_DB['name'] collection_outlinks = settings.MONGO_DB['outlink_collection'] self.client = MongoClient() self.db = self.client[dbname] collection = self.db[collection_outlinks][spider.collection_name] if collection.name in self.db.collection_names(): self.db.drop_collection(collection.name) self.link_collection = collection def __del__(self): if self.client is not None: self.client.close() def should_follow(self, request, spider): ans = super(MyOffsiteMiddleware, self).should_follow(request, spider) if not ans: lnk = WalkerItem() lnk['status'] = '' lnk['parent'] = request.headers.get('Referer', '') lnk['response_hash'] = '' lnk['type'] = '' lnk['page'] = request.url self.link_collection.insert(dict(lnk)) return ans
def check(self, resource, project_id, timestamp, value):
    client = MongoClient(self.uri)
    collection = client.log_service.quotas
    conditions = {'resource': resource,
                  'project_id': project_id,
                  'timestamp': timestamp,
                  'value': value}
    is_saved = collection.find(conditions).count() > 0
    client.close()
    return is_saved
def remove_peer(peer):
    host = socket.gethostname()
    c = MongoClient(host, 27017)
    db = c.tejo
    status = db.status
    status.remove({'peer': peer})
    c.close()
def main(start_from=None): # Load config information load_config() # Connect to RabbitMQ Queue connection, channel = get_queue_channel(RABBIT_HOST) # Connect to databse client = MongoClient(MONGO_HOST, MONGO_PORT) db = client[MONGO_DB] db.authenticate(MONGO_USER, MONGO_PWD) db_repos = db[MONGO_COLL] last_id = start_from # Last id variable to be advanced reauth = True # Reauth variable to check if we need to reauthenticate for GitHub gh = None # Just for good measure while 1: # Authenticate on GitHub and get all repos if reauth: gh = github3.login(GH_USERS[GH_CUR_USR]['login'], GH_USERS[GH_CUR_USR]['pwd']) repos = gh.iter_all_repos(since=last_id) # Crawl repos reauth, last_id = start_crawl(repos, db_repos, gh, channel, last_id) #Close connection to databse client.close() #Close connection to queue channel.close() connection.close()
class MongoStore(Store): def __init__(self, subscription_id, config): super().__init__(subscription_id) self._client = MongoClient(config['MONGO_HOST'], config['MONGO_PORT']) self._db = self._client.get_database(config['MONGO_DATABASE']) self._collection = self._db[config['MONGO_COLLECTION']] def set_state(self, state): self._collection.replace_one({'_id': self._subscription_id}, state, upsert=True) def set_value(self, key, value): self._collection.update_one({'_id': self._subscription_id}, {'$set': {key: value}}, upsert=True) def push_all(self, key, values): self._collection.update_one({'_id': self._subscription_id}, {'$push': {key: {'$each': values}}}, upsert=True) def get_value(self, key, default=None): state = self.get_state() if state: return state.get(key, default) return default def get_state(self): return self._collection.find_one({'_id': self._subscription_id}) def get_collection(self): return self._collection def close(self): self._client.close()
class MTSGetdataPipeline(object): def __init__(self, mongo_uri, mongo_db): self.mongo_uri = mongo_uri self.mongo_db = mongo_db @classmethod def from_crawler(cls, crawler): return cls( mongo_uri = crawler.settings.get("MONGO_URI"), mongo_db = crawler.settings.get("MONGO_DATABASE") ) def open_spider(self, spider): self.client = MongoClient(self.mongo_uri) self.db = self.client[self.mongo_db] def process_item(self, item, spider): # collection_name = self.__class__.__name__ # tmp = dict(item) # print "***************************", tmp, "!!!!!!!!!!!!!!!!!!!!!!!" # self.db[collection_name].insert(tmp) JsonFile.append(dict(item)) return item # return None def close_spider(self, spider): print len(JsonFile) fp = open("TestMidi.json","wb") fp.write(json.dumps(JsonFile)) self.client.close()
def init_from_mongo(self): client = MongoClient('mongodb://localhost:27017/') db = client.ptt posts = db.gossiping_38k jieba.set_dictionary('extra_dict/dict.txt.big') jieba.analyse.set_stop_words("extra_dict/stop_words_cht.txt") for post in posts.find(): #For content d = defaultdict(int) content = post['content'] if post['score'] != 0: for l in content.split('\n'): if l: for w in jieba.cut(l): d[w] += 1 if len(d) > 0: self.words.append(d) self.scores.append(1 if post['score'] > 0 else 0) #For comments for comment in post['comments']: l = comment['content'].strip() if l and comment['score'] != 0: d = defaultdict(int) for w in jieba.cut(l): d[w] += 1 if len(d) > 0: self.c_words.append(d) self.c_scores.append(1 if comment['score'] > 0 else 0) client.close()
def add_domain(self, domain, ip, key, ttl=120, timestamp=time.time()):
    # note: the timestamp default is evaluated once, at function definition time
    if domain not in self._domains:
        self._domains[domain] = Domain(domain, ip, key, ttl, timestamp)
        # keep a handle on the client itself: close() exists on MongoClient,
        # not on the Database object returned by `.p2pdns`
        client = MongoClient("localhost", 27017)
        client.p2pdns.domains.insert_one({"domain": domain, "ip": ip, "key": key,
                                          "ttl": ttl, "timestamp": timestamp})
        client.close()
class Application(tornado.web.Application): def __init__(self): '''Store necessary handlers, connect to database ''' handlers = [(r"/[/]?", BaseHandler), (r"/GetLocations[/]?", predictionHandlers.GetLocationsHandler), (r"/GetLandmarks[/]?", predictionHandlers.GetLandmarksHandler), (r"/AddLearningData[/]?", predictionHandlers.AddLearningDataHandler), (r"/PredictLocation[/]?", predictionHandlers.PredictLocationHandler) ] settings = {'debug':True} tornado.web.Application.__init__(self, handlers, **settings) self.client = MongoClient() # local host, default port self.db = self.client.exploreSMU # sklearndatabase # database with labeledinstances, models self.clf = [] #self.client.close() # this opened a socket -- lets close that connection def __exit__(self): self.client.close() # just in case
def store_in_mongo(lst_of_dcts, db_name, collection_name, key=None):
    """Store the list of dictionaries in Mongo

    Args:
        lst_of_dcts: List of dictionaries to insert into Mongo.
        db_name: String - database name
        collection_name: String - collection name
    """
    client = MongoClient()
    db = client[db_name]
    collection = db[collection_name]

    if key is not None:
        store_in_mongo_by_key(lst_of_dcts, collection, key)
    else:
        # Check if the length is one, in which case we need to use
        # insert_one. Otherwise, make sure that it's not empty (the
        # `elif` statement below) and then insert many. If it's empty,
        # don't do anything and just close the client.
        if len(lst_of_dcts) == 1:
            collection.insert_one(lst_of_dcts[0])
        elif lst_of_dcts:
            collection.insert_many(lst_of_dcts)

    client.close()
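# A minimal usage sketch for store_in_mongo, assuming a local mongod; the database
# and collection names are hypothetical placeholders.
store_in_mongo([{'title': 'doc-1'}, {'title': 'doc-2'}],
               db_name='scratch', collection_name='examples')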
def write2mongo(stations):
    """
    Write the station information into MongoDB.
    :param stations: station records; the fields of a single station are separated by '|'
    :return:
    """
    try:
        logger = logging.getLogger(__name__)
        logger.info('starting write station info')
        con = MongoClient('localhost', 27017)  # connect to MongoDB
        data_list = []
        for station in stations:
            parts = ('' + station).split('|')
            data = {
                'Chinese': parts[1],
                'ext': parts[2],
                'pinyin': parts[3],
                'abbr': parts[4],
                'order': parts[5]
            }
            data_list.append(data)
        local = con.get_database('python')
        collection = local.get_collection('12306_station')
        collection.remove()  # clear existing data before insert, to avoid duplicates
        collection.insert_many(data_list)
        con.close()
        logger.info('write {0} station info : Done'.format(len(data_list)))
    except IOError as e:
        logger.error(e)
class MongoOperator:
    def __init__(self, db):
        self.dbName = db

    def setUpConnection(self):
        self.client = MongoClient("localhost", 27017)
        self.db = self.client[self.dbName]

    def setUpCollection(self, collName):
        if collName in self.db.collection_names():
            self.collection = self.db.get_collection(collName)
        else:
            self.db.create_collection(collName)
            self.collection = self.db.get_collection(collName)

    def getOne(self):
        print(self.collection.find_one())

    def getAll(self):
        return self.collection.find({})

    def insertOne(self, res):
        self.collection.insert_one(res)

    def insertMany(self, listofRes):
        self.collection.insert_many(listofRes)

    def closeConnection(self):
        self.client.close()
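# A minimal usage sketch for MongoOperator, assuming a local mongod; the database
# and collection names are hypothetical placeholders.
op = MongoOperator('scratch')
op.setUpConnection()
op.setUpCollection('examples')
op.insertOne({'title': 'doc-1'})
op.getOne()
op.closeConnection()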
class MongoDBPipeline(object): def __init__(self, mongodb_server, mongodb_port, mongodb_db, mongodb_collection): self.mongodb_server = mongodb_server self.mongodb_port = mongodb_port self.mongodb_db = mongodb_db self.mongodb_collection = mongodb_collection @classmethod def from_crawler(cls, crawler): print "in crawler" return cls( mongodb_server= crawler.settings.get('MONGODB_SERVER'), mongodb_port = int(crawler.settings.get('MONGODB_PORT')), mongodb_db=crawler.settings.get('MONGODB_DB'), mongodb_collection=crawler.settings.get('MONGODB_COLLECTION') ) def open_spider(self, spider): self.client = MongoClient(self.mongodb_server, self.mongodb_port) self.db = self.client[self.mongodb_db] self.collection = self.db[self.mongodb_collection] def close_spider(self, spider): self.client.close() def process_item(self, item, spider): print "in pipeline" #log.msg("begin insert data", level=log.DEBUG, splider=splider) self.collection.insert(dict(item)) return item
def add_node(self, host, port):
    if host not in self._nodes:
        self._nodes[host] = int(port)
        # as in add_domain, close() must be called on the MongoClient,
        # not on the Database handle
        client = MongoClient("localhost", 27017)
        client.p2pdns.nodes.insert_one({"ip": host, "port": port})
        client.close()
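# A minimal sketch of reading back the records written by add_domain/add_node,
# assuming the same local p2pdns database used above.
client = MongoClient("localhost", 27017)
for doc in client.p2pdns.domains.find({}, {"_id": 0}):
    print(doc["domain"], "->", doc["ip"])
for doc in client.p2pdns.nodes.find({}, {"_id": 0}):
    print(doc["ip"], doc["port"])
client.close()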
class MongoAnalyticsTest(unittest.TestCase):
    # Initialise the private members (to be completed)
    def setUp(self):
        self.mongoclient = MongoClient()
        self.db = self.mongoclient.analytics
        self.hits = self.db.hits

    def tearDown(self):
        self.mongoclient.close()

    """
    Find the highest traffic (number of requests per day) for the url
    http://www.lateral-thoughts.com using the aggregation framework.
    Docs to read:
    - $year http://docs.mongodb.org/manual/reference/operator/aggregation/year/
    - $month http://docs.mongodb.org/manual/reference/operator/aggregation/month/
    - $dayOfMonth http://docs.mongodb.org/manual/reference/operator/aggregation/dayOfMonth/
    """

    @unittest.skip('Remove to play this test')
    def testFindHighestHitsForUrl(self):
        pipeline = []
        result = self.hits.aggregate(pipeline)
        self.assertEqual(result['result'][0]['hits'], 66)
        self.assertEqual(result['result'][0]['_id']['y'], 2012)
        self.assertEqual(result['result'][0]['_id']['m'], 3)
        self.assertEqual(result['result'][0]['_id']['d'], 23)
def run(host,database,graphA,graphB,handle,displaymode): # Create an empty response object. response = {} # look through the collections in the ivaan database and return the name of all collections # that match the naming profile for tables. This is matching to see if the collection name # begins with "seeds_" or not, since this routine can return the matching graphs (that don't start # with 'seeds_') or the matching seeds. # build topk collection name from topk_collection_name = 'topk_'+graphA+'_'+graphB #topk_collection_name = 'topk_twitter_geosample_mentions_v2_october_combined_instagram_mentions_nodelink_october' print 'looking for topk in collection', topk_collection_name #topk_collection_name = 'topk' client = MongoClient(host, 27017) db = client[database] topk_collection = db[topk_collection_name] # get a list of all collections (excluding system collections) query = {'ga':handle} tablerows = [] # return only the columns to potentially display in LineUp. We don't want to return the gA entity we used to search by topk = topk_collection.find(query,{'_id':0,'ga':0}) for row in topk: tablerows.append(row) client.close() # Pack the results into the response object, and return it. response['result'] = tablerows # Return the response object. #tangelo.log(str(response)) return json.dumps(response)
class ContextClassHarvester: #DEFAULT_CONFIG_SECTION = 'CONFIG' #HARVESTER_MONGO_HOST = 'harvester.mongo.host' #HARVESTER_MONGO_PORT = 'harvester.mongo.port' #ORGHARVESTER_MONGO_HOST = 'organization.harvester.mongo.host' #ORGHARVESTER_MONGO_PORT = 'organization.harvester.mongo.port' LOG_LOCATION = 'logs/entlogs/' CHUNK_SIZE = 250 # each file will consist of 250 entities WRITEDIR = os.path.join(os.path.dirname(__file__), '..', 'entities_out') CONFIG_DIR = os.path.join(os.path.dirname(__file__), '..', 'config') LANG_VALIDATOR = LanguageValidator() LABEL = 'label' TYPE = 'type' TYPE_STRING = 'string' TYPE_OBJECT = 'obj' TYPE_REF = 'ref' PROP_OWL_SAMEAS = 'owlSameAs' #TODO remove when whole code is switched to use the EnrichmentEntity language constants LANG_DEF = EnrichmentEntity.LANG_DEF LANG_EN = EnrichmentEntity.LANG_EN IGNORED_PROPS = ['about', '_id', "className", "edmOrganizationSector"] FIELD_MAP = { # maps mongo fields to their solr equivalents # TODO: there are numerous fields defined in the schema but not # found in the actual data. They are accordingly not represented here. # For a list of all fields that might conceivably exist in accordance # with the data model, see https://docs.google.com/spreadsheets/d/ # 1b1UN27M2eCia0L54di0KQY7KcndTq8-wxzwM4wN-8DU/edit#gid=340708208 'prefLabel': { LABEL: 'skos_prefLabel', TYPE: TYPE_STRING }, 'altLabel': { LABEL: 'skos_altLabel', TYPE: TYPE_STRING }, 'hiddenLabel': { LABEL: 'skos_hiddenLabel', TYPE: TYPE_STRING }, 'edmAcronym': { LABEL: 'edm_acronym', TYPE: TYPE_STRING }, 'note': { LABEL: 'skos_note', TYPE: TYPE_STRING }, 'begin': { LABEL: 'edm_begin', TYPE: TYPE_STRING }, 'end': { LABEL: 'edm_end', TYPE: TYPE_STRING }, 'owlSameAs': { LABEL: 'owl_sameAs', TYPE: TYPE_REF }, 'edmIsRelatedTo': { LABEL: 'edm_isRelatedTo', TYPE: TYPE_REF }, 'dcIdentifier': { LABEL: EnrichmentEntity.DC_IDENTIFIER, TYPE: TYPE_STRING }, 'dcDescription': { LABEL: 'dc_description', TYPE: TYPE_STRING }, 'rdaGr2DateOfBirth': { LABEL: 'rdagr2_dateOfBirth', TYPE: TYPE_STRING }, #not used yet #'rdaGr2DateOfEstablishment' : { 'label': 'rdagr2_dateOfEstablishment' , TYPE : TYPE_STRING }, 'rdaGr2DateOfDeath': { LABEL: 'rdagr2_dateOfDeath', TYPE: TYPE_STRING }, #not used yet #'rdaGr2DateOfTermination' : { 'label': 'rdagr2_dateOfTermination' , TYPE : TYPE_STRING }, 'rdaGr2PlaceOfBirth': { LABEL: 'rdagr2_placeOfBirth', TYPE: TYPE_STRING }, 'placeOfBirth': { LABEL: 'rdagr2_placeOfBirth', TYPE: TYPE_STRING }, #not used yet #'placeOfBirth_uri' : { 'label': 'rdagr2_placeOfBirth.uri' , TYPE : TYPE_STRING }, 'rdaGr2PlaceOfDeath': { LABEL: 'rdagr2_placeOfDeath', TYPE: TYPE_STRING }, #not used yet #'placeOfDeath_uri' : { 'label': 'rdagr2_placeOfDeath.uri' , TYPE : TYPE_STRING }, 'rdaGr2PlaceOfDeath': { LABEL: 'rdagr2_placeOfDeath', TYPE: TYPE_STRING }, #not used yet #'professionOrOccupation_uri' : { 'label': 'professionOrOccupation.uri' , TYPE : TYPE_STRING }, 'rdaGr2ProfessionOrOccupation': { LABEL: 'rdagr2_professionOrOccupation', TYPE: TYPE_STRING }, #not used yet #'gender' : { 'label': 'gender' , TYPE : TYPE_STRING }, 'rdaGr2Gender': { LABEL: 'rdagr2_gender', TYPE: TYPE_STRING }, 'rdaGr2BiographicalInformation': { LABEL: 'rdagr2_biographicalInformation', TYPE: TYPE_STRING }, 'latitude': { LABEL: 'wgs84_pos_lat', TYPE: TYPE_STRING }, 'longitude': { LABEL: 'wgs84_pos_long', TYPE: TYPE_STRING }, #not used yet #'beginDate' : { 'label': 'edm_beginDate' , TYPE : TYPE_STRING }, #not used yet #'endDate' : { 'label': 'edm_endDate' , TYPE : TYPE_STRING }, 'isPartOf': { LABEL: 
'dcterms_isPartOf', TYPE: TYPE_REF }, #edm_isNextInSequence 'isNextInSequence': { LABEL: 'edm_isNextInSequence', TYPE: TYPE_REF }, 'hasPart': { LABEL: 'dcterms_hasPart', TYPE: TYPE_REF }, 'hasMet': { LABEL: 'edm_hasMet', TYPE: TYPE_REF }, 'date': { LABEL: 'dc_date', TYPE: TYPE_STRING }, 'exactMatch': { LABEL: 'skos_exactMatch', TYPE: TYPE_STRING }, 'related': { LABEL: 'skos_related', TYPE: TYPE_REF }, 'broader': { LABEL: 'skos_broader', TYPE: TYPE_REF }, 'narrower': { LABEL: 'skos_narrower', TYPE: TYPE_REF }, 'related': { LABEL: 'skos_related', TYPE: TYPE_REF }, 'broadMatch': { LABEL: 'skos_broadMatch', TYPE: TYPE_REF }, 'narrowMatch': { LABEL: 'skos_narrowMatch', TYPE: TYPE_REF }, 'relatedMatch': { LABEL: 'skos_relatedMatch', TYPE: TYPE_REF }, 'exactMatch': { LABEL: 'skos_exactMatch', TYPE: TYPE_REF }, 'closeMatch': { LABEL: 'skos_closeMatch', TYPE: TYPE_REF }, 'notation': { LABEL: 'skos_notation', TYPE: TYPE_REF }, 'inScheme': { LABEL: 'skos_inScheme', TYPE: TYPE_REF }, 'note': { LABEL: 'skos_note', TYPE: TYPE_STRING }, 'foafLogo': { LABEL: 'foaf_logo', TYPE: TYPE_REF }, 'foafDepiction': { LABEL: 'foaf_depiction', TYPE: TYPE_REF }, # not used yet #name' : { 'label' : 'foaf_name', TYPE : TYPE_STRING }, 'foafHomepage': { LABEL: 'foaf_homepage', TYPE: TYPE_REF }, 'foafPhone': { LABEL: 'foaf_phone', TYPE: TYPE_STRING }, 'foafMbox': { LABEL: 'foaf_mbox', TYPE: TYPE_STRING }, 'edmCountry': { LABEL: EnrichmentEntity.COUNTRY, TYPE: TYPE_STRING }, 'edmEuropeanaRole': { LABEL: EnrichmentEntity.EUROPEANA_ROLE, TYPE: TYPE_STRING }, 'edmOrganizationDomain': { LABEL: EnrichmentEntity.ORGANIZATION_DOMAIN, TYPE: TYPE_STRING }, #TODO: remove, not supported anymore #'edmOrganizationSector' : { 'label' : 'edm_organizationSector', TYPE : TYPE_STRING}, #'edmOrganizationScope' : { 'label' : 'edm_organizationScope', TYPE : TYPE_STRING}, 'edmGeographicLevel': { LABEL: EnrichmentEntity.GEOGRAPHIC_LEVEL, TYPE: TYPE_STRING }, 'address': { LABEL: 'vcard_hasAddress', TYPE: TYPE_OBJECT }, #not sure if used anymore 'address_about': { LABEL: 'vcard_hasAddress', TYPE: TYPE_STRING }, 'vcardStreetAddress': { LABEL: 'vcard_streetAddress', TYPE: TYPE_STRING }, 'vcardLocality': { LABEL: 'vcard_locality', TYPE: TYPE_STRING }, #not used yet #'vcardRegion' : { LABEL : 'vcard_region', TYPE : TYPE_STRING }, 'vcardPostalCode': { LABEL: 'vcard_postalCode', TYPE: TYPE_STRING }, 'vcardCountryName': { LABEL: 'vcard_countryName', TYPE: TYPE_STRING }, 'vcardPostOfficeBox': { LABEL: 'vcard_postOfficeBox', TYPE: TYPE_STRING }, 'vcardHasGeo': { LABEL: 'hasGeo', TYPE: TYPE_STRING } } def log_warm_message(self, entity_id, message): # TODO: differentiate logfiles by date filename = "warn.txt" filepath = LanguageValidator.LOG_LOCATION + filename with open(filepath, 'a') as lgout: msg = "Warning info on processing entity " + str( entity_id) + ": " + str(message) lgout.write(msg) lgout.write("\n") # TODO: add address processing def __init__(self, entity_type): sys.path.append(os.path.join(os.path.dirname(__file__))) sys.path.append( os.path.join(os.path.dirname(__file__), 'ranking_metrics')) sys.path.append( os.path.join(os.path.dirname(__file__), 'preview_builder')) from pymongo import MongoClient #import PreviewBuilder #import HarvesterConfig self.config = HarvesterConfig() #TODO: remove field name and use entity type self.name = entity_type + 's' self.client = MongoClient(self.get_mongo_host()) self.ranking_model = self.config.get_relevance_ranking_model() self.write_dir = ContextClassHarvester.WRITEDIR + "/" + self.ranking_model #TODO create 
working dir here, including folders for individual entities and organization type self.entity_type = entity_type self.preview_builder = PreviewBuilder.PreviewBuilder( self.client, entity_type) self.depiction_manager = DepictionManager.DepictionManager(self.config) def get_mongo_host(self): #return default mongo host, the subclasses may use the type based config (e.g. see organizations) return self.config.get_mongo_host() #def get_mongo_port (self): #return default mongo port, the subclasses may use the type based config (e.g. see also organizations host) #return self.config.get_mongo_port() def get_entity_count(self): entities = self.client.get_database( HarvesterConfig.DB_ENRICHMENT).get_collection( HarvesterConfig.COL_ENRICHMENT_TERM).find({ 'entityType': self.entity_type.upper(), EnrichmentEntity.ENTITY_ID: { '$regex': 'http://data.europeana.eu/.*' } }).count() return entities def build_entity_chunk(self, start): #TODO rename variables, places-> entity entities = self.client.get_database( HarvesterConfig.DB_ENRICHMENT).get_collection( HarvesterConfig.COL_ENRICHMENT_TERM).find( { 'entityType': self.entity_type.upper(), EnrichmentEntity.ENTITY_ID: { '$regex': 'http://data.europeana.eu/.*' } }, { EnrichmentEntity.ENTITY_ID: 1, '_id': 0 })[start:start + ContextClassHarvester.CHUNK_SIZE] entities_chunk = {} for entity in entities: entity_id = entity[EnrichmentEntity.ENTITY][EnrichmentEntity.ABOUT] entities_chunk[entity_id] = self.client.get_database( HarvesterConfig.DB_ENRICHMENT).get_collection( HarvesterConfig.COL_ENRICHMENT_TERM).find_one( {EnrichmentEntity.ENTITY_ID: entity_id}) return entities_chunk def extract_numeric_id(self, entity_id): parts = entity_id.split("/") #numeric id is the last part of the URL return parts[len(parts) - 1] def build_solr_doc(self, entities, start, one_entity=False): from xml.etree import ElementTree as ET docroot = ET.Element('add') for entity_id, values in entities.items(): print("processing entity:" + entity_id) self.build_entity_doc(docroot, entity_id, values) self.client.close() return self.write_to_file(docroot, start, one_entity) def build_entity_doc(self, docroot, entity_id, entity_rows): #sys.path.append('ranking_metrics') from xml.etree import ElementTree as ET doc = ET.SubElement(docroot, 'doc') self.add_field(doc, 'id', entity_id) #self.add_field(doc, 'internal_type', 'Place') self.add_field(doc, 'internal_type', self.entity_type.capitalize()) self.process_created_modified_timestamps(doc, entity_rows) self.process_representation(doc, entity_id, entity_rows) def add_field_list(self, docroot, field_name, values): if (values is None): return for value in values: self.add_field(docroot, field_name, value) def add_field(self, docroot, field_name, field_value): from xml.etree import ElementTree as ET f = ET.SubElement(docroot, 'field') f.set('name', field_name) try: f.text = self.sanitize_field(field_value) except Exception as ex: print(str(field_name) + "!" 
+ str(field_value) + str(ex)) def sanitize_field(self, field_value): field_value = field_value.replace("\n", " ") field_value = field_value.replace("\\n", " ") field_value = field_value.replace("\t", " ") return field_value def write_to_file(self, doc, start, one_entity): from xml.etree import ElementTree as ET from xml.dom import minidom import io writepath = self.get_writepath(start, one_entity) roughstring = ET.tostring(doc, encoding='utf-8') reparsed = minidom.parseString(roughstring) reparsed = reparsed.toprettyxml(encoding='utf-8', indent=" ").decode('utf-8') with io.open(writepath, 'w', encoding='utf-8') as writefile: writefile.write(reparsed) writefile.close() return writepath def get_writepath(self, start, one_entity): if (one_entity): return self.write_dir + "/individual_entities/" + self.name + "/" + str( start) + ".xml" else: return self.write_dir + "/" + self.name + "/" + self.name + "_" + str( start) + "_" + str(start + ContextClassHarvester.CHUNK_SIZE) + ".xml" def grab_relevance_ratings(self, docroot, entity_id, entity): metrics_record = self.relevance_counter.get_raw_relevance_metrics( entity) eu_enrichments = metrics_record.uri_hits eu_terms = metrics_record.term_hits pagerank = metrics_record.pagerank if (self.ranking_model == self.config.HARVESTER_RELEVANCE_RANKING_MODEL_DEFAULT): ds = self.relevance_counter.calculate_relevance_score( entity_id, pagerank, eu_enrichments, eu_terms) elif (self.ranking_model == self.config.HARVESTER_RELEVANCE_RANKING_MODEL_NORMALIZED): ds = self.relevance_counter.calculate_normalized_score( pagerank, eu_enrichments, eu_terms) else: raise ValueError( "Must set property harvester.relevance.ranking.model to one of the values <default> or <normalized>" ) self.add_field(docroot, 'europeana_doc_count', str(eu_enrichments)) self.add_field(docroot, 'europeana_term_hits', str(eu_terms)) self.add_field(docroot, 'pagerank', str(pagerank)) self.add_field(docroot, 'derived_score', str(ds)) self.add_suggest_filters(docroot, eu_enrichments) return True def grab_isshownby(self, docroot, web_resource): if (web_resource is not None): self.add_field(docroot, 'isShownBy', web_resource.media_url) self.add_field(docroot, 'isShownBy.source', web_resource.europeana_item_id) self.add_field(docroot, 'isShownBy.thumbnail', web_resource.thumbnail_url) def process_address(self, docroot, entity_id, address): #TODO check if the full address is needed #address_components = [] for k, v in address.items(): key = k value = v #about is not an ignored property for address if ("about" == k): key = "address_" + k elif ("vcardHasGeo" == k): #remove geo:, keep just lat,long value = v.split(":")[-1] if (self.is_ignored_property(key)): #ignored properties are not mapped to solr document continue if (key not in ContextClassHarvester.FIELD_MAP.keys()): self.log_warm_message(entity_id, "unmapped field: " + key) continue field_name = ContextClassHarvester.FIELD_MAP[key][self.LABEL] if ("vcardHasGeo" != k): field_name = field_name + ".1" self.add_field(docroot, field_name, value) #address_components.append(v) def process_created_modified_timestamps(self, docroot, entity_rows): # Solr time format YYYY-MM-DDThh:mm:ssZ if "created" in entity_rows: self.add_field(docroot, 'created', entity_rows["created"].isoformat() + "Z") #"modified" changed to updated in the database if "updated" in entity_rows: self.add_field(docroot, "modified", entity_rows["updated"].isoformat() + "Z") def is_ignored_property(self, characteristic): return str(characteristic) in self.IGNORED_PROPS def 
process_representation(self, docroot, entity_id, entity): #all pref labels all_preflabels = [] for characteristic in entity[EnrichmentEntity.REPRESENTATION]: if (self.is_ignored_property(characteristic)): continue elif (str(characteristic) not in ContextClassHarvester.FIELD_MAP.keys()): # TODO: log this? print("unmapped property: " + str(characteristic)) continue elif (characteristic == "address"): self.process_address( docroot, entity_id, entity[EnrichmentEntity.REPRESENTATION]['address']) # TODO: Refactor horrible conditional elif (str(characteristic) == "dcIdentifier"): self.add_field_list( docroot, EnrichmentEntity.DC_IDENTIFIER, entity[EnrichmentEntity.REPRESENTATION]['dcIdentifier'][ EnrichmentEntity.LANG_DEF]) elif (str(characteristic) == "edmOrganizationDomain"): #TODO: create method to add solr field for .en fields self.add_field( docroot, EnrichmentEntity.ORGANIZATION_DOMAIN + "." + EnrichmentEntity.LANG_EN, entity[EnrichmentEntity.REPRESENTATION] ['edmOrganizationDomain'][EnrichmentEntity.LANG_EN]) elif (str(characteristic) == "edmEuropeanaRole"): #multivalued roles = entity[EnrichmentEntity.REPRESENTATION][ 'edmEuropeanaRole'][EnrichmentEntity.LANG_EN] self.add_field_list( docroot, EnrichmentEntity.EUROPEANA_ROLE + "." + EnrichmentEntity.LANG_EN, roles) elif (str(characteristic) == "edmGeographicLevel"): self.add_field( docroot, EnrichmentEntity.GEOGRAPHIC_LEVEL + "." + EnrichmentEntity.LANG_EN, entity[EnrichmentEntity.REPRESENTATION] ['edmGeographicLevel'][EnrichmentEntity.LANG_EN]) elif (str(characteristic) == "edmCountry"): self.add_field( docroot, EnrichmentEntity.COUNTRY, entity[EnrichmentEntity.REPRESENTATION]['edmCountry'][ EnrichmentEntity.LANG_EN]) elif (str(characteristic) == "begin"): #pick first value from default language for timestamps, need to check for agents self.add_field( docroot, EnrichmentEntity.EDM_BEGIN, entity[EnrichmentEntity.REPRESENTATION]['begin'][ EnrichmentEntity.LANG_DEF][0]) elif (str(characteristic) == "end"): #pick first value from default language for timestamps, need to check for agents self.add_field( docroot, EnrichmentEntity.EDM_END, entity[EnrichmentEntity.REPRESENTATION]['end'][ EnrichmentEntity.LANG_DEF][0]) elif (type(entity[EnrichmentEntity.REPRESENTATION][characteristic]) is dict): # hiddenLabels are currenlty used only for Timespans if (str(characteristic) == "hiddenLabel" and self.ignore_hidden_label()): continue #for each entry in the language map for lang in entity[ EnrichmentEntity.REPRESENTATION][characteristic]: pref_label_count = 0 #avoid duplicates when adding values from prefLabel prev_alts = [] if (ContextClassHarvester.LANG_VALIDATOR. validate_lang_code(entity_id, lang)): field_name = ContextClassHarvester.FIELD_MAP[ characteristic][self.LABEL] field_values = entity[EnrichmentEntity.REPRESENTATION][ characteristic][lang] #property is language map of strings if (type(field_values) == str): lang_code = lang if lang != EnrichmentEntity.LANG_DEF else '' q_field_name = field_name + "." + lang_code #field value = field_values self.add_field(docroot, q_field_name, field_values) else: #for each value in the list for field_value in field_values: q_field_name = field_name lang_code = lang if lang != EnrichmentEntity.LANG_DEF else '' if (ContextClassHarvester. FIELD_MAP[characteristic][ self.TYPE] == self.TYPE_STRING): q_field_name = field_name + "." 
+ lang_code # Code snarl: we often have more than one prefLabel per language in the data # We can also have altLabels # We want to shunt all but the first-encountered prefLabel into the altLabel field # while ensuring the altLabels are individually unique # TODO: Refactor (though note that this is a non-trivial refactoring) # NOTE: prev_alts are for one language, all_preflabels include labels in any language if (characteristic == 'prefLabel' and pref_label_count > 0): #move all additional labels to alt label q_field_name = "skos_altLabel." + lang_code #SG - TODO: add dropped pref labels to prev_alts?? #prev_alts.append(field_value) if ('altLabel' in q_field_name): #TODO: SG why this? we skip alt labels here, but we don't add the gained entries from prefLabels if (field_value in prev_alts): continue prev_alts.append(field_value) #suggester uses alt labels for some entity types (organizations) #disables until altLabels are added to payload #self.add_alt_label_to_suggest(field_value, all_preflabels) if (str(characteristic) == "edmAcronym"): #suggester uses alt labels for some entity types (organizations) self.add_acronym_to_suggest( field_value, all_preflabels) if (characteristic == 'prefLabel' and pref_label_count == 0): pref_label_count = 1 #TODO: SG - the suggester could actually make use of all pref labels, but the hightlighter might crash all_preflabels.append(field_value) #add field to solr doc self.add_field(docroot, q_field_name, field_value) #property is list elif (type(entity[EnrichmentEntity.REPRESENTATION][characteristic]) is list): field_name = ContextClassHarvester.FIELD_MAP[characteristic][ self.LABEL] for entry in entity[ EnrichmentEntity.REPRESENTATION][characteristic]: self.add_field(docroot, field_name, entry) # property is a single value else: try: field_name = ContextClassHarvester.FIELD_MAP[ characteristic][self.LABEL] field_value = entity[ EnrichmentEntity.REPRESENTATION][characteristic] self.add_field(docroot, field_name, str(field_value)) except KeyError as error: print( 'Attribute found in source but undefined in schema.' + str(error)) #add suggester payload web_resource = self.depiction_manager.get_depiction(entity_id) self.grab_isshownby(docroot, web_resource) payload = self.build_payload(entity_id, entity, web_resource) self.add_field(docroot, 'payload', json.dumps(payload)) #add suggester field all_preflabels = self.shingle_preflabels(all_preflabels) # SG: values in the same language are joined using space separator. values in different languages are joined using underscore as it is used as tokenization pattern. 
see schema.xml self.add_field(docroot, 'skos_prefLabel', "_".join(sorted(set(all_preflabels)))) depiction = self.preview_builder.get_depiction(entity_id) if (depiction): self.add_field(docroot, 'foaf_depiction', depiction) self.grab_relevance_ratings(docroot, entity_id, entity) def shingle_preflabels(self, preflabels): shingled_labels = [] for label in preflabels: all_terms = label.split() for i in range(len(all_terms)): shingle = " ".join(all_terms[i:len(all_terms)]) shingled_labels.append(shingle) return shingled_labels def build_payload(self, entity_id, entity_rows, web_resource): payload = self.preview_builder.build_preview( self.entity_type, entity_id, entity_rows[EnrichmentEntity.REPRESENTATION], web_resource) return payload def add_suggest_filters(self, docroot, enrichment_count): self.add_field(docroot, 'suggest_filters', self.entity_type.capitalize()) if (enrichment_count > 0): self.add_field(docroot, 'suggest_filters', 'in_europeana') def suggest_by_alt_label(self): #this functionality can be activated by individual harvesters return False def suggest_by_acronym(self): #this functionality can be activated by individual harvesters return False def add_alt_label_to_suggest(self, value, suggester_values): if (self.suggest_by_alt_label() and (value not in suggester_values)): suggester_values.append(value) def add_acronym_to_suggest(self, value, suggester_values): if (self.suggest_by_acronym() and (value not in suggester_values)): suggester_values.append(value) def ignore_hidden_label(self): return True
from pymongo import MongoClient

conn = MongoClient('localhost', 27017)
db = conn.stu
myset = db.class4

# myset.insert({'name': '张铁林', 'king': '乾隆'})
# myset.insert({'name': '张国立', 'king': '康熙'}, {'name': '陈道明', 'king': '康熙'})
# myset.insert_many([{'name': '唐国强', 'king': '雍正'}, {'name': '陈建斌', 'king': '雍正'}])

# cursor = myset.find({}, {'_id': 0})
# print(cursor)
# for i in cursor:
#     print(i)

# myset1 = db.class0
# cursor = myset1.find({'$or': [{'sex': 'w'}, {'age': {'$gt': 19}}]})
# for i in cursor:
#     print(i)

myset.remove({'gender': 'null'})
conn.close()
class Mongo(Database): EXEC_COUNT_COL = "exec_count" PREV_HTML_COL = "prev_html" PREV_DIFF_COL = "prev_diff" def __init__(self, setting): self.setting = setting username = urllib.parse.quote_plus(setting['username']) password = urllib.parse.quote_plus(setting['password']) self.client = MongoClient('mongodb://%s:%s@%s:%s/%s' % (username, password, setting['hostname'], setting['port'], setting['database'])) self.db = self.client[setting['database']] if not Mongo.EXEC_COUNT_COL in self.db.list_collection_names(): self.db.create_collection(Mongo.EXEC_COUNT_COL) self.db[Mongo.EXEC_COUNT_COL].insert_one({"count": 1, "id": 1}) if not Mongo.PREV_HTML_COL in self.db.list_collection_names(): self.db.create_collection(Mongo.PREV_HTML_COL) self.db[Mongo.PREV_HTML_COL].insert_one({"html": "", "id": 1}, ) if not Mongo.PREV_DIFF_COL in self.db.list_collection_names(): self.db.create_collection(Mongo.PREV_DIFF_COL) def __del__(self): self.client.close() def insert(self, prev_html, prev_diff): self.update_previous_html(prev_html) self.db[Mongo.PREV_DIFF_COL].insert_one(prev_diff) def drop(self): self.db.drop_collection(Mongo.EXEC_COUNT_COL) self.db.drop_collection(Mongo.PREV_HTML_COL) self.db.drop_collection(Mongo.PREV_DIFF_COL) def get_exec_count(self): count_data = self.db[Mongo.EXEC_COUNT_COL].find_one() if not count_data: self.db[Mongo.EXEC_COUNT_COL].insert_one({"count": 1, "id": 1}) return 1 return count_data["count"] def update_exec_count(self): prev_count = int(self.db[Mongo.EXEC_COUNT_COL].find_one()["count"]) self.db[Mongo.EXEC_COUNT_COL].find_one_and_update( {"id": 1}, {'$set': { "count": prev_count + 1 }}) def get_previous_html(self): html_data = self.db[Mongo.PREV_HTML_COL].find_one() return html_data["html"] def update_previous_html(self, new_html): self.db[Mongo.PREV_HTML_COL].find_one_and_update( {"id": 1}, {'$set': { "html": new_html }}) def find_diff_from_previous(self, target): return self.db[Mongo.PREV_DIFF_COL].find_one({"diff": target}) def _get_previous_diff_max_id(self): max_id = 0 results = self.db[Mongo.PREV_DIFF_COL].find().sort('id', DESCENDING).limit(1) for c in results: if max_id < int(c["id"]): max_id = int(c["id"]) return max_id def insert_or_update_diff(self, diff): exist_diff = self.find_diff_from_previous(diff) if exist_diff: self.db[Mongo.PREV_DIFF_COL].find_one_and_update( {"diff": exist_diff["diff"]}, {'$set': { "count": int(exist_diff["count"]) + 1 }}) return self.find_diff_from_previous(diff) diff_id = self._get_previous_diff_max_id() + 1 new_record = {"diff": diff, "id": diff_id, "count": 1} self.db[Mongo.PREV_DIFF_COL].insert_one(new_record) return self.find_diff_from_previous(diff) def insert_previous_diff(self, diff): diff_id = 1 + self._get_previous_diff_max_id() self.db[Mongo.PREV_DIFF_COL].insert_one({ "id": diff_id, "diff": diff, "count": 1 }) def update_previous_diff(self, target): target_diff = self.find_diff_from_previous(target) self.db[Mongo.PREV_DIFF_COL].update_one( {"diff": target}, {'$set': { "count": int(target_diff["count"]) + 1 }})
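# A minimal usage sketch for the Mongo store above, assuming a reachable
# authenticated mongod; every value in the settings dict is a hypothetical
# placeholder matching the keys the constructor reads.
setting = {
    'username': 'watcher',
    'password': 'secret',
    'hostname': 'localhost',
    'port': 27017,
    'database': 'page_watch',
}
store = Mongo(setting)
print('runs so far:', store.get_exec_count())
store.update_exec_count()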
class Batch: def __init__(self, programme, branch, section, year_of_pass): # THIS CONSTRUCTOR IS USED TO CREATE A REQUIRED COLLECTION # IN DATABASE. FOR EXAMPLE :- BATCH_BTECH_CSE_A_2021 # self._programme = programme self._branch = branch self._section = section self._year_of_pass = year_of_pass try: self.client = MongoClient(config.MongoDB_URI) db = self.client[config.Batch_DB] log(f'[ INFO ] {config.Batch_DB} Connected Successfully') except: log(f'[ ERROR ] Unable To Create Connection With {config.Batch_DB}' ) self.collection = db[f'{programme}_{branch}_{section}_{year_of_pass}'] def insert(self, enrollment): # USED TO INSERT ENROLLMENT OF A STUDENT IN THE REQUIRED COLLECTION # --------------------------------------------------------------------------- # DATA STRUCTURES OF ENROLLED_STUDENTS :- # ENROLLMENT --> STRING # # CHECKING FOR ANY DUPLICATE ENTRY IN THE COLLECTION duplicate_entry = self.collection.find_one({'enrollment': enrollment}) if duplicate_entry != None: log(f'[ ERROR ] {enrollment} Enrollment Insertion at {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in Batch_DB failed - Duplicate Entry Found' ) return 417 else: status = self.collection.insert_one({'enrollment': enrollment}) log(f'[ INFO ] {status}') log(f'[ INFO ] {enrollment} Enrollment Inserted Successfully at {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in {config.Batch_DB}' ) return 201 def remove(self, enrollment): # USED TO REMOVE ENROLLMENT OF A PARTICULAR STUDENT FROM BATCH COLLECTION # ---------------------------------------------------------------------------- # DATA STRUCTURES OF INPUT PARAMETER :- # ENROLLMENT --> STRING # try: status = self.collection.delete_one({'enrollment': enrollment}) log(f'[ INFO ] {status}') log(f'[ INFO ] {enrollment} Enrollment Removed From {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in Batch_DB' ) return 220 except: return 203 log(f'[ ERROR ] Unable To Remove {enrollment} Enrollment From {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in Batch_DB' ) def remove_all(self): # USED TO REMOVE WHOLE COLLECTION FOR WHICH BATCH CLASS # OBJECT IS INITIALISED. # ---------------------------------------------------------------------------- # try: self.collection.drop() log(f'[ INFO ] {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection Removed From Batch_DB' ) return 512 except: log(f'[ ERROR ] Unable To Remove {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection From Batch_DB' ) return 400 def show_all(self): # USED TO DISPLAY A LIST OF ALL THE ENROLLED STUDENTS IN A CLASS try: res = self.collection.find({}) if res.count() > 0: response = {'status': 302, 'res': res} else: response = {'status': 302, 'res': {}} log(f'[ INFO ] {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection Fetched From Batch_DB' ) except: response = {'status': 598, 'res': None} log(f'[ ERROR ] Unable To Fetch {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection From Batch_DB' ) return response def __del__(self): # log('[ INFO ] Connection closed successfully of Batch_DB.') self.client.close() # RELEASING OPEN CONNECTION WITH DATABASE
from pymongo import MongoClient

client = MongoClient(host='localhost', port=27017)
db = client['text']  # database name

db['inventory'].delete_one({})
# Delete operations do not drop indexes, even if deleting all documents from a collection
db['inventory'].delete_many({})
db['inventory'].remove()  # legacy API, equivalent to delete_many({}); removed in PyMongo 4
client.close()
class Marksheet: def __init__(self, faculty_id, subject, programme, branch, section, year_of_pass, semester): # Constructor of marksheet accepts the following parameters: # faculty_id --> Unique Id of faculty --> string # subject --> subject name taught by the given faculty --> string # programme --> programme of the class whose marks are provided here --> string # branch --> like cse, IT etc --> string # section --> string # year_of_pass --> string # semester --> string # # Creating a collection in database with identifier like # "037_maths_btech_cse_a_2021_4". # try: self.client = MongoClient(config.MongoDB_URI) db = self.client[config.Marksheet_DB] log('[ INFO ] Marksheet_DB Connected Successfully') except: log('[ Error ] Unable To Create Connection With Marksheet_DB') sys.exit(0) self.collection = db[ f'{faculty_id}_{subject}_{programme}_{branch}_{section}_{year_of_pass}_{semester}'] def insert(self, marksheet_dictionary): # inserts marksheet dictionary that contains enrollment, marks and assessment # if it already isn't available in db. # ----------------------------------------------------------------------------------- # for example : # marksheet_dictionary = { # 'enrollment':'03720802717', # 'marks' : '29', # 'assessment':'8' # } # duplicate_entry = self.collection.find_one( {'enrollment': marksheet_dictionary['enrollment']}) if duplicate_entry != None: log('[ Error ] Object of this Enrollment Number already present in Database' ) return 417 else: status = self.collection.insert_one(marksheet_dictionary) log(f'[ INFO ] {status}') # Printing Status of result of query log('[ INFO ] Marks of the enrollment number inserted in Marksheet_DB.' ) return 201 def show_of(self, enrollment): # This method inputs enrollment and returns marks of that particular enrollment. # ------------------------------------------------------------------------------- # Data Structures of input parameter :- # enrollment --> string # try: res = list(self.collection.find({'enrollment': enrollment})) log('[ INFO ] Marks of the enrollment has been displayed.') response = {'status': '202', 'res': res} except: response = {'status': '404', 'res': 'NA'} return response def show_all(self): # This method doesn't takes any input and returns marks of all students. # ------------------------------------------------------------------------------- # try: res = list(self.collection.find({})) log('[ INFO ] Marks of all the students has been successfully displayed.' ) response = {'status': '302', 'res': res} except: response = {'status': '598', 'res': 'NA'} return response def remove(self, enrollment): # This method removes the collection of marks of a particular # enrollment from the class. 
# ---------------------------------------------------------------------------------- # Data Structures of input parameter :- # enrollment --> string # try: status = self.collection.delete_many({'enrollment': enrollment}) log(f'[ INFO ] {status}') # Printing Status of result of query log('[ INFO ] Marks of particular enrollment has been removed.') return 220 except: return 203 def update(self, enrollment, marksheet_dictionary): # This method use to update marks of a particular enrollment # with marksheet_dictionary object # -------------------------------------------------------------------------------- # Data Structures of the input parameters :- # enrollment --> string # marksheet_dictionary --> dictionary # searching_values = {'enrollment': enrollment} updation_value = marksheet_dictionary try: status = self.collection.update_many(searching_values, {'$set': updation_value}) log(f'[ INFO ] {status}') # Printing Status of result of query log('[ INFO ] Marksheet_DB has been updated.') return 301 except: return 204 def __del__(self): self.client.close()
        min_profit_ratio_arr[cur_index] = profit_ratio
        # accumulate the profit value
        profit_ratio_arr[cur_index] += profit_ratio

        # plotting
        # compute the average profit per bucket
        for index in range(len(profit_ratio_arr)):
            profit_ratio_arr[index] = round(profit_ratio_arr[index] / (buck_num + 1), 2)
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title(u'value ratio_for_%s, 单个总数:%s' % (buy_result, limit_num), fontproperties=font_set)
        plt.plot(index_arr, profit_ratio_arr)
        # x-axis label
        plt.xlabel(u'主客身价比', fontproperties=font_set)
        # y-axis label
        plt.ylabel(u'利润', fontproperties=font_set)
        for a, b in zip(index_arr, profit_ratio_arr):
            plt.text(a, b, b, ha='center', va='bottom', fontsize=10)
        # plt.xticks(index_arr)
        plt.legend()
        # print the minimum profit values
        print('最低利润为:\n')
        for i in range(0, len(min_profit_ratio_arr)):
            print('%.2f, ' % min_profit_ratio_arr[i])
        plt.show()
    except Exception as err:
        print('%s\n%s' % (err, traceback.format_exc()))
    finally:
        mongo_client.close()
class YunqicrawlPipeline(object): # def __init__(self, mongo_uri, mongo_db, replicaset): # self.mongo_uri = mongo_uri # self.mongo_db = mongo_db # self.replicaset = replicaset # # @classmethod # def from_crawler(cls, crawler): # return cls(mongo_uri=crawler) def open_spider(self, spider): self.client = MongoClient() self.collection = self.client["yunqi"]["book"] def close_spider(self, spider): self.client.close() def process_item(self, item, spider): if isinstance(item, YunqiBookListItem): # pass # self._precess_booklist_item(item) # else: # print("==================") # print(item) self._precess_bookeDetail_item(item) return item def _precess_booklist_item(self, item): """ 处理小说信息 :param item: :return: """ self.collection.insert(dict(item)) def _precess_bookeDetail_item(self, item): """ 处理小说热度 :param item: :return: """ pattern = re.compile("\d+") item["novelLabel"] = item["novelLabel"].strip().replace("\n", "") match = pattern.search(item["novelAllClick"]) item["novelAllClick"] = match.group( ) if match else item["novelAllClick"] match = pattern.search(item["novelMonthClick"]) item["novelMonthClick"] = match.group( ) if match else item["novelMonthClick"] match = pattern.search(item["novelWeekClick"]) item["novelWeekClick"] = match.group( ) if match else item["novelWeekClick"] match = pattern.search(item["novelAllPopular"]) item["novelAllPopular"] = match.group( ) if match else item["novelAllPopular"] match = pattern.search(item["novelMonthPopular"]) item["novelMonthPopular"] = match.group( ) if match else item["novelMonthPopular"] match = pattern.search(item["novelWeekPopular"]) item["novelWeekPopular"] = match.group( ) if match else item["novelWeekPopular"] match = pattern.search(item["novelAllComm"]) item["novelAllComm"] = match.group() if match else item["novelAllComm"] match = pattern.search(item["novelMonthComm"]) item["novelMonthComm"] = match.group( ) if match else item["novelMonthComm"] match = pattern.search(item["novelWeekComm"]) item["novelWeekComm"] = match.group( ) if match else item["novelWeekComm"] self.collection.insert(dict(item))
# _*_ coding:utf-8 _*_
# !/usr/bin/python
from pymongo import MongoClient

mc = MongoClient('localhost', 27017)  # connect to the database server
db = mc.mydb  # use the mydb database

db.user.insert_one({'name': '张三', 'age': 90})  # write a record into the collection

# query records
print([d for d in db.user.find({'name': 'chengxudong'})])
data = list(db.user.find())  # materialize the cursor so it can be iterated more than once
for o in data:
    print(o)
print(data)

mc.close()  # close the connection to the database
from pymongo import MongoClient

# By default the host is localhost and the port is 27017.
# These values can be changed by passing a URI:
# client = MongoClient("mongodb://localhost:27017/")
client = MongoClient()
db = client.catalogue

documents = [
    {"title": "A Light in the Attic"},
    {"title": "Tipping the Velvet"},
    {"title": "Soumission"},
]

db.books.insert_many(documents)
client.close()  # close the connection to the database
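# A small follow-up sketch (not part of the original snippet): it shows how the documents
# inserted above could be read back, assuming the same local server and the
# catalogue.books collection used in the previous example.
from pymongo import MongoClient

client = MongoClient()
db = client.catalogue
print(db.books.count_documents({}))  # how many books are stored
for book in db.books.find({}, {"_id": 0, "title": 1}):
    print(book["title"])  # print each stored title
client.close()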
b = ax.bar(x, list_People, color='k', alpha=0.8, tick_label=list_name) #print(type(b)) #datas = pd.Series(list_People, index=list_name) #c = datas.plot.bar(color='k', alpha=0.8) # 垂直柱状图 #print(type(c)) for i in b: h = i.get_height() ax.text(i.get_x() + i.get_width() / 2, h, '%.4f' % h, ha='center', va='bottom') plt.xticks(rotation=15) plt.xlabel("店铺名称") plt.ylabel("购买人数/10000") plt.title("销售分析") plt.ylim(0, 10) plt.show() if __name__ == "__main__": global db global sntable global table table = 'TaoBaoLipstick' mconn = MongoClient("mongodb://localhost") db = mconn['test'] db.authenticate('test', 'test') pandas_data() mconn.close()
def test_register(request): if request.method == "POST": email = request.POST.get('email') email2 = request.POST.get('email2') username1 = request.POST.get('username1') username2 = request.POST.get('username2') pwd = request.POST.get('pwd') pwd2 = request.POST.get('pwd2') sex = request.POST.get('sex') organization = request.POST.get('organization') research = request.POST.get('research') title = request.POST.get('title') age = request.POST.get('age') qq = request.POST.get('qq') wechat = request.POST.get('wechat') blog = request.POST.get('blog') if email != email2: return HttpResponse('两次注册邮箱不一致') elif len(str(email)) == 0: return HttpResponse('提交失败') elif pwd != pwd2: return HttpResponse('两次密码不一致') else: __db_server, __db_port = '127.0.0.1', 27017 client = MongoClient(__db_server, __db_port) db = client['userinfo'] try: user = db.userinfo.find_one({'_id': email}) if user is None: db.trial1.insert_one({ '_id': email, '姓': username1, '名': username2, '密码': pwd, '密码2': pwd2, '性别': sex, '工作/学习单位': organization, '专业/研究方向': research, '职称': title, '年龄': age, 'qq': qq, 'wechat': wechat, '个人主页': blog, }) send_register_email(email, 'register') client.close() return HttpResponse('邮箱验证已发送') else: db.trial1.update_one({"_id": email}, { '$set': { '姓': username1, '名': username2, '密码': pwd, '密码2': pwd2, '性别': sex, '工作/学习单位': organization, '专业/研究方向': research, '职称': title, '年龄': age, 'qq': qq, 'wechat': wechat, '个人主页': blog, } }, upsert=None) send_register_email(email, 'register') client.close() return HttpResponse('邮箱验证已发送') except KeyError: message = "该邮箱已注册" client.close() return render(request, 'login.html', {'msg': message}) else: return render(request, 'register.html')
class ZhihuPipeline(object):
    """
    Store the scraped data.
    """

    def __init__(self, mongo_uri, mongo_db, image_dir):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.image_dir = image_dir
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=MONGO_URI,
                   mongo_db='zhihu',
                   image_dir=os.path.join(PROJECT_DIR, 'images'))

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        if not os.path.exists(self.image_dir):
            os.mkdir(self.image_dir)

    def close_spider(self, spider):
        self.client.close()

    def _process_people(self, item):
        """
        Store user profile information.
        """
        collection = self.db['people']
        zhihu_id = item['zhihu_id']
        collection.replace_one({'zhihu_id': zhihu_id}, dict(item), upsert=True)
        image_url = item['image_url']
        if image_url and zhihu_id:
            image_path = os.path.join(self.image_dir,
                                      '{}.jpg'.format(zhihu_id))
            download_pic.delay(image_url, image_path)

    def _process_relation(self, item):
        """
        Store the user relation topology.
        """
        collection = self.db['relation']
        data = collection.find_one({
            'zhihu_id': item['zhihu_id'],
            'user_type': item['user_type']
        })
        if not data:
            collection.insert_one(dict(item))
        else:
            origin_list = data['user_list']
            new_list = item['user_list']
            data['user_list'] = list(set(origin_list) | set(new_list))
            collection.replace_one(
                {
                    'zhihu_id': item['zhihu_id'],
                    'user_type': item['user_type']
                }, data)

    def process_item(self, item, spider):
        """
        Dispatch each item to its handler.
        """
        if isinstance(item, ZhihuPeopleItem):
            self._process_people(item)
        elif isinstance(item, ZhihuRelationItem):
            self._process_relation(item)
        return item
class Dao: """ Data access class.""" def __init__(self, host: str, database: str): """ Create new DAO atop of MongoClient""" self.client = MongoClient(host) self.database = database self.POSTS = "posts" self.USERS = "users" def get_default_query(self): return {"partition_id": 1} def insert_posts(self, posts: [dict]): for post in posts: post.setdefault('partition_id', 1) self.client \ .get_database(name=self.database) \ .get_collection(name=self.POSTS) \ .insert_many(posts) def insert_users(self, users: [dict]): for user in users: user.setdefault('partition_id', 1) self.client \ .get_database(name=self.database) \ .get_collection(name=self.USERS) \ .insert_many(users) # for user in users: # print(user) # self.client \ # .get_database(name=self.database) \ # .get_collection(name=self.USERS) \ # .insert_one(user) def select_user_by(self, username=None, userid=None): query = self.get_default_query() if username is not None: query['username'] = username if userid is not None: query['id'] = userid result = self.client \ .get_database(name=self.database) \ .get_collection(name=self.USERS) \ .find_one(query) return result def delete_all_posts(self): self.client \ .get_database(name=self.database) \ .get_collection(self.POSTS) \ .delete_many(self.get_default_query()) def delete_all_users(self): self.client \ .get_database(name=self.database) \ .get_collection(self.USERS) \ .delete_many(self.get_default_query()) def close(self): self.client.close()
def before_scenario(context, scenario): client = MongoClient(context.mongo_url) mongo_db = client.get_database('database') mongo_db['collection1'].delete_many({}) client.close()
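# A companion sketch (an assumption, not part of the original test suite): a matching
# behave after_scenario hook that clears the same collection once the scenario finishes,
# so state never leaks between scenarios. The names (context.mongo_url, 'database',
# 'collection1') are taken from the hook above.
from pymongo import MongoClient


def after_scenario(context, scenario):
    client = MongoClient(context.mongo_url)
    client.get_database('database')['collection1'].delete_many({})
    client.close()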
class MongoDB(object): """main script class""" # pylint: disable=too-many-instance-attributes def __init__(self): self.mongo_host = "127.0.0.1" self.mongo_port = 27017 self.mongo_db = [ "admin", ] self.mongo_user = None self.mongo_password = None self.__conn = None self.__dbnames = None self.__metrics = [] def connect(self): """Connect to MongoDB""" if self.__conn is None: if self.mongo_user is None: try: self.__conn = MongoClient( 'mongodb://%s:%s' % (self.mongo_host, self.mongo_port)) except errors.PyMongoError as py_mongo_error: print('Error in MongoDB connection: %s' % str(py_mongo_error)) else: try: self.__conn = MongoClient( 'mongodb://%s:%s@%s:%s' % (self.mongo_user, self.mongo_password, self.mongo_host, self.mongo_port)) except errors.PyMongoError as py_mongo_error: print('Error in MongoDB connection: %s' % str(py_mongo_error)) def add_metrics(self, k, v): """add each metric to the metrics list""" dict_metrics = {} dict_metrics['key'] = k dict_metrics['value'] = v self.__metrics.append(dict_metrics) def print_metrics(self): """print out all metrics""" metrics = self.__metrics for metric in metrics: zabbix_item_key = str(metric['key']) zabbix_item_value = str(metric['value']) print(zabbix_item_key + ' ' + zabbix_item_value) def get_db_names(self): """get a list of DB names""" if self.__conn is None: self.connect() db_handler = self.__conn[self.mongo_db[0]] master = db_handler.command('isMaster')['ismaster'] dict_metrics = {} dict_metrics['key'] = 'mongodb.ismaster' if master: dict_metrics['value'] = 1 db_names = self.__conn.list_database_names() self.__dbnames = db_names else: dict_metrics['value'] = 0 self.__metrics.append(dict_metrics) def get_mongo_db_lld(self): """print DB list in json format, to be used for mongo db discovery in zabbix""" if self.__dbnames is None: db_names = self.get_db_names() else: db_names = self.__dbnames dict_metrics = {} db_list = [] dict_metrics['key'] = 'mongodb.discovery' dict_metrics['value'] = {"data": db_list} if db_names is not None: for db_name in db_names: dict_lld_metric = {} dict_lld_metric['{#MONGODBNAME}'] = db_name db_list.append(dict_lld_metric) dict_metrics['value'] = '{"data": ' + json.dumps(db_list) + '}' self.__metrics.insert(0, dict_metrics) def get_server_status_metrics(self): """get server status""" if self.__conn is None: self.connect() db_handler = self.__conn[self.mongo_db[0]] ss = db_handler.command('serverStatus') # db info self.add_metrics('mongodb.version', ss['version']) self.add_metrics('mongodb.storageEngine', ss['storageEngine']['name']) self.add_metrics('mongodb.uptime', int(ss['uptime'])) self.add_metrics('mongodb.okstatus', int(ss['ok'])) # asserts for k, v in ss['asserts'].items(): self.add_metrics('mongodb.asserts.' + k, v) # operations for k, v in ss['opcounters'].items(): self.add_metrics('mongodb.operation.' + k, v) # memory for k in ['resident', 'virtual', 'mapped', 'mappedWithJournal']: self.add_metrics('mongodb.memory.' + k, ss['mem'][k]) # connections for k, v in ss['connections'].items(): self.add_metrics('mongodb.connection.' + k, v) # network for k, v in ss['network'].items(): self.add_metrics('mongodb.network.' 
+ k, v) # extra info self.add_metrics('mongodb.page.faults', ss['extra_info']['page_faults']) #wired tiger if ss['storageEngine']['name'] == 'wiredTiger': self.add_metrics( 'mongodb.used-cache', ss['wiredTiger']['cache']["bytes currently in the cache"]) self.add_metrics( 'mongodb.total-cache', ss['wiredTiger']['cache']["maximum bytes configured"]) self.add_metrics( 'mongodb.dirty-cache', ss['wiredTiger']['cache']["tracked dirty bytes in the cache"]) # global lock lock_total_time = ss['globalLock']['totalTime'] self.add_metrics('mongodb.globalLock.totalTime', lock_total_time) for k, v in ss['globalLock']['currentQueue'].items(): self.add_metrics('mongodb.globalLock.currentQueue.' + k, v) for k, v in ss['globalLock']['activeClients'].items(): self.add_metrics('mongodb.globalLock.activeClients.' + k, v) def get_db_stats_metrics(self): """get DB stats for each DB""" if self.__conn is None: self.connect() if self.__dbnames is None: self.get_db_names() if self.__dbnames is not None: for mongo_db in self.__dbnames: db_handler = self.__conn[mongo_db] dbs = db_handler.command('dbstats') for k, v in dbs.items(): if k in [ 'storageSize', 'ok', 'avgObjSize', 'indexes', 'objects', 'collections', 'fileSize', 'numExtents', 'dataSize', 'indexSize', 'nsSizeMB' ]: self.add_metrics( 'mongodb.stats.' + k + '[' + mongo_db + ']', int(v)) def close(self): """close connection to mongo""" if self.__conn is not None: self.__conn.close()
class OTP:
    def __init__(self):
        try:
            self.client = MongoClient(config.MongoDB_URI)
            db = self.client[config.OTP_DB]
            log(f'[ INFO ] {config.OTP_DB} Connected Successfully')
        except Exception:
            log(f'[ ERROR ] Unable To Create Connection With {config.OTP_DB}')
        self.collection = db[config.OTP_COLLECTION]

    def insert(self, hash_id, otp, function):
        # Used to insert an OTP for a particular hash_id.
        # ---------------------------------------------------------------------------
        # Data structures of the input parameters :-
        # hash_id --> string
        # otp --> integer
        # function --> string
        #
        # First check whether a duplicate entry is already present in the database.
        try:
            res = self.collection.find({'hash_id': hash_id})
            if res.count() > 0:
                # Check whether the same functionality exists among the duplicates.
                for document in res:
                    if document['function'] == function:
                        log(f'[ INFO ] For Hash ID - {hash_id} Duplicate Entry Found at {config.OTP_COLLECTION} Collection in {config.OTP_DB}')
                        status = self.collection.delete_one({'hash_id': hash_id})
                        log(f'[ INFO ] {status}')
                        log(f'[ INFO ] Hash_ID - {hash_id} Removed Successfully from {config.OTP_COLLECTION} Collection in {config.OTP_DB}')
            status = self.collection.insert_one({
                'hash_id': hash_id,
                'otp': otp,
                'function': function
            })
            log(f'[ INFO ] {status}')
            log(f'[ INFO ] For Hash_ID - {hash_id} OTP Inserted Successfully at {config.OTP_COLLECTION} Collection in {config.OTP_DB}')
            return 201
        except Exception as e:
            log(f'[ ERROR ] Unable To Insert Document For Hash_ID - {hash_id} at {config.OTP_COLLECTION} Collection in {config.OTP_DB}')
            return 417

    def query(self, query_param, query_value):
        # This query function takes a query parameter (such as a hash_id) and a query
        # value to search for in the collection. After a successful search it returns
        # the result combined with a status value.
        # -----------------------------------------------------
        # Data structures of the input parameters :-
        # query_param --> string
        # query_value --> string
        #
        res = self.collection.find({query_param: query_value})
        if res.count() > 0:
            # runs when at least one result is found
            response = {'status': 212, 'res': res}
        else:
            response = {'status': 206, 'res': None}
        log(f'[ INFO ] The Search Query Completed Successfully in {config.OTP_DB}')
        return response

    def remove(self, hash_id, function):
        # Used to remove the document carrying the hash_id and the OTP generated for it.
        # ----------------------------------------------------------------------------
        # Data structures of the input parameters :-
        # hash_id --> string
        # function --> string
        try:
            status = self.collection.delete_one({
                'hash_id': hash_id,
                'function': function
            })
            log(f'[ INFO ] {status}')
            log(f'[ INFO ] Hash_ID - {hash_id} Removed Successfully from {config.OTP_COLLECTION} Collection in {config.OTP_DB}')
            return 220
        except Exception:
            log(f'[ ERROR ] Unable To Remove Hash_ID - {hash_id} from {config.OTP_COLLECTION} Collection in {config.OTP_DB}')
            return 203

    def __del__(self):
        self.client.close()  # release the open connection with the database
# This script reads a JSON file into a list
# and inserts it into a collection in a database.
import json
from pymongo import MongoClient

# connect to MongoDB
connection = MongoClient("mongodb://localhost:27017/")

# select the UnilPlan database and the Classes collection
db = connection.db_unilplan.classes

# open the .json file and load it into a list
with open('crawler/crawler/JSON_output_files/Courses.json', encoding='utf-8') as json_data:
    classes = json.load(json_data)
print(".json = ok")

# insert the data into the db (the JSON root is expected to be a list of documents)
db.insert_many(classes)
print("correctly added")

# close the connection to MongoDB
connection.close()
            replhosts[r].append(hp)

    if len(dbhosts) > 0:
        dbhosts = dbhosts.rstrip(',')
        ctx.instance.runtime_properties['dbhosts'] = dbhosts
        ctx.logger.info("Set dbhosts to ({})".format(dbhosts))

    ##################################################################
    # Initialize replica sets
    ##################################################################

    # for each replica set
    ctx.logger.info("replhosts size:{}".format(len(replhosts)))
    for k, v in replhosts.iteritems():
        ctx.logger.info("replhost:{}".format(k))
        if len(v) > 0:
            config = {'_id': k, 'members': []}
            for i, h in enumerate(v):
                config['members'].append({'_id': i, 'host': h})
            h, p = v[0].split(":")
            c = MongoClient(h, int(p))
            ctx.logger.info("initiating replicaset:{}".format(str(config)))
            try:
                c.admin.command("replSetInitiate", config)
            except Exception:
                pass
            c.close()
class Scraper(object): def __init__(self,tor, zip_code): """ Sets the data """ self._tor = tor # test tor self._zip_code = zip_code #Zillow database for housing description self._client = MongoClient('localhost', 27017) db = self._client.Zillow self._housing_description = db.housing_description #run the scraper self._get_zip_data() def _get_property_summary(self,soup): """ Needed to change this function a little from the scaper since Otherwise it hangs on some functions Given a soup it populates the results dic and returns it """ def parse_property(regex, property_): try: results[property_] = re.findall(regex, prop_summary)[0] except IndexError: results[property_] = None def parse_propetry2(string,property_): try: results[property_] = prop_summary.split(string)[1].split('"')[1] except IndexError: results[property_] = None prop_summary = soup.find("div", class_=constants.PROP_SUMMARY_CLASS) prop_summary = prop_summary.text results = {} parse_property(r"([\d\.]+) beds?", "bedrooms") parse_property(r"([\d\.]+) baths?", "bathrooms") parse_property(r"([\d,\.]+) sqft", "sqft") #these to lines don't always work it seems to hang #parse_property(r"((?:[A-Z]\w+ ?){1,}), [A-Z]{2}", "city") #parse_property(r"(?:[A-Z]\w+ ?){1,}, ([A-Z]{2})", "state") parse_propetry2('"city":','city') parse_propetry2('"state":','state') parse_property(r"[A-Z]{2} (\d{5}-?(?:\d{4})?)", "zipcode") return results def _get_price_tax_url(self, soup): """ Given the soup of the housing details html this will find and return the ajaxURL for both price history and tax history """ groups = soup.text.split('ajaxURL') price_history, tax_history = None, None for group in groups[1:-1]: group = group.split(";")[0] if 'divId:"hdp-price-history"' in group: price_history = "http://www.zillow.com" + group.split('"')[1:2][0] elif 'divId:"hdp-tax-history"' in group: tax_history = "http://www.zillow.com" + group.split('"')[1:2][0] return price_history, tax_history def _populate_price_and_tax_histories(self,soup, results): """ Change the code a little from scrapezillow.scraper Given a beatifulsoup it will use tor to request the data and populate the price and tax history """ #get price and tax urls price_url, tax_url = self._get_price_tax_url(soup) ##populate price and tax history html = self._tor.request(price_url) soup = BeautifulSoup(html) results["price_history"] = self._get_price_history(soup) html = self._tor.request(tax_url) soup = BeautifulSoup(html) results["tax_history"] = self._get_tax_history(soup) def _get_price_history(self,soup): """ Change the code a little from scrapezillow.scraper Given a beatifulsoup it will populate the price history """ data =[] try: table_body = soup.find('table') rows = table_body.find_all('tr') for row in rows: try: cols = row.find_all('td') cols = [ele for ele in cols] date = cols[0].get_text() event = cols[1].get_text() price_span = cols[2].find('span') if not price_span: price = None else: price = price_span.get_text() data.append([date, event, price]) except: pass # undesired data except: pass #no table found return data def _get_tax_history(self,soup): """ Change the code a little from scrapezillow.scraper Given a beatifulsoup it will populate the tax history """ data = [] try: table_body = soup.find('table') rows = table_body.find_all('tr') for row in rows: try: cols = row.find_all('td') cols = [ele for ele in cols] date = cols[0].get_text() tax = cols[1].contents[0] assessment = cols[3].get_text() data.append([date, tax, assessment]) except: pass # undesired data except: pass ##No table found 
return data def _scrape(self,html,url): """ Scrape a specific zillow home. Takes either a url or a zpid. If both/neither are specified this function will throw an error. """ soup = BeautifulSoup(html, 'html.parser') results = self._get_property_summary(soup) results['url'] = url facts = scraper._parse_facts(scraper._get_fact_list(soup)) results.update(**facts) results.update(**scraper._get_sale_info(soup)) results["description"] = scraper._get_description(soup) results["photos"] = scraper._get_photos(soup) self._populate_price_and_tax_histories(soup, results) return results def _has_next(self,soup): """ Looks for the Next button on the webpage to see if more houses """ if soup == None: return True return len(soup.findAll("li", { "class" : "zsg-pagination-next" }))==1 def _get_house_links(self,soup): """ Adds house details into the mongo database """ for address in soup.findAll("dt", { "class" : "property-address" }): url = 'http://www.zillow.com'+address.find('a')['href'] # Look up if already in the database if self._housing_description.find_one({'url':url})==None: try: print url html = self._tor.request(url) self._housing_description.insert(self._scrape(html,url)) #sleep(1) except: ## scrape failed ## missing data so just add url to not try to add again self._housing_description.insert({'url':url}) def _get_zip_data(self): """ Finds the housing data for the zip """ soup = None page = 1 print 'Zip code:',self._zip_code,' started.' while(self._has_next(soup)): url = 'http://www.zillow.com/homes/for_rent/'+str(self._zip_code)+'_rb/'+str(page)+'_p' r = self._tor.request(url) print 'Url received: ', url soup = BeautifulSoup(r) self._get_house_links(soup) page += 1 sleep(1) print 'Zip code:',self._zip_code,' finished.' self._client.close()
class GenerateMongo(object): mongo_host = "172.31.10.53" mongo_port = 27017 mongo = None mongodb = None def __init__(self): # self.logger = logger self.mongo = MongoClient(host=self.mongo_host, port=self.mongo_port) self.mongodb = self.mongo["rap"] def get_req_param(self, action_id=155): req = self.mongodb["tb_request_parameter_list_mapping"].find( {"action_id": action_id}, { "parameter_id": 1, "_id": 0 }) req_arr = list(map(lambda x: x["parameter_id"], req)) param = self.mongodb["tb_parameter"].find({"id": {"$in": req_arr}}) return list( map( lambda x: { "id": x["id"], "name": x["name"], "identifier": x["identifier"], "data_type": x["data_type"] }, param)) def get_res_param(self, action_id=155): res = self.mongodb["tb_response_parameter_list_mapping"].find( {"action_id": action_id}, { "parameter_id": 1, "_id": 0 }) # print(res) res_arr = list(map(lambda x: x["parameter_id"], res)) # print(res_arr) param = self.mongodb["tb_parameter"].find({"id": {"$in": res_arr}}) return list( map( lambda x: { "id": x["id"], "name": x["name"], "identifier": x["identifier"], "data_type": x["data_type"] }, param)) def get_complex_param(self, complex_parameter_id=7981): complex = self.mongodb["tb_complex_parameter_list_mapping"].find( {"complex_parameter_id": complex_parameter_id}, { "parameter_id": 1, "_id": 0 }) complex_arr = list(map(lambda x: x["parameter_id"], complex)) param = self.mongodb["tb_parameter"].find({"id": {"$in": complex_arr}}) return list( map( lambda x: { "id": x["id"], "name": x["name"], "identifier": x["identifier"], "data_type": x["data_type"] }, param)) def recursion_param(self, param): map = {} for p in param: data_type = p["data_type"] identifier = p["identifier"] if (not data_type == "object" and not data_type == "array<object>"): if ("array<" in data_type): map[identifier] = [str(p["name"])] else: map[identifier] = p["name"] elif (data_type == "object"): sub_param = self.get_complex_param(p["id"]) sub_map = self.recursion_param(sub_param) map[identifier] = sub_map elif (data_type == "array<object>"): sub_param = self.get_complex_param(p["id"]) sub_map = self.recursion_param(sub_param) map[identifier] = [sub_map] return map def insert_mongo(self, doc): actionId = doc["actionId"] self.mongodb["my_rap"].remove({"actionId": actionId}) self.mongodb["my_rap"].insert(doc) # self.mongodb["my_rap"].update({"actionId":actionId}, {"$set": doc}, upsert=True) def __del__(self): self.mongo.close()
def transform(self, X, **transform_params): # connect to db mongoClient = MongoClient('localhost', 27017) ffCorpus = mongoClient.FACTFEELCorpus # Sentence's table documentCollection = ffCorpus.documents temp = [] if self.featureSetConfiguration == 0: # not active temp = [[ 0 for f in sorted(SubjectivityLexiconTransformer.features.keys()) ] for s in X] elif self.featureSetConfiguration == 1: # (weaksubj|strongsubj)-(both|neutral|positive|negative) for document in X: features_to_set = { 'weaksubj-both': False, 'weaksubj-neutral': False, 'weaksubj-positive': False, 'weaksubj-negative': False, 'strongsubj-both': False, 'strongsubj-neutral': False, 'strongsubj-positive': False, 'strongsubj-negative': False } currentSentence = documentCollection.find_one( {'document_id': document}) raw_sentence = currentSentence['raw'].lower() features_in_sentence = sc.analyse_sentence(raw_sentence) if features_in_sentence != []: for feat in features_in_sentence: features_to_set[feat] = True test = [ features_to_set[key] for key in sorted(features_to_set.keys()) ] temp.append(test) #TODO : 2 elif self.featureSetConfiguration == 2: # weak|strong for document in X: features_to_set = {'weaksubj': False, 'strongsubj': False} currentSentence = documentCollection.find_one( {'document_id': document}) raw_sentence = currentSentence['raw'].lower() features_in_sentence = sc.analyse_sentence(raw_sentence) if features_in_sentence != []: for feat in features_in_sentence: if re.findall('weaksubj', feat) != []: features_to_set['weaksubj'] = True elif re.findall('strongsubj', feat) != []: features_to_set['strongsubj'] = True test = [ features_to_set[key] for key in sorted(features_to_set.keys()) ] temp.append(test) #TODO : 3 (polar | neutral) features = np.array(temp) #print('SubjectivityLexiconTransformer:' , self.featureSetConfiguration,' ### X:',len(X),'len(features):',len(features)) mongoClient.close() return features
def main(): reload(sys) sys.setdefaultencoding('utf-8') companies = ['apple', 'google', 'samsung'] db_names = ['twitter_apple_db', 'twitter_google_db', 'twitter_samsung_db'] collection_names = [ 'twitter_apple_collection', 'twitter_google_collection', 'twitter_samsung_collection' ] #output dir outPath = "/data/analysis_output" for i in range(len(companies)): print "Retrieving " + companies[i] + " collection from db..." outCSV = outPath + "/" + companies[i] + "_result.csv" #connect to mongo client = MongoClient() db = client[db_names[i]] collection = db[collection_names[i]] df = pd.DataFrame(list(collection.find())) print 'The dimension of data frame is ' + str( df.shape[0]) + ' x ' + str(df.shape[1]) #print df.head(5) #sys.exit() ################################### #if reading from raw json files #opts = parse_args() #if opts.debug: # logging.basicConfig(level=logging.DEBUG) #else: # logging.basicConfig(level=logging.INFO) #get all the json files in the directory and aggregate in one df #fmask = os.path.join(opts.directory[0], '*.json') #df = get_merged_json(glob.glob(fmask),ignore_index=True) ################################### df_norm = dfCleanUp(df) df_norm = applyFilters(df_norm) #apply sentiment scoring print "Applying sentiment analysis..." compound = [] pos = [] neg = [] neu = [] for sentence in df_norm.text: ss = sentimentScoring(sentence) compound.append(ss['compound']) pos.append(ss['pos']) neg.append(ss['neg']) neu.append(ss['neu']) df_norm['ss_compound'] = compound df_norm['ss_pos'] = pos df_norm['ss_neg'] = neg df_norm['ss_neu'] = neu #add the brand company name to field df_norm['brand'] = companies[i] #print(sentence) #print(ss) #output to csv print "Writing to csv..." writeDFtoCSV(df_norm, outCSV) print "done." client.close()
mongo_host = os.getenv('MONGO_HOST') or 'localhost' logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) while True: mongo = MongoClient(mongo_host) db = mongo['okdiariocom-bot'] delete_older_than(12, db) new_comment_count = 0 post_count = 0 for post in db['posts'].find(): post_count = post_count + 1 comments = get_comments(post['url']) for comment in comments: query = db['comments'].find_one({ 'comment_id': comment['comment_id'], 'post_id': comment['post_id'] }) if query is None: if comment['posted_at'] > (time.time() - 12 * 3600): logging.info('New comment: ' + comment['body']) db['comments'].insert_one(comment) new_comment_count = new_comment_count + 1 mongo.close() logging.info( str(new_comment_count) + ' new comments found. ' + str(post_count) + ' posts processed.') time.sleep(13 * 60)
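# An optional one-off sketch (an assumption, not part of the original bot): the polling
# loop above looks up every comment by (comment_id, post_id), so a compound index on the
# same 'okdiariocom-bot' database and 'comments' collection would keep that check fast.
# Run once against the same MONGO_HOST; add unique=True only if duplicates are impossible.
import os
from pymongo import ASCENDING, MongoClient

client = MongoClient(os.getenv('MONGO_HOST') or 'localhost')
client['okdiariocom-bot']['comments'].create_index(
    [('comment_id', ASCENDING), ('post_id', ASCENDING)])  # speeds up the duplicate check
client.close()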
class MongoDB:
    def __init__(self, db_config=None, collection=None, db=None):
        global config
        self.client = None
        self.db = None
        self.collection = None
        self.connected = False

        if db_config is None:
            if os.environ.get('APP_RUNTIME_CONTEXT') == 'dev':
                db_config = config['mongo.dev']
                self.environ = 'dev'
            elif os.environ.get('APP_RUNTIME_CONTEXT') == 'qa':
                db_config = config['mongo.qa']
                self.environ = 'qa'
            else:
                db_config = config['mongo.prod']
                self.environ = 'prod'
            log.info('Using mongo.{} configuration.'.format(self.environ))
        else:
            log.info('Using db_config provided: {}'.format(db_config))

        if db_config:
            self.client = MongoClient(
                'mongodb+srv://{}:{}@{}/{}?retryWrites=true&w=majority'.format(
                    db_config['username'], db_config['password'],
                    db_config['host'], db_config['database']))

            # -- set up database: prefer the explicit `db` argument, then the config
            if db:
                self.db = self.client[db]
            elif db_config.get('database'):
                self.db = self.client[db_config['database']]
            else:
                self.db = self.client['admin']

            # -- set up collection: prefer the explicit argument, then the config
            if collection:
                self.collection = self.db[collection]
            elif db_config.get('collection'):
                self.collection = self.db[db_config['collection']]
            else:
                self.collection = self.db['system.version']

            self.connected = True

        if self.connected:
            log.info('CONNECTED to {}@{}'.format(self.db.name,
                                                 db_config['host']))
        else:
            log.info('NOT CONNECTED. (db={}, host={})'.format(
                db_config['database'], db_config['host']))

    def close(self):
        if self.status():
            self.client.close()
            self.connected = False
            log.info('DISCONNECTED.')

    def status(self):
        r = False
        if self.client.server_info():
            if isinstance(self.db.name, str):
                r = self.connected = True
        return r
class MongoAnalysis(object): def __init__(self, tbname=None, saved_file_type=None): if tbname is None: raise ValueError("Not get a tbname!") self.tbname = tbname self.title = self.get_title() self.conn = MongoClient("localhost:27017", connect=True) self.db = self.conn['DBMovie'] self.collection = self.db[self.tbname] self.style = Style(title_color='#fff', title_pos="center", width=1200, height=600, background_color='#404a59') self.saved_file_type = saved_file_type self.path = './img/{}/finished'.format(self.tbname) if not os.path.exists(self.path): os.makedirs(self.path) def get_title(self): ''' get the title of the movie :return: ''' search_number = re.compile(r"\d+").findall(self.tbname)[0] title = GetMvInfo().get_title(search_number) if not title: title = self.tbname return title def GetOneCol(self, name, method=None): ''' give a name to search the column name in mongodb :param name: colname,such as "comment_content". :param method: if method is None,we should remove the null data. if method is 'average',we should fill the null data with the mean value. :return: nonempty set ''' if method is None: return [ comments[name].strip() for comments in self.collection.find() if comments[name] is not None ] elif method == "average": com_lst = [ comments[name].strip() for comments in self.collection.find() if comments[name] is not None ] aver = reduce(lambda x, y: x + y, map(int, com_lst)) / len(com_lst) result = [] for comments in self.collection.find(): if comments[name]: result.append(int(comments[name].strip())) else: result.append(aver) return result def AreaMap(self): ''' :return: a area map on chinese map downloaded maps: pip install echarts-countries-pypkg pip install echarts-china-provinces-pypkg pip install echarts-china-cities-pypkg pip install echarts-china-counties-pypkg pip install echarts-china-misc-pypkg pip install echarts-china-kingdom-pypkg ''' # filter other countries' users city = dict(Counter(self.GetOneCol(name="city"))) filter_city = { key: city[key] for key in city.keys() if re.compile(r'[\u4e00-\u9fa5]+').search(key[0]) } key_map = [ "河北", "山西", "辽宁", "吉林", "黑龙江", "江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", "广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾" ] k_lst, v_lst = [], [] for key in sorted(list(filter_city.keys())): v_lst.append(filter_city[key]) if "," in key: key = key.split(",")[0].strip() for province in key_map: if province in key: key = key.replace(province, "").strip() k_lst.append(key) v_max = max(v_lst) geo = Geo(self.title, "数据来源:豆瓣电影", **self.style.init_style) geo.add( "", k_lst, v_lst, type='effectScatter', #other styles:scatter or heatmap visual_range=[0, v_max], visual_range_text="#fff", symbol_size=15, is_visualmap=True) if self.saved_file_type is None: geo.render(os.path.join(self.path, "AreaMap.png")) elif self.saved_file_type == "html": geo.render(os.path.join(self.path, "AreaMap.html")) def GetStars(self, star_score): ''' search for grades with star_score :param star_score: int :return: a list of grades ''' breakpoints = [11, 21, 31, 41, 51] grades = ["一星", "二星", "三星", "四星", "五星"] return grades[bisect.bisect(breakpoints, star_score)] def StarMap(self): ''' Get a pie map :return: ''' score = dict( Counter( map(self.GetStars, self.GetOneCol(name="comment_score", method="average")))) attr, value = Geo.cast(score) pie = Pie(self.title, "数据来源:豆瓣电影", title_pos="center", width=900) pie.add("", attr, value, center=[50, 50], is_random=True, radius=[30, 75], rosetype="area", is_legend_show=False, is_label_show=True) if self.saved_file_type 
is None: pie.render(os.path.join(self.path, "StarMap.png")) elif self.saved_file_type == "html": pie.render(os.path.join(self.path, "StarMap.html")) def Cast(self, name, method=None, message=None, max_bin=100): ''' casts data ,and filters data with stopwords :param name: colname :param method: to decide the func returns a dict or tuple(attr,value) :param message: a message the user gives,if not None, will be adding to stopwords :param max_bin: the max number of words on wordcloud :return: ''' string = "".join(self.GetOneCol(name)) brokewords = map( str.strip, open('./config/stopwords/stopwords.txt', "r", encoding="utf-8").readlines()) if message: brokewords = itertools.chain(brokewords, message.split(",")[:]) stopwords = "".join(brokewords) lis = dict( Counter([ tag.strip() for tag in analyse.extract_tags(string, max_bin) if tag.strip() not in stopwords ])) lis = sorted(lis.items(), key=lambda x: x[1], reverse=True) if method is None: return Geo.cast(lis) elif method == "dict": return {k[0]: k[1] for k in lis} def WordCloudMap(self, message=None): ''' a high-class wordcloud :param message: messages that user gives :return: ''' from wordcloud import WordCloud backgroud_path = './img/{}/background/{}.png'.format( self.tbname, self.tbname) if not os.path.exists(backgroud_path): backgroud_path = './img/sample/1.jpg' backgroud_image = plt.imread(backgroud_path) cloud = WordCloud( width=1024, height=768, font_path='./config/fonts/simhei.ttf', background_color='white', # 设置背景色 mask=backgroud_image, # 词云形状False max_words=100, # 允许最大词汇 max_font_size=400, # 最大号字体 random_state=50 # 旋转角度 ) if message is None: text = self.Cast(name="comment_content", method="dict") else: message.replace(",", ",") if "," not in message: message = message + "," text = self.Cast(name="comment_content", method="dict", message=message) cloud.fit_words(text) # 产生词云 cloud.recolor(color_func=ImageColorGenerator(backgroud_image)) plt.figure() plt.imshow(cloud) plt.axis('off') cloud.to_file(os.path.join(self.path, "wordcloud.png")) def SimpleWordCloudMap(self): ''' a lower-class wordcloud :return: ''' from pyecharts.charts.wordcloud import WordCloud attr, value = self.Cast(name="comment_content") wordcloud = WordCloud(self.title, "数据来源:豆瓣电影", title_pos="center", width=1200, height=600) wordcloud.add("", attr, value, shape="diamond", word_size_range=[20, 100]) if self.saved_file_type is None: wordcloud.render(os.path.join(self.path, "wordcloud.png")) elif self.saved_file_type == "html": wordcloud.render(os.path.join(self.path, "wordcloud.html")) def close(self): self.conn.close()
class TransPS(): def __init__(self): self.cfg = configparser.ConfigParser() self.cfg.read("config.ini") cmdb_db = self.cfg.get("cmdb", "db") cmdb_str = self.cfg.get("cmdb", "conn_str") self.client = MongoClient(cmdb_str) self.db = self.client[cmdb_db] def format_server_name(self, df, col_name): df[col_name] = df[col_name].str.lower().map( lambda x: x.split('.cargosmart.com')[0]) def trans_size_to_mb(self, size_str): result = re.search(r'(?P<size>\d+)\s*(?P<unit>\w*)', size_str).groupdict() g_unit = ['G', 'GB'] if result.get('unit') in g_unit: size_m = str(int(result.get('size')) * 1024) else: size_m = result.get('size') return size_m def get_osvendor(self, osversion): if 'windows' in osversion.lower().replace(' ', ''): osvendor = 'windows' elif 'redhat' in osversion.lower().replace(' ', ''): osvendor = 'redhat' elif 'centos' in osversion.lower().replace(' ', ''): osvendor = 'centos' elif 'esx' in osversion.lower().replace(' ', ''): osvendor = 'esx' else: osvendor = '' return osvendor def format_env(self, in_env_name): if 'PRE' in in_env_name.upper(): env_name = 'PP' elif 'MAINT' in in_env_name.upper(): env_name = 'PM' else: env_name = in_env_name.upper() return env_name def write_to_cmdb(self, coll_name, df): coll = self.db[coll_name] result = coll.delete_many({}) logger.info("%s deleted %s rows" % (coll_name, str(result.deleted_count))) result = coll.insert_many(json.loads(df.to_json(orient='records'))) logger.info("%s inserted %s rows" % (coll_name, str(len(result.inserted_ids)))) def main(self): # server excel_server_coll = self.db['excel_server'] vcenter_server_coll = self.db['vcenter_server'] oem_server_coll = self.db['oem_server'] vcenter_vm_coll = self.db['vcenter_virtualmachine'] excel_server_df = pd.DataFrame(list(excel_server_coll.find())) vcenter_server_df = pd.DataFrame(list(vcenter_server_coll.find())) oem_server_df = pd.DataFrame(list(oem_server_coll.find())) vcenter_vm_df = pd.DataFrame(list(vcenter_vm_coll.find())) self.format_server_name(excel_server_df, 'excel_name') self.format_server_name(vcenter_server_df, 'vc_name') self.format_server_name(vcenter_vm_df, 'vc_name') self.format_server_name(oem_server_df, 'oem_name') # get oem physical server names by (oem servers - vcenter vms) tempdf = pd.merge(oem_server_df, vcenter_vm_df, left_on='oem_name', right_on='vc_name', how='left') oem_ps_names = tempdf.loc[tempdf['vc_name'].isnull(), 'oem_name'] # get all physical server by union all excel, vcenter, oem physical server # names, and distinct excel_ps_names = excel_server_df['excel_name'] vcenter_ps_names = vcenter_server_df['vc_name'] ps_names = concat( [concat([excel_ps_names, vcenter_ps_names]), oem_ps_names]).unique() ps_names_df = pd.DataFrame(ps_names, columns=['ps_name']) join1 = pd.merge(ps_names_df, excel_server_df, left_on='ps_name', right_on='excel_name', how='left') join2 = pd.merge(join1, vcenter_server_df, left_on='ps_name', right_on='vc_name', how='left') ps_df = pd.merge(join2, oem_server_df, left_on='ps_name', right_on='oem_name', how='left').fillna(value='') ps_df['merge_name'] = ps_df['ps_name'] # pdb.set_trace() # delete the columns delete_cols = [ mongoid_col for mongoid_col in ps_df.columns if '_id' in mongoid_col ] delete_cols += ['excel_name', 'vc_name', 'oem_name'] ps_df = ps_df.drop(delete_cols, axis=1) # set cpu num , excel > vc > oem ps_df['merge_cpu_num'] = ps_df['vc_cpu_num'] ps_df.loc[ps_df.merge_cpu_num == '', 'merge_cpu_num'] = ps_df.loc[ps_df.merge_cpu_num == '', 'oem_cpu_num'] ps_df.loc[ps_df.merge_cpu_num == '', 'merge_cpu_num'] = 
ps_df.loc[ps_df.merge_cpu_num == '', 'excel_cpu_num'] # set cpu type, excel > vc > oem ps_df['vc_cpu_type'] = ps_df['vc_cpu_type'].map( lambda x: re.sub(r'\s\s+', ' ', str(x).upper().split(' @ ')[0].strip())) ps_df['excel_cpu_type'] = ps_df['excel_cpu_type'].map( lambda x: str(x).upper().strip().replace( 'ULTRASPARC IIII', ' ULTRASPARC-IIII').replace( 'INTEL(R) XEON®', 'INTEL(R) XEON(R)').replace( 'INTEL XEON', 'INTEL(R) XEON(R)').replace( 'INTEL® PENTIUM®', 'INTEL(R) PENTIUM(R)')) ps_df['merge_cpu_type'] = ps_df['vc_cpu_type'] ps_df.loc[ps_df.merge_cpu_type == '', 'merge_cpu_type'] = ps_df.loc[ps_df.merge_cpu_type == '', 'excel_cpu_type'] # set cpu core, excel > vc > oem ps_df['merge_cpu_core'] = ps_df['vc_cpu_core'] ps_df.loc[ps_df['merge_cpu_core'] == '', 'merge_cpu_core'] = ps_df.loc[ps_df['merge_cpu_core'] == '', 'oem_cpu_num'] ps_df.loc[ps_df['merge_cpu_core'] == '', 'merge_cpu_core'] = ps_df.loc[ps_df['merge_cpu_core'] == '', 'excel_cpu_core'] # set cpu speed, excel > vc > oem ps_df['merge_cpu_speed'] = ps_df["vc_cpu_speedGHz"] ps_df['merge_cpu_cache_size'] = ps_df['excel_cpu_cache_size'] ps_df['merge_cpu_thread'] = ps_df['vc_cpu_thread'] # set memory size, priority: vc > oem > excel ps_df.loc[ ps_df['excel_memory_size'] != '', 'excel_memory_size'] = ps_df.loc[ ps_df['excel_memory_size'] != '', 'excel_memory_size'].map(lambda x: self.trans_size_to_mb(x)) ps_df['merge_mem_size'] = ps_df["vc_memory_size"] ps_df.loc[ps_df.merge_mem_size == '', 'merge_mem_size'] = ps_df.loc[ps_df.merge_mem_size == '', 'oem_memory_size'] ps_df.loc[ps_df['merge_mem_size'] == '', 'merge_mem_size'] = ps_df.loc[ps_df['merge_mem_size'] == '', 'excel_memory_size'] # set system disk, excel ps_df['merge_system_disk'] = ps_df['excel_system_disk'] ps_df['merge_external_disk'] = ps_df[ 'excel_external_disk'].str.replace('Nil', '') # set brand name, excel ps_df['merge_brand_name'] = ps_df['vc_brand_name'] # set model , excel ps_df['merge_model_name'] = ps_df["vc_model_name"] # pdb.set_trace() # set os version , excel > oem ps_df['merge_osversion_name'] = ps_df["vc_os_version"] ps_df.loc[ps_df['merge_osversion_name'] == '', 'merge_osversion_name'] = ps_df.loc[ ps_df['merge_osversion_name'] == '', 'oem_osversion_name'] # set os vendor ps_df['merge_osvendor'] = ps_df['merge_osversion_name'].map( lambda x: self.get_osvendor(x)) # set environment by excel ps_df['merge_env_purpose'] = ps_df['excel_env_purpose'].map( lambda x: self.format_env(x)) ps_df['merge_environment'] = ps_df['vc_env'] # set others by excel # ps_df['merge_fiber_card2_model'] = ps_df['excel_fiber_card2_model'] # ps_df['merge_fiber_card3_model'] = ps_df[ # 'excel_fiber_card3_model'].str.replace(' ', '') # ps_df['merge_fiber_card_model'] = ps_df[ # 'excel_fiber_card_model'].str.upper().replace('NIL', '') ps_df['merge_fiber_card_model'] = ps_df['vc_fiber_hba_device'] # ps_df['merge_fiber_card_num'] = ps_df[ # 'excel_fiber_card_num'].str.replace(' ', '').fillna('') ps_df['merge_fiber_card_num'] = ps_df['vc_fiber_hba_num'] ps_df['merge_fiber_port'] = ps_df['excel_fiber_port'] ps_df.loc[ps_df['excel_fiber_port'] != '', 'excel_fiber_port'] = ps_df.loc[ ps_df['excel_fiber_port'] != '', 'excel_fiber_port'].map(lambda x: str(int(x))) ps_df['merge_hw_model_eol_date'] = ps_df['excel_hw_model_eol_date'] ps_df['merge_ip'] = ps_df['vc_ip'] ps_df['merge_lan_port'] = ps_df['excel_lan_port'] ps_df.loc[ps_df['excel_lan_port'] != '', 'excel_lan_port'] = ps_df.loc[ ps_df['excel_lan_port'] != '', 'excel_lan_port'].map(lambda x: str(int(x))) ps_df['merge_location'] = 
ps_df['excel_location'] ps_df['merge_maint_from'] = ps_df['excel_maint_from'].map( lambda x: x.strip()) ps_df['merge_maint_status'] = ps_df['excel_maint_status'] ps_df['merge_maint_to'] = ps_df['excel_maint_to'] ps_df['merge_maint_vendor'] = ps_df['excel_maint_vendor'] ps_df['merge_os_service_pack'] = ps_df['excel_os_service_pack'].map( lambda x: str(x)) ps_df['merge_power_port'] = ps_df['excel_power_port'] ps_df.loc[ps_df['merge_power_port'] != '', 'merge_power_port'] = ps_df.loc[ ps_df['merge_power_port'] != '', 'merge_power_port'].map(lambda x: str(int(x))) ps_df['merge_power_status'] = ps_df['vc_power_status'].str.upper() ps_df['merge_rack_location'] = ps_df['excel_rack_location'].map( lambda x: x.upper().replace(' ', '')) ps_df['merge_serial_num'] = ps_df['excel_serial_num'] ps_df['merge_server_function'] = ps_df['excel_server_function'] ps_df['merge_server_type'] = ps_df['excel_server_type'] ps_df['merge_check_by'] = ps_df['excel_check_by'] ps_df['merge_check_date'] = ps_df['last on-site check date'] merge_cols = [ col.lower() for col in ps_df.columns if 'merge' in col.lower() ] ps_df = ps_df[merge_cols] # write to mongodb self.write_to_cmdb(coll_name='merge_phisical_server', df=ps_df) self.client.close()
class MongoDB(Report): """Stores report in MongoDB.""" order = 9999 # Mongo schema version, used for data migration. SCHEMA_VERSION = "1" def connect(self): """Connects to Mongo database, loads options and set connectors. @raise CuckooReportError: if unable to connect. """ host = self.options.get("host", "127.0.0.1") port = self.options.get("port", 27017) db = self.options.get("db", "cuckoo") try: self.conn = MongoClient( host, port=port, username=self.options.get("username", None), password=self.options.get("password", None), authSource=db) self.db = self.conn[db] except TypeError: raise CuckooReportError("Mongo connection port must be integer") except ConnectionFailure: raise CuckooReportError("Cannot connect to MongoDB") def debug_dict_size(self, dct, parent_key=False): if type(dct) == list: dct = dct[0] if isinstance(dct, str) and parent_key: dct = {parent_key: dct} if not isinstance(dct, str): totals = dict((k, 0) for k in dct) def walk(root, key, val): if isinstance(val, dict): for k, v in val.iteritems(): walk(root, k, v) elif isinstance(val, (list, tuple, set)): for el in val: walk(root, None, el) elif isinstance(val, basestring): totals[root] += len(val) for key, val in dct.iteritems(): walk(key, key, val) return sorted(totals.items(), key=lambda item: item[1], reverse=True) @classmethod def ensure_valid_utf8(cls, obj): """Ensures that all strings are valid UTF-8 encoded, which is required by MongoDB to be able to store the JSON documents. @param obj: analysis results dictionary. """ if not obj: return items = [] if isinstance(obj, dict): items = obj.iteritems() elif isinstance(obj, list): items = enumerate(obj) for k, v in items: # This type check is intentionally not done using isinstance(), # because bson.binary.Binary *is* a subclass of bytes/str, and # we do not want to convert that. if type(v) is str: try: v.decode('utf-8') except UnicodeDecodeError: obj[k] = u''.join(unichr(ord(_)) for _ in v).encode('utf-8') else: cls.ensure_valid_utf8(v) def run(self, results): """Writes report. @param results: analysis results dictionary. @raise CuckooReportError: if fails to connect or write to MongoDB. """ # We put the raise here and not at the import because it would # otherwise trigger even if the module is not enabled in the config. if not HAVE_MONGO: raise CuckooDependencyError("Unable to import pymongo " "(install with `pip install pymongo`)") self.connect() # Set mongo schema version. # TODO: This is not optimal becuase it run each analysis. Need to run # only one time at startup. if "cuckoo_schema" in self.db.collection_names(): if self.db.cuckoo_schema.find_one( )["version"] != self.SCHEMA_VERSION: CuckooReportError( "Mongo schema version not expected, check data migration tool" ) else: self.db.cuckoo_schema.save({"version": self.SCHEMA_VERSION}) # Create a copy of the dictionary. This is done in order to not modify # the original dictionary and possibly compromise the following # reporting modules. 
report = dict(results) if "network" not in report: report["network"] = {} # Add screenshot paths report["shots"] = [] shots_path = os.path.join(self.analysis_path, "shots") if os.path.exists(shots_path): shots = [ shot for shot in os.listdir(shots_path) if shot.endswith(".jpg") ] for shot_file in sorted(shots): shot_path = os.path.join(self.analysis_path, "shots", shot_file) screenshot = File(shot_path) if screenshot.valid(): # Strip the extension as it's added later # in the Django view report["shots"].append(shot_file.replace(".jpg", "")) # Store chunks of API calls in a different collection and reference # those chunks back in the report. In this way we should defeat the # issue with the oversized reports exceeding MongoDB's boundaries. # Also allows paging of the reports. new_processes = [] for process in report.get("behavior", {}).get("processes", []) or []: new_process = dict(process) chunk = [] chunks_ids = [] # Loop on each process call. for index, call in enumerate(process["calls"]): # If the chunk size is 100 or if the loop is completed then # store the chunk in MongoDB. if len(chunk) == 100: to_insert = {"pid": process["process_id"], "calls": chunk} chunk_id = self.db.calls.insert(to_insert) chunks_ids.append(chunk_id) # Reset the chunk. chunk = [] # Append call to the chunk. chunk.append(call) # Store leftovers. if chunk: to_insert = {"pid": process["process_id"], "calls": chunk} chunk_id = self.db.calls.insert(to_insert) chunks_ids.append(chunk_id) # Add list of chunks. new_process["calls"] = chunks_ids new_processes.append(new_process) # Store the results in the report. report["behavior"] = dict(report.get("behavior", {})) report["behavior"]["processes"] = new_processes # Calculate the mlist_cnt for display if present to reduce db load if "signatures" in results: for entry in results["signatures"]: if entry["name"] == "ie_martian_children": report["mlist_cnt"] = len(entry["data"]) if entry["name"] == "office_martian_children": report["f_mlist_cnt"] = len(entry["data"]) # Other info we want quick access to from the web UI if results.has_key("virustotal") and results["virustotal"] and results[ "virustotal"].has_key( "positives") and results["virustotal"].has_key("total"): report["virustotal_summary"] = "%s/%s" % ( results["virustotal"]["positives"], results["virustotal"]["total"]) if results.get("suricata", False): keywords = ("tls", "alerts", "files", "http", "ssh", "dns") keywords_dict = ("suri_tls_cnt", "suri_alert_cnt", "suri_file_cnt", "suri_http_cnt", "suri_ssh_cnt", "suri_dns_cnt") for keyword, keyword_value in zip(keywords, keywords_dict): if results["suricata"].get(keyword, 0): report[keyword_value] = len(results["suricata"][keyword]) # Create an index based on the info.id dict key. Increases overall scalability # with large amounts of data. # Note: Silently ignores the creation if the index already exists. 
self.db.analysis.create_index("info.id", background=True) #trick for distributed api if results.get("info", {}).get("options", {}).get("main_task_id", ""): report["info"]["id"] = int( results["info"]["options"]["main_task_id"]) analyses = self.db.analysis.find( {"info.id": int(report["info"]["id"])}) if analyses.count() > 0: log.debug("Deleting analysis data for Task %s" % report["info"]["id"]) for analysis in analyses: log.info(analysis) for process in analysis["behavior"]["processes"]: for call in process["calls"]: self.db.calls.remove({"_id": ObjectId(call)}) self.db.analysis.remove({"_id": ObjectId(analysis["_id"])}) log.debug("Deleted previous MongoDB data for Task %s" % report["info"]["id"]) self.ensure_valid_utf8(report) # Store the report and retrieve its object id. try: self.db.analysis.save(report, check_keys=False) except InvalidDocument as e: parent_key, psize = self.debug_dict_size(report)[0] if not self.options.get("fix_large_docs", False): # Just log the error and problem keys log.error(str(e)) log.error("Largest parent key: %s (%d MB)" % (parent_key, int(psize) / MEGABYTE)) else: # Delete the problem keys and check for more error_saved = True size_filter = MONGOSIZELIMIT while error_saved: if type(report) == list: report = report[0] try: if type(report[parent_key]) == list: for j, parent_dict in enumerate( report[parent_key]): child_key, csize = self.debug_dict_size( parent_dict, parent_key)[0] if csize > size_filter: if parent_key == child_key: log.warn( "results['%s'] deleted due to size: %s" % (parent_key, csize)) del report[parent_key] break else: log.warn( "results['%s']['%s'] deleted due to size: %s" % (parent_key, child_key, csize)) del report[parent_key][j][child_key] else: child_key, csize = self.debug_dict_size( report[parent_key], parent_key)[0] if csize > size_filter: log.warn( "else - results['%s']['%s'] deleted due to size: %s" % (parent_key, child_key, csize)) del report[parent_key][child_key] try: self.db.analysis.save(report, check_keys=False) error_saved = False except InvalidDocument as e: parent_key, psize = self.debug_dict_size(report)[0] log.error(str(e)) log.error("Largest parent key: %s (%d MB)" % (parent_key, int(psize) / MEGABYTE)) size_filter = size_filter - MEGABYTE except Exception as e: log.error("Failed to delete child key: %s" % str(e)) error_saved = False self.conn.close()
def _clear_database(self): client = MongoClient('localhost', 27017) client.drop_database('user') client.close()
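# A hypothetical alternative sketch (not from the original helper): MongoClient also works
# as a context manager, which closes the connection automatically and removes the need for
# the explicit close() call used above. It assumes the same local server and 'user' database.
from pymongo import MongoClient


def _clear_database_with_context_manager():
    with MongoClient('localhost', 27017) as client:
        client.drop_database('user')  # connection is closed when the with-block exits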
class RedditParser(object): def __init__(self, config_data_name, logfile): self.set_config_data(config_data_name) self.subreddits = self.input["subreddits"] self.client_id = self.input["client_id"] self.client_secret = self.input["client_secret"] self.password = self.input["password"] self.username = self.input["username"] self.user_agent = self.input["user_agent"] self.starting_point_date = self.input["starting_point_date"] self.reddit_API = praw.Reddit(client_id=self.client_id, client_secret=self.client_secret, password=self.password, username=self.username, user_agent=self.user_agent) self.logfile = logfile def set_config_data(self, config_data_name): with open(config_data_name) as config_file: self.input = json.load(config_file) def create_connection_db(self): #self.mongo_connection = MongoClient(os.environ['DB_PORT_27017_TCP_ADDR'], 27017) #Use for Docker self.mongo_connection = MongoClient() self.db = self.mongo_connection.reddit_data def close_connection_db(self): self.mongo_connection.close() def create_mongo_collection_and_index(self, collection_name): self.create_connection_db() if collection_name not in self.db.collection_names(): self.db[collection_name].create_index( [("_id", ASCENDING), ("created_date", DESCENDING), ("subreddit", ASCENDING)], name="reddit_items_index", unique=True, dropDups=1) self.close_connection_db() def get_submissions_and_comments(self, collection, testing_purpose): try: start_point_date = datetime.datetime.strptime( self.starting_point_date, "%Y-%m-%d %H:%M:%S") start_unix_time = time.mktime(start_point_date.timetuple()) if testing_purpose == True: end_point_date = start_point_date + timedelta(hours=1) end_unix_time = time.mktime(end_point_date.timetuple()) else: end_unix_time = None reddit = self.reddit_API submissions = [] comments = [] for subreddit_string in self.subreddits: logging.info( "Starting to fetch reddit data for subreddit %s from %s ... " % (subreddit_string, start_unix_time)) subreddit = reddit.subreddit(subreddit_string) subreddit_submissions = [ submission for submission in subreddit.submissions( start=start_unix_time, end=end_unix_time) ] subreddit_comments = [ comment for submission in subreddit_submissions for comment in submission.comments ] self.create_connection_db() for submission in subreddit_submissions: self.db[collection].save( RedditItem(submission.id, submission.title, None, subreddit_string, submission.created_utc).item) for comment in subreddit_comments: self.db[collection].save( RedditItem(comment.id, None, comment.body, subreddit_string, comment.created_utc).item) self.close_connection_db() logging.info( "Successfully update reddit data for subreddit %s from %s ... " % (subreddit_string, start_unix_time)) submissions += subreddit_submissions comments += subreddit_comments logging.info("Succesfully updated reddit data from %s ... " % start_unix_time) return submissions + comments except Exception as ex: logging.error( "Exception ocurred, please find out the exception message : %s" % ex.message)