def merge(mongo_collection=None, drop=True):
    ## merge docs
    if mongo_collection:
        mydisease = mongo_collection
    else:
        client = MongoClient()
        mydisease = client.mydisease.mydisease
    if drop:
        mydisease.drop()
    g = build_id_graph()
    # make initial primary d with all DOID docs
    db = MongoClient().mydisease.disease_ontoloy
    d = [{'_id': doc['_id'], 'disease_ontology': doc} for doc in db.find()]
    mydisease.insert_many(d)
    # fill in from other sources
    for db_name in tqdm(set(db_names) - {'disease_ontoloy'}):
        print(db_name)
        db = MongoClient().mydisease[db_name]
        if db.count() == 0:
            print("Warning: {} is empty".format(db))
        for doc in db.find():
            doids = get_equiv_doid(g, doc['_id'])
            for doid in doids:
                mydisease.update_one({'_id': doid}, {'$push': {db_name: doc}}, upsert=True)
def __init__(self, page, username):
    posts = MongoClient().blog.Aritical.find({
        'username': username
    }).sort('issuing_time', DESCENDING)
    self.total = posts.count()
    self.pages = int(self.total / 20)
    if self.total % 20 != 0:
        self.pages += 1
    if page == 1:
        self.has_prev = False
    else:
        self.has_prev = True
    if page == self.pages:
        self.has_next = False
    else:
        self.has_next = True
    self.next_num = page + 1
    self.page = page
    self.per_page = 20
    self.prev_num = page - 1
    self.current_num = self.total - (20 * (page - 1))
    if self.current_num > 20:
        self.current_num = 20
    self.item = []
    for i in range(self.current_num):
        self.item.append(posts[self.prev_num * 20 + i])
def run(host=None, db=None, coll=None, node=None, outgoing="true", incoming="true", undirected="true"):
    # Connect to the mongo collection.
    graph = MongoClient(host)[db][coll]

    outgoing = json.loads(outgoing)
    incoming = json.loads(incoming)
    undirected = json.loads(undirected)

    # Construct the query according to the given options.
    query = {"type": "link"}
    clauses = []
    oid = ObjectId(node)
    if outgoing or incoming:
        dirclauses = []
        orclause = {"$or": [{"undirected": {"$not": {"$exists": 1}}},
                            {"undirected": False}]}
        if outgoing:
            dirclauses.append({"source": oid})
        if incoming:
            dirclauses.append({"target": oid})
        clauses.append({"$and": [orclause, {"$or": dirclauses}]})
    if undirected:
        clauses.append({"$and": [{"undirected": True},
                                 {"$or": [{"source": oid}, {"target": oid}]}]})
    query["$or"] = clauses
    return json.dumps(graph.count(query))
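# Hedged usage sketch (not from the source): counting "link" documents that
# touch one node. The db/collection names and the ObjectId are placeholders.
print(run(host='localhost', db='graphs', coll='edges',
          node='5f0000000000000000000000',
          outgoing='true', incoming='false', undirected='true'))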
class MongoIterator(object):
    def __init__(self, uri, db, collection, skip=0, limit=0, filter=None):
        self._collection = MongoClient(uri)[db][collection]
        self._skip = skip
        self._limit = limit
        self._filter = filter

    def __iter__(self):
        return self.stream()

    def stream(self, conditions=None, projection=None, skip=None, limit=None):
        proj = {k: 1 for k in projection} if projection else {}
        if proj:
            proj.update({'_id': False})  # skip internal id
        return self._collection.find(conditions or self._filter,
                                     proj or None,
                                     skip=skip or self._skip,
                                     limit=limit or self._limit)

    def size(self):
        return self._collection.count() if not self._filter else self._collection.find(self._filter).count()

    @property
    def filter(self):
        return self._filter

    @filter.setter
    def filter(self, conditions):
        self._filter = conditions
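# Hedged usage sketch (assumption, not from the source): stream up to 10
# English documents from a hypothetical "test.docs" collection on a local mongod.
it = MongoIterator('mongodb://localhost:27017', 'test', 'docs', limit=10)
it.filter = {'lang': 'en'}
for doc in it:
    print(doc)
print(it.size())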
def upload(source_json, source, db_name=DB_NAME, coll_name=VERBS, drop=False, indices=(VERB, PARADIGM)):
    target = MongoClient(LOCALHOST, PORT)[db_name][coll_name]
    if drop:
        target.drop()
    print('Initially,', target.count(), 'entries')
    count = counter()
    for line in read_json_lines(source_json):
        next(count)
        line[SOURCE] = source
        target.insert(line)
    add_indices(target, indices)
    print('\nCurrently,', target.count(), 'entries')
class TvrainData:
    def __init__(self):
        """
        Just load data from Mongo.
        """
        self.sequences = MongoClient(os.environ['MONGODB_URL']).tvrain.sequences
        self.collection = MongoClient(os.environ['MONGODB_URL']).tvrain.articles
        self.collection.create_index("time")

    def get_random_articles(self, n):
        """Return n articles from a random offset, for index.html"""
        articles = self.collection.find().sort("time", 1).skip(
            random.randint(0, self.collection.count())).limit(n)
        return list(articles)

    def get_article_id(self, url):
        """Get id by url"""
        return self.collection.find_one({'url': url})['_id']

    def get_articles_data(self, articles_urls):
        """
        Get data from MongoDB for articles urls
        :param articles_urls: ['article_url', ...]
        :return: list of MongoDB documents
        """
        articles = []
        for url in articles_urls:
            articles.append(self.collection.find_one({'url': url}))
        return articles

    def iterate_articles(self, except_articles, skip=0, limit=None, query=None):
        """
        Iterate through all articles, skipping the ids in except_articles
        :param except_articles: list of ids
        :return:
        """
        if query is None:
            query = {}
        if limit is None:
            data = self.collection.find(query).skip(skip)
        else:
            data = self.collection.find(query).skip(skip).limit(limit)
        for value in data:
            if value['_id'] not in except_articles:
                yield value

    def get_sequences(self):
        """Return all sequences for train"""
        return list(self.sequences.find().limit(-1))
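# Hedged usage sketch (assumption): MONGODB_URL must point at a mongod whose
# "tvrain" database holds "articles" and "sequences" collections; the 'url'
# field is assumed from get_article_id above.
tv = TvrainData()
for article in tv.iterate_articles(except_articles=[], limit=3):
    print(article['url'])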
def merge_one(db_name):
    mydisease = MongoClient().mydisease.mydisease
    g = build_id_graph()
    db = MongoClient().mydisease[db_name]
    if db.count() == 0:
        print("Warning: {} is empty".format(db))
    for doc in db.find():
        doids = get_equiv_doid(g, doc['_id'])
        for doid in doids:
            mydisease.update_one({'_id': doid}, {'$push': {db_name: doc}}, upsert=True)
class BigramsCorpus:
    def __init__(self, db, collection):
        self.client = MongoClient()[db][collection]

    def __iter__(self):
        for doc in self.client.find():
            yield [doc['_id']]

    def __len__(self):
        return self.client.count()
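# Hedged usage sketch (not from the source): the corpus streams one-element
# lists of document ids; the db/collection names below are placeholders.
corpus = BigramsCorpus('corpus', 'bigrams')
print(len(corpus))
for bigram in corpus:
    print(bigram)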
def __init__(self, page, show_follow):
    if show_follow == 0:
        posts = MongoClient().blog.Aritical.find().sort(
            'issuing_time', DESCENDING)
        self.total = posts.count()
        self.posts = posts
    if show_follow == 1:
        self.posts = []
        following = MongoClient().blog.User.find_one({
            'username': current_user.username
        }).get('following')
        artical = MongoClient().blog.Aritical.find().sort(
            'issuing_time', DESCENDING)
        # following.append([current_user.username, 'date'])
        for i in range(len(following)):
            for x in range(artical.count()):
                if following[i][0] == artical[x].get('username'):
                    self.posts.append(artical[x])
        self.posts.sort(key=lambda x: x.get('issuing_time'), reverse=True)
        self.total = len(self.posts)
    self.pages = int(self.total / 20)
    if self.total % 20 != 0:
        self.pages += 1
    if page == 1:
        self.has_prev = False
    else:
        self.has_prev = True
    if page == self.pages:
        self.has_next = False
    else:
        self.has_next = True
    self.next_num = page + 1
    self.page = page
    self.per_page = 20
    self.prev_num = page - 1
    self.current_num = self.total - (20 * (page - 1))
    if self.current_num > 20:
        self.current_num = 20
    self.item = []
    for i in range(self.current_num):
        self.item.append(self.posts[self.prev_num * 20 + i])
class Stat:
    def __init__(self, config_file):
        self.docker = dockerGuest(config_file)
        self.config_file = config_file
        self.collection = MongoClient(os.environ['DB_PORT_27017_TCP_ADDR'],
                                      27017)['test'][self.get_collection()]

    def get_collection(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(dir_path, self.config_file)
        return json.load(open(file_path))["collection"]

    def get_config_file(self):
        return self.config_file

    @staticmethod
    def get_time():
        time = datetime.now()
        return time
        # return time.strftime('%H:%M:%S')

    def data_to_save(self):
        stats = self.docker.get_stats()
        if stats is None:
            return None
        global_stat_dict = {}
        global_stat_dict["time"] = Stat.get_time()
        global_stat_dict["stats"] = stats
        return global_stat_dict

    def save(self, cap=60):
        new_data = self.data_to_save()
        if self.collection.count() == cap:
            top_doc_time = min(doc['time'] for doc in self.collection.find())
            self.collection.delete_one({'time': top_doc_time})
        self.collection.insert_one(new_data)
        logger.info("Saved in DB...")

    def save_to_db(self):
        data = self.data_to_save()
        if data is not None:
            if self.is_db_full():
                self.make_space_db()
            logger.info('DB Save')
            self.collection.insert_one(data)

    def make_space_db(self):
        logger.info('Making space')
        self.collection.delete_one({'_id': self.collection.find()[0]['_id']})

    def is_db_full(self):
        if self.collection.find({}).count() == 60:
            return True
        return False
class GraphData(object):
    _instance = None
    _instance_lock = Lock()
    host = '101.132.40.25'
    port = 27017

    def __init__(self):
        self.table = MongoClient(host=GraphData.host,
                                 port=GraphData.port).get_database(
                                     'judging').get_collection('graph')

    def __new__(cls, *args, **kwargs):
        """
        singleton

        > Multiple `GraphData()` calls return the same instance

        :param args:
        :param kwargs:
        :return:
        """
        if GraphData._instance is None:
            with GraphData._instance_lock:
                if GraphData._instance is None:
                    GraphData._instance = object.__new__(cls)
        return GraphData._instance

    def exists(self, graph_name: str) -> bool:
        return self.table.count({'_id': graph_name}) > 0

    def save(self, graph: dict) -> bool:
        """
        **Attention: this method will override an existing graph**

        :param graph:
        :return:
        """
        self.table.save({**{'_id': graph['名称']}, **graph})
        return True

    def fetch(self, graph_name: str) -> dict:
        return self.table.find_one({'_id': graph_name})

    def get_graph_list(self) -> List[str]:
        graph_list = []
        for i in self.table.find({}, {'名称': 1, '_id': 0}):
            graph_list.append(i['名称'])
        return graph_list

    def remove_graph(self, graph_name: str):
        # pymongo's remove() treats a non-dict argument as an _id value
        self.table.remove(graph_name)
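# Hedged usage sketch (assumption): because of the __new__ override, every
# GraphData() call yields the same instance; the graph payload is made up.
a = GraphData()
b = GraphData()
assert a is b
a.save({'名称': 'demo-graph', 'nodes': []})  # '名称' ("name") is the required key
print(a.exists('demo-graph'))
print(a.fetch('demo-graph'))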
def remove_morphologically_abnormal_verbs():
    abnormal_count = 0
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count())
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            if not (verb.endswith('ω') or verb.endswith('ώ') or verb.endswith('αι')):
                coll.delete_one({VERB: verb})
                abnormal_count += 1
    print("\nRemoved {} abnormal verbs".format(abnormal_count))
def setUpDb(self, host, port, db, collection):
    try:
        mongo_host = os.environ.get(host, os.environ.get("MONGO_HOST", "localhost"))
        # Environment variables are strings; MongoClient needs an int port.
        mongo_port = int(os.environ.get(port, 27017))
        mongo_database = os.environ.get(db, "twitter_database")
        client = MongoClient(mongo_host, mongo_port)[mongo_database][collection]
        if collection == "twitter_collection-" + self.owner and client.count() == 0:
            raise Exception("There is no data in the source database: " + collection)
        return client
    except Exception as err:
        print("Error when connecting to SOURCE database: " + str(err))
        exit(2)
def print_verbs(fieldname, fltr, func=None, dbname=DB_NAME, collname=VERBS):
    match = MongoClient(LOCALHOST, PORT)[dbname][collname].find(fltr)
    total = match.count()
    if func:
        res = []
        count = counter(total)
        for entry in match:
            next(count)
            if func(entry):
                res.append(entry[fieldname])
        print("\n{} matching items".format(len(res)))
    else:
        print(total, "matching entries")
        res = [entry[fieldname] for entry in match]
    for item in res:
        print(item)
class TextIO:
    def __init__(self):
        self.db = MongoClient('localhost', 20000).get_database('chinese').get_collection('train')

    def get_mongo_size(self):
        size = self.db.count()
        # print("size: %d" % size)
        return size

    def get_text_from_mongo(self, skip=0, limit=1, isRandom=True):
        size = self.get_mongo_size()
        if isRandom:
            skip = random.randint(0, size - limit)
        cursor = self.db.find().skip(skip).limit(limit)
        for doc in cursor:
            yield doc['text']
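# Hedged usage sketch (assumption): pulls one random 'text' field from the
# chinese.train collection the class is hard-wired to (localhost:20000).
io = TextIO()
for text in io.get_text_from_mongo(limit=1, isRandom=True):
    print(text)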
def collect_duplicates():
    visited = set()
    duplicates = set()
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count())
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            if verb in visited:
                duplicates.add(verb)
            else:
                visited.add(verb)
    print("\nDumping {} duplicates".format(len(duplicates)))
    dump_utf_json(sorted(list(duplicates)), DUPLICATES_JSON)
class Mongodb:
    def __init__(self):
        self.collection = MongoClient()['db_name']['collection_name']

    def count(self):
        return self.collection.count()

    def find_page(self, pager, query=None):
        if pager.is_pre_half:
            result = list(
                self.collection.find(query).skip(pager.offset).limit(pager.page_size))
        else:
            result = list(self.collection.find(query)
                          .sort([('_id', -1)])
                          .skip(0 if pager.is_last else pager.residue)
                          .limit(min(pager.page_size,
                                     pager.residue if pager.is_last else pager.page_size)))[::-1]
        return result
def status(client, db, cell, example):
    '''
    \b
    - list cells, num_entries
    - verbose: find_one() in each cell but truncate sequence field before print
    - include .zoo metadata in the report in the future

    Example:

    \b
    zoo status --db diff --cell mock --example
    '''
    c = MongoClient(client)[db][cell]
    print(c.count(), 'documents.\n')
    if example:
        print('Example:')
        print(json.dumps(c.find_one(), indent=2))
        print()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("twitterUser", type=str, help="Twitter ID")
    parser.add_argument("-l", "--limit", type=int,
                        help="Limit of tweets to scrape; tweets are retrieved in batches of 20, default: 20",
                        default=20)
    args = parser.parse_args()

    mongo_host = os.environ.get("MONGO_HOST", "localhost")
    # Environment variables are strings; MongoClient needs an int port.
    mongo_port = int(os.environ.get("MONGO_PORT", 27017))
    mongo_database = os.environ.get("MONGO_TWITTER_DATABASE", "twitter_database")
    mongo_collection = "twitter_collection-" + args.twitterUser
    client = MongoClient(mongo_host, mongo_port)[mongo_database][mongo_collection]

    fetch({
        "twitterUser": args.twitterUser,
        "limit": args.limit,
        "mongoClient": client
    })
    # Possibly return the collection of tweets in a Python-readable format
    print("Number of tweets inserted in " + mongo_collection + ": " + str(client.count()))
class MongoKVStorage(KVStorage):
    def __init__(self, config):
        super().__init__(config)
        mongo_host = config['host']
        mongo_port = config['port']
        mongo_db_name = config['db']
        mongo_collection = config['collection']
        self._collection = MongoClient(host=mongo_host, port=mongo_port)[mongo_db_name][mongo_collection]

    def get(self, key):
        found_val = self._collection.find_one({'key': key})
        return found_val

    def set(self, key, value):
        self._collection.update_one({'key': key}, {'$set': value}, upsert=True)

    def exists(self, key):
        return self._collection.count({'key': key}) > 0
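# Hedged usage sketch (not from the source): config keys mirror __init__;
# note that set() spreads the fields of `value` via $set, so pass a dict.
store = MongoKVStorage({'host': 'localhost', 'port': 27017,
                        'db': 'kv', 'collection': 'entries'})
store.set('answer', {'value': 42})
print(store.exists('answer'))
print(store.get('answer'))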
def main():
    model = Doc2Vec.load(Settings.MODEL_PATH + "doc2vec.model")
    """test_similarity([("right", "wrong"), ("refresh", "cache")], model)"""
    reports_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.DOC2VEC_REPORTS_DATABASE][Settings.REPORTS_COLLECTION]
    duplicate_reports_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.DOC2VEC_REPORTS_DATABASE][Settings.DUPLICATE_REPORTS_COLLECTION]

    # get a random target text that has a duplicate
    index = random.randint(0, duplicate_reports_collection.count() - 1)
    txt = duplicate_reports_collection.find_one({"reId": index})["dups"][0]
    txt = normalize_text(txt)
    print "Target Text: %s" % txt
    vec = model.infer_vector(txt)

    # print most similar documents
    for reId, p in model.docvecs.most_similar([vec], topn=10):
        print "%0.3f: \"%s\"" % (p, reports_collection.find_one({"reId": reId})["text"])
import json, sys
import progressbar
from collections import defaultdict
from pymongo import MongoClient
from gazouilloire.web.export import format_csv

with open('config.json') as confile:
    conf = json.loads(confile.read())
db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

langs = defaultdict(int)
query = {}

print "Counting matching results..."
count = db.count(query)

print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"lang": 1, "_id": 0})):
    l = t.get("lang", "")
    langs[l] += 1

print "Sorting and storing csv data..."
with open("langs.csv", "w") as f:
    print >> f, "langs,count"
    bar = progressbar.ProgressBar(max_value=len(langs))
    for l, ct in bar(sorted(langs.items(), key=lambda x: -x[1])):
        print >> f, '%s,%s' % (l, ct)
if len(sys.argv) == 2:
    if '{' in sys.argv[1]:
        try:
            query = eval(sys.argv[1])
            if only_selected:
                query = {"$and": [query, {SELECTED_FIELD: True}]}
        except Exception as e:
            sys.stderr.write("WARNING: query wrongly formatted: %s\n" % sys.argv[1])
            sys.exit("%s: %s\n" % (type(e), e))
    elif os.path.exists(sys.argv[1]):
        with open(sys.argv[1]) as f:
            ids = sorted([t.get("id", t.get("_id")) for t in csv.DictReader(f)])
        if include_threads:
            ids = get_thread_ids_from_ids(ids, mongodb)
        query = {"_id": {"$in": ids}}
    else:
        query["text"] = re.compile(sys.argv[1].replace(' ', '\s+'), re.I)
elif len(sys.argv) > 2:
    query["$or"] = []
    for arg in sys.argv[1:]:
        query["$or"].append({"text": re.compile(arg.replace(' ', '\s+'), re.I)})

count = mongodb.count(query)
iterator = yield_csv(mongodb.find(query, sort=[("timestamp", 1)], limit=count),
                     extra_fields=EXTRA_FIELDS)
if verbose:
    import progressbar
    bar = progressbar.ProgressBar(max_value=count)
    iterator = bar(iterator)
for t in iterator:
    print t
parser.add_argument('--fastrun', dest='fastrun', action='store_true')
parser.add_argument('--no-fastrun', dest='fastrun', action='store_false')
parser.set_defaults(fastrun=True)
args = parser.parse_args()

log_dir = args.log_dir if args.log_dir else "./logs"
run_id = datetime.now().strftime('%Y%m%d_%H:%M')
__metadata__['run_id'] = run_id
taxon = args.taxon
fast_run = args.fastrun
coll = MongoClient(args.mongo_uri)[args.mongo_db]["mygene"]

# get metadata about sources
# this should be stored in the same db under the collection: mygene_sources
metadata_coll = MongoClient(args.mongo_uri)[args.mongo_db]["mygene_sources"]
assert metadata_coll.count() == 1
metadata = metadata_coll.find_one()

log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
if wdi_core.WDItemEngine.logger is not None:
    wdi_core.WDItemEngine.logger.handles = []
wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                    header=json.dumps(__metadata__),
                                    logger_name='gene{}'.format(taxon))

if "microbe" in taxon:
    microbe_taxa = get_all_taxa()
    taxon = taxon.replace("microbe", ','.join(map(str, microbe_taxa)))

for taxon1 in taxon.split(","):
from __future__ import division
from pymongo import MongoClient
from settings import Settings
import operator

# Initialize all collections
userscore_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TOPICS_DATABASE][Settings.USERSCORE_COLLECTION]
businessscore_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TOPICS_DATABASE][Settings.BUSINESSSCORE_COLLECTION]
reco_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TOPICS_DATABASE][Settings.RECOMMENDATION_COLLECTION]

# Go through each user and compute the top 20 businesses for each user
print userscore_collection.count()
print businessscore_collection.count()

userScoreCollection = userscore_collection.find()
bulk = reco_collection.initialize_unordered_bulk_op()
counter = 0
bulkCounter = 0
for user in userScoreCollection:
    userTopics = user["userscore"]
    # print "length of user topics " + str(len(userTopics))
    ratings = {}
    businessScoreCollection = businessscore_collection.find()
    for business in businessScoreCollection:
        businessTopics = business["businessscore"]
        # print "length of Business topics " + str(len(businessTopics))
                    type=str)
parser.add_argument('protdbcoll_name', help='MongoDB ProtDB Collection name', type=str)
parser.add_argument('--host', help='MongoDB host (mongod or mongos)', type=str)
parser.add_argument('--port', help='MongoDB port (mongod or mongos)', type=int)
args = parser.parse_args()

if args.host:
    host = args.host
else:
    host = 'localhost'
if args.port:
    port = args.port
else:
    port = 27017

ProtColl = MongoClient(host, port)[args.protdb_name][args.protdbcoll_name]
half = int(ProtColl.count() / 2)
#half = 82817736  # in indexDB / ComPIL

f = sys.stdin
for protID in f:
    protID = int(protID)
    print(protID)
    if protID <= half:
        print(protID + half)
    elif protID > half:
        print(protID - half)
import json
from pymongo import MongoClient

try:
    with open(os.path.join(os.path.dirname(__file__), '..', 'config.json')) as confile:
        conf = json.loads(confile.read())
except Exception as e:
    sys.stderr.write("ERROR: Impossible to read config.json: %s %s\n" % (type(e), e))
    exit(1)
try:
    db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['links']
except Exception as e:
    sys.stderr.write("ERROR: Could not initiate connection to MongoDB: %s %s\n" % (type(e), e))
    exit(1)

verbose = True
if len(sys.argv) > 1 and "--quiet" in sys.argv:
    sys.argv.remove("--quiet")
    verbose = False

count = db.count()
iterator = db.find()
if verbose:
    import progressbar
    bar = progressbar.ProgressBar(max_value=count)
    iterator = bar(iterator)

print "url\tresolved"
for t in iterator:
    print ('%s\t%s' % (t["_id"], t["real"])).encode('utf-8')
def get_data(table="cmnt"): ''' table: table (collection) ''' limit = request.args.get("limit", 10, type=int) page = request.args.get("page", 1, type=int) _db = MongoClient().safe_protocol[table] # data = _db.find().sort("time", -1).skip(limit * (page - 1)).limit(limit) if table == "alert": data_list = [] alerts = _db.find().sort("time", -1).skip(limit * (page - 1)).limit(limit) total = _db.count() for alert in alerts: alert = { 'time': alert.get('time'), 'protocol_type': alert.get('type'), 'message': alert.get('message') } data_list.append(alert) protocol_type = request.args.get("type") if protocol_type: data_list = list( filter(lambda x: x['protocol_type'] == protocol_type, data_list)) total = len(data_list) return {'data': data_list, 'total': total} elif table == "user": data_list = [] users = _db.find().sort("create_time", -1).skip(limit * (page - 1)).limit(limit) total = _db.count() for user in users: data_list.append({ 'user_id': user.get('_id'), 'username': user.get('name'), 'level': user.get('level'), 'create_time': user.get('create_time') }) return {'data': data_list, 'total': total} elif table == "oper": data_list = [] opers = _db.find().sort("time", -1).skip(limit * (page - 1)).limit(limit) total = _db.count() for oper in opers: oper = { 'user_id': oper.get('user_id'), 'username': oper.get('user_name'), 'time': oper.get('time'), 'protocol_type': oper.get('protocol_type'), 'oper': oper.get('oper') } data_list.append(oper) return {'data': data_list, 'total': total} elif table == "cmnt": data_list = [] cmnts = _db.find().sort("time", -1).skip(limit * (page - 1)).limit(limit) total = _db.count() for cmnt in cmnts: cmnt = { 'time': cmnt.get('time'), 'buffer': cmnt.get('buffer'), 'ip': cmnt.get('ip') } data_list.append(cmnt) return {'data': data_list, 'total': total}
class Stat:
    '''
    This class is used for
    1. Putting stat_list corresponding to a specific timestamp
    2. Saving in mongoDB
    '''
    def __init__(self, config_file):
        self.docker = dockerGuest(config_file)
        self.config_file = config_file
        self.collection = MongoClient()['test'][self.get_collection()]  # connecting to mongodb

    def get_collection(self):
        '''
        Getting the mongodb collection (table) name from the config file.
        (This method might be removed later...)
        '''
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(dir_path, self.config_file)
        return json.load(open(file_path))["collection"]

    def get_config_file(self):
        return self.config_file

    @staticmethod
    def get_time():
        '''Getting the current timestamp.

        Storing a python datetime object in mongodb. This might be changed later.
        '''
        time = datetime.now()
        return time

    def data_to_save(self):
        '''
        Formats data to be saved in mongodb.
        JSON: { time: timestamp, stats: [{container1 stat}, {container2 stat} ...] }
        '''
        stats = self.docker.get_stats()
        if stats is None:  # no stats means there are no containers
            return None
        global_stat_dict = {}
        global_stat_dict["time"] = Stat.get_time()
        global_stat_dict["stats"] = stats
        return global_stat_dict

    def save(self, cap=60):
        '''Saving in DB'''
        new_doc = self.data_to_save()
        if self.collection.count() == cap:
            # If there are already `cap` items in the db, delete the data with
            # the oldest timestamp before inserting the new item. We cannot
            # simply delete any item because insertion order in a mongodb
            # collection cannot be relied upon.
            top_doc_time = min(doc['time'] for doc in self.collection.find())
            self.collection.delete_one({'time': top_doc_time})
            logger.info("Deleted timestamp is...{}".format(top_doc_time))
        self.collection.insert_one(new_doc)
        logger.info("Saved in DB...{}".format(new_doc["time"]))

    def save_data(self):
        '''method not used'''
        data = self.data_to_save()
        if data is not None:
            if self.is_db_full():
                self.make_space_db()
            self.collection.insert_one(data)
            logger.info('Saved in DB...')
import unittest.mock
class MongodbUtil(object):
    """
    - MYSQL_PASSWD must be set in .bashrc or .bash_profile.
    """

    def __init__(self, mongo_url, db_name, collection_name, auto_connect=False):
        """
        :param mongo_url: host, port, username, password, auth db
        :param db_name: database name
        :param collection_name: collection name
        :param auto_connect: default do not connect for multiprocessing
            (http://api.mongodb.com/python/current/faq.html#using-pymongo-with-multiprocessing)
        """
        self.mongo_url = mongo_url
        self.db_name = db_name
        self.collection_name = collection_name
        self.auto_connect = auto_connect
        self.collection = MongoClient(mongo_url, socketKeepAlive=True,
                                      connect=auto_connect)[db_name][collection_name]

    def __repr__(self):
        return '%s (db_name:%s, collection_name:%s, auto_connect:%s)' % (
            StringUtil.mask_passwd_in_url(self.mongo_url), self.db_name,
            self.collection_name, self.auto_connect)

    def __str__(self):
        return self.__repr__()

    def find(self, query=None, sort=None, limit=0):
        if query is None:
            query = {}
        if sort is None:
            sort = [('_id', ASCENDING)]
        for row in self.collection.find(query, no_cursor_timeout=True).sort(sort).limit(limit):
            yield row

    def count(self, query=None):
        if query is None:
            query = {}
        # count() takes only a filter; no_cursor_timeout applies to cursors, not counts.
        return self.collection.count(query)

    def find_one(self, query: dict) -> dict:
        # find_one() returns a dict (or None), so it cannot be chained with .limit().
        return self.collection.find_one(query)

    def create_index(self, field_list=None, unique=False):
        if field_list is None:
            field_list = []
        for field in field_list:
            self.collection.create_index([(field, ASCENDING)], background=True, unique=unique)
        return

    def insert(self, row: dict):
        return self.collection.insert_one(row)

    def update_one(self, where_query: dict, update_content: dict, upsert=False):
        return self.collection.update_one(where_query, update_content, upsert=upsert)

    def update(self, where_query: dict, update_content: dict, upsert=False):
        return self.collection.update_many(where_query, update_content, upsert=upsert)

    def save(self, row):
        return self.collection.save(row)

    def delete(self, where_query: dict):
        result = self.collection.delete_one(where_query)
        if result:
            return result.deleted_count
        return 0

    def drop(self):
        return self.collection.drop()
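# Hedged usage sketch (assumption): url, db and collection names below are
# placeholders; auto_connect stays False for multiprocessing safety.
util = MongodbUtil('mongodb://localhost:27017', 'mydb', 'mycoll')
util.insert({'k': 1})
print(util.count({'k': 1}))
for row in util.find({'k': 1}, limit=5):
    print(row)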
    reviews = corpus_collection.find({'business_id': rest['_id']})
    if reviews.count() > 9:
        print i
        list.append((i, reviews.count()))
        f.write(str(i))
        f.write('\t')
        f.write(str(reviews.count()))
        f.write('\n')
print len(list)
'''
a = [4.5, 4.3, 5]
b = [3.1, 4.3, 5]
rest_rating = []
print 'Cor: '
pr = pearsonr(a, b)[0]
pr = round(pr * 100, 2)
print '%r %%' % pr
print restaurant_cursor.count()
print corpus_collection.count()
for i in range(2):
    print i
    rest = restaurant_cursor[i]
    reviews = corpus_collection.find({'business_id': rest['_id']})
    #print (reviews.count())
    #print ("Restaurant : %s" % rest['name'])
    #print ("Restaurant stars: %s" % rest['stars'])
    rest_rating.append(rest['stars'])
print (rest_rating)
import json, sys
import progressbar
from pymongo import MongoClient
from gazouilloire.web.export import format_csv

with open('config.json') as confile:
    conf = json.loads(confile.read())
db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = {}
query = {}
#query["langs"] = "fr"

print "Counting matching results..."
count = db.count(query)

print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"links": 1, "proper_links": 1})):
    for l in t.get("proper_links", t["links"]):
        if l not in urls:
            urls[l] = 0
        urls[l] += 1

print "Sorting and storing csv data..."
with open("shared_urls.csv", "w") as f:
    print >> f, "url,shares"
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key=lambda x: -x[1])):
        print >> f, '%s,%s' % (format_csv(link), shares)
    elif os.path.exists(sys.argv[1]):
        with open(sys.argv[1]) as f:
            ids = sorted([t.get("id", t.get("_id")) for t in csv.DictReader(f)])
        if include_threads:
            ids = get_thread_ids_from_ids(ids, mongodb)
        query = {"_id": {"$in": ids}}
    else:
        query["text"] = re.compile(sys.argv[1].replace(' ', '\s+'), re.I)
elif len(sys.argv) > 2:
    query["$or"] = []
    for arg in sys.argv[1:]:
        query["$or"].append({"text": re.compile(arg.replace(' ', '\s+'), re.I)})

if limit:
    total = limit
elif count and verbose:
    total = mongodb.count(query)
else:
    total = mongodb.count()

iterator = yield_csv(mongodb.find(query, sort=[("timestamp", 1)], limit=total),
                     extra_fields=EXTRA_FIELDS)
if verbose:
    import progressbar
    bar = progressbar.ProgressBar(max_value=total)
    iterator = bar(iterator)
for t in iterator:
    print t
    with open(os.path.join(os.path.dirname(__file__), '..', 'config.json')) as confile:
        conf = json.loads(confile.read())
except Exception as e:
    sys.stderr.write("ERROR: Impossible to read config.json: %s %s" % (type(e), e))
    exit(1)
try:
    mongodb = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']
except Exception as e:
    sys.stderr.write("ERROR: Could not initiate connection to MongoDB: %s %s" % (type(e), e))
    exit(1)

verbose = True
if len(sys.argv) > 1 and "--quiet" in sys.argv:
    sys.argv.remove("--quiet")
    verbose = False

print "id"
iterator = mongodb.find(projection=["_id"])
if verbose:
    import progressbar
    count = mongodb.count()
    bar = progressbar.ProgressBar(max_value=count)
    iterator = bar(iterator)
for t in iterator:
    print t["_id"]
from pymongo import MongoClient
import time

# Create the client once instead of opening a new connection on every loop.
client = MongoClient()['spider']['1024']
while True:
    print(client.count())
    time.sleep(2)
import os
import json
from random import randint

from flask import Flask, jsonify, request

from fighter import Arena, Fighter
from pymongo import MongoClient

app = Flask(__name__)
arena = Arena(8)
repos = MongoClient().githunt.repos
repos_count = repos.count()
fights = MongoClient().gitfighter.fights


def get_full_name(repo):
    return '%s/%s' % (repo['owner']['login'], repo['name'])


@app.route("/arena")
def arena_json():
    print(arena.json)
    return jsonify(arena.json)


@app.route("/fight", methods=['POST'])
def fight():
    fighters_stats = request.json
    print fighters_stats
    arena.set_fighters(fighters_stats)
    arena.start()
    fight = {"log": arena.log}
    response = jsonify(fight)