from pymongo import MongoClient


class MongoCorpus(SimpleCorpus):
    """
    Corpus wrapper around a MongoDB collection.

    Subset the corpus by setting a query. If "aggregate" is given it overrides
    "query"; in that case do any filtering with a "$match" stage inside the
    aggregation pipeline.
    """

    def __init__(self, db, collection, aggregate=None, query=None):
        # Avoid mutable default arguments.
        self.client = MongoClient()[db][collection]
        self.aggregate_arg = aggregate if aggregate is not None else []
        self.find_arg = query if query is not None else {}

    def __iter__(self):
        """Each yielded doc is a dictionary: filter the right key to feed only the document text."""
        collection = self.client.find(self.find_arg, no_cursor_timeout=True) \
            if len(self.aggregate_arg) == 0 \
            else self.client.aggregate(self.aggregate_arg)
        for doc in collection:
            yield doc
        collection.close()

    def __len__(self):
        if len(self.aggregate_arg) == 0:
            # Cursor.count() is deprecated; count on the collection instead.
            return self.client.count_documents(self.find_arg)
        d = next(self.client.aggregate(
            self.aggregate_arg + [{"$group": {"_id": "null", "count": {"$sum": 1}}}]))
        return d['count']
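# A minimal usage sketch for MongoCorpus, assuming a local mongod and a
# hypothetical "newsdb.articles" collection with "lang" and "text" fields
# (those names are illustrative, not from the source).
def _mongo_corpus_example():
    corpus = MongoCorpus("newsdb", "articles", query={"lang": "en"})
    print(len(corpus))
    for doc in corpus:
        print(doc.get("text", ""))

    # Equivalent subsetting via an aggregation pipeline ("query" is ignored):
    corpus = MongoCorpus("newsdb", "articles",
                         aggregate=[{"$match": {"lang": "en"}}])
    return corpus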
import random

from pymongo import MongoClient


class RemoteIO:

    def __init__(self):
        time_counter(print_to_console=False)
        print("Initializing RemoteIO")
        self.db = MongoClient('192.168.68.11', 20000).get_database(
            "tokenizer_qiao").get_collection('splited_sentences')
        # Cursor.count() is deprecated; count on the collection instead.
        self.sentence_size = self.db.count_documents({})
        self.step = self.sentence_size
        self.skip = 0
        time_counter("initialization finished")

    def read_sentence_randomly(self):
        while self.skip + self.step >= self.sentence_size:
            print("skip:%d, step:%d, size:%d" %
                  (self.skip, self.step, self.sentence_size))
            if self.step == 0:
                return None
            self.skip = 0
            self.step = int(self.step / 2)
        if self.step + self.skip < self.sentence_size:
            random_step = random.randint(0, self.step)
            # print("fetching skip:%d" % (self.skip + random_step))
            pipeline = [{"$skip": self.skip + random_step}, {"$limit": 1}]
            self.skip += random_step
            docs = list(self.db.aggregate(pipeline))
            doc = docs[0] if len(docs) > 0 else None
            if doc is not None:
                # Collection.update() is deprecated; use update_one().
                self.db.update_one({"_id": doc["_id"]}, {"$inc": {"analysed": 1}})
            time_counter("sentence fetched")
            return doc
        else:
            return None

    def read_sentence_from_remote(self):
        db = self.db
        return db.find()
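# Rough usage sketch for RemoteIO, assuming the MongoDB host above is reachable
# and that time_counter() is the project's own timing helper; the "sentence"
# field name below is an assumption, not taken from the source.
def _remote_io_example():
    io = RemoteIO()
    doc = io.read_sentence_randomly()  # a random, not-yet-exhausted document, or None
    if doc is not None:
        print(doc.get("sentence"))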
from pymongo import MongoClient


class Count(object):
    """Count documents per (code, year) for a given code field."""

    def __init__(self, code, year=None, type=None, factor=None):
        self.code = code
        self.year = year
        self.type = type
        self.factor = factor
        self.collection = MongoClient()["db5"]["values"]

    def aggregation(self):
        query = [{
            "$match": {
                "key": self.code,
                "year": self.year,
                "type": self.type
            }
        }, {
            "$group": {
                "_id": {"code": "$key", "year": "$year"},
                "total": {"$sum": 1}
            }
        }]
        result = self.collection.aggregate(pipeline=query)
        return result

    def find(self):
        pass
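# Minimal usage sketch for Count: the "db5.values" collection and its
# "key"/"year"/"type" fields come from the class above, while the concrete
# argument values are purely illustrative.
def _count_example():
    counter = Count(code="GDP", year=2020, type="annual")
    for row in counter.aggregation():
        print(row["_id"]["code"], row["_id"]["year"], row["total"])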
from pymongo import MongoClient


class MongoDbCardStorage(BaseCardStorage):

    def __init__(self, database, **kwargs):
        self._cards = MongoClient(**kwargs)[database].cards

    def sample(self, k):
        # Draw k random cards and drop the _id field from the result.
        return list(self._cards.aggregate([
            {'$sample': {'size': k}},
            {'$project': {'_id': 0}},
        ]))

    def insert(self, card):
        return self._cards.insert_one(card)

    def lookup(self, filter, projection=None):
        if projection is None:
            projection = {}
        projection['_id'] = False
        result = self._cards.find_one(filter, projection=projection)
        return result
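# Usage sketch for MongoDbCardStorage, assuming a reachable mongod; "deck" and
# the card fields are illustrative names, not from the source.
def _card_storage_example():
    storage = MongoDbCardStorage("deck", host="localhost", port=27017)
    storage.insert({"front": "2 + 2", "back": "4"})
    hand = storage.sample(3)                    # 3 random cards, _id projected away
    card = storage.lookup({"front": "2 + 2"})   # _id excluded by default
    return hand, card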
import json

from bson import ObjectId
from flask import abort, current_app, g
from pymongo import MongoClient


def check_perms_for_patch(resource, request, lookup):
    """
    Check the object in the database to ensure that the user has permission to
    modify the existing data. Does not check the user's permissions for new
    data - use pre-insert methods to check new data.
    """
    rsc = resource[:-6]  # take off "_write" from the end of the string
    current_app.logger.info(
        'Checking permissions for user {} to update {} object.'.format(
            g.username, rsc))
    oid = request.url.split('{}/'.format(resource))[1]
    updates = json.loads(request.data)

    pipeline = []
    pipeline.append({"$match": {"_id": ObjectId(oid=oid)}})
    pipeline = redact_field('', pipeline)  # Security at the top level
    sec_enabled_fields = get_security_enabled_fields(rsc)
    for key in updates.keys():
        if key in sec_enabled_fields:
            pipeline = redact_field(key, pipeline)

    # Get object from DB, confirm user has permissions to update the fields in the PATCH
    coll = MongoClient(current_app.config.get('MONGO_HOST'),
                       27017)[current_app.config.get('MONGO_DBNAME')][rsc]
    agg_result = list(coll.aggregate(pipeline))
    if len(agg_result) > 0:
        agg_result = agg_result[0]
    else:
        agg_result = {}

    # Check document level (default to empty lists so a missing key cannot raise)
    if "false" in agg_result.get('cat_matches', []) or \
            "false" in agg_result.get('diss_matches', []):
        current_app.logger.info(
            'User {} has insufficient permissions to modify data in the {} object'
            .format(g.username, rsc))
        abort(403)

    # Check field level for all requested fields
    for key in updates.keys():
        val = agg_result.get(key)
        if type(val) == dict:
            if "false" in val.get('cat_matches', []):
                current_app.logger.info(
                    'User {} has insufficient permissions to modify data in the {} object'
                    .format(g.username, rsc))
                abort(403)
            if "false" in val.get('diss_matches', []):
                current_app.logger.info(
                    'User {} has insufficient permissions to modify data in the {} object'
                    .format(g.username, rsc))
                abort(403)

    current_app.logger.info(
        'User {} has sufficient permissions to modify data in the {} object.'.format(
            g.username, rsc))
import random

from pymongo import MongoClient


class Database:

    def __init__(self):
        self.phrases = MongoClient(
            config.environ['database_clear']).bots.phrases

    def answer(self, text):
        answers = []
        for word in text.split():
            # For each word, pick one random phrase whose question matches it.
            answers += self.phrases.aggregate([
                {'$match': {'question': {'$regex': word}}},
                {'$sample': {'size': 1}},
            ])
        if not answers:
            return 'Не понимаю тебя'  # "I don't understand you"
        return random.choice(list(answers))['message']

    @staticmethod
    def is_triggered(text):
        if random.randint(1, 100) == 1:
            return True
        for name in {'loshadkin', 'пасюк', 'лошадкин'}:
            if name in text.lower():
                return True
        return False

    def process_message(self, m):
        try:
            # Next id = highest existing _id + 1. In pymongo, sort() takes a key
            # and a direction, and the cursor has to be advanced with next().
            last = self.phrases.find({}).sort('_id', -1).limit(1)
            _id = next(last)['_id'] + 1
            question = m.reply_to_message.text if m.reply_to_message else m.text
            doc = {
                '_class': 'Phrase',
                '_id': _id,
                'question': question,
                'message': m.text
            }
            self.phrases.insert_one(doc)
        except Exception:
            pass
import pickle

from pymongo import MongoClient


def main():
    coll = MongoClient().db.taxi11
    # Pick 500 random trips and dump the trips that started nearby each one.
    agg = coll.aggregate([{'$sample': {'size': 500}}])
    i = 0
    for sample in agg:
        # print sample
        lat = sample['pickupLatitude']
        lon = sample['pickupLongitude']
        print lat
        print lon
        res = coll.find({
            'pickupLatitude': {'$gt': lat - 0.0001, '$lt': lat + 0.0001},
            'pickupLongitude': {'$gt': lon - 0.0001, '$lt': lon + 0.0001}
        })
        print res.count()
        if res.count() >= 10:
            file = open('data2/dat{}'.format(i), 'w')
            l = []
            bf = False
            for r in res:
                if r['pickupLatitude'] < 30 or r['pickupLongitude'] > -60 or \
                        r['dropoffLatitude'] < 30 or r['dropoffLongitude'] > -60:
                    bf = True
                    break
                l.append(r)
            if bf:
                print 'break'
                file.close()
                continue
            pickle.dump(l, file)
            i += 1
            file.close()
from pymongo import MongoClient


class DatabaseController:
    """
    Wraps a MongoDB collection of crowdfunding posts, with methods to upsert
    new posts and to aggregate information from the collection.
    """

    def __init__(self):
        self.post = MongoClient('localhost', 27017).client.fundingplatforms.posts

    def update(self, args):
        new_posts = [
            {
                'platform': arg[0],
                'title': arg[1],
                'summary': arg[2],
                'link': arg[3],
                'raised': arg[4],
                'pct_raised': arg[5],
                'days_remain': arg[6]
            }
            for arg in args
        ]
        for doc in new_posts:
            self.post.update_one({'title': doc['title']}, {'$set': doc}, upsert=True)

    def raised(self, group_by=None, min_days=10):
        if group_by is None:
            pipeline = [
                {"$match": {"days_remain": {"$gte": min_days}}},
                {"$group": {"_id": None, "total": {"$sum": "$raised"}}}
            ]
        elif group_by == 'platform':
            pipeline = [
                {"$match": {"days_remain": {"$gte": min_days}}},
                {"$group": {"_id": "$platform", "total": {"$sum": "$raised"}}}
            ]
        else:
            raise ValueError("group_by must be None or 'platform'")
        agg = list(self.post.aggregate(pipeline))
        return {doc['_id']: doc['total'] for doc in agg}
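# Usage sketch for DatabaseController; the tuple layout follows update() above
# (platform, title, summary, link, raised, pct_raised, days_remain) and the
# values are made up.
def _funding_example():
    ctrl = DatabaseController()
    ctrl.update([("Kickstarter", "Solar lamp", "A lamp", "http://example.com",
                  1200.0, 0.6, 14)])
    total = ctrl.raised()                        # {None: total raised across platforms}
    per_platform = ctrl.raised(group_by='platform')
    return total, per_platform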
def __init__(self, server: str, db: str, collection: str, text_field, **kwargs):
    fields = {'text': ('text', text_field)}
    col = MongoClient(f'mongodb://{server}/')[db][collection]
    cursor_text = col.aggregate([
        {'$unwind': '$text'},
        {'$project': {'text': 1, '_id': 0}},
    ])
    examples = [make_example(i, fields) for i in cursor_text]
    if isinstance(fields, dict):
        fields, field_dict = [], fields
        for field in field_dict.values():
            if isinstance(field, list):
                fields.extend(field)
            else:
                fields.append(field)
    super(Yelp2019, self).__init__(examples, fields, **kwargs)
    self.max_len = max([len(f.text) for f in self.examples])
import json

from pymongo import MongoClient, errors


class Database:

    def __init__(self):
        self.connect_url = "mongodb://{}:{}@{}:{}/".format(
            USERNAME, PASSWORD, MONGODB_HOST, MONGODB_PORT)
        self.client = None

    def connect(self):
        try:
            self.client = MongoClient(self.connect_url)
            self.client = self.client[DATABASE_NAME]    # Selecting DB
            self.client = self.client[COLLECTION_NAME]  # Selecting Collection
            return [True, "Success"]
        except errors.ServerSelectionTimeoutError:
            return [False, "Failed to Connect DB"]
        except errors.ConfigurationError:
            return [False, "Configuration Error"]
        except errors.ConnectionFailure:
            return [False, "Connection Failure"]

    def list_documents(self):
        try:
            cursor = self.client.find({}, {'_id': False})
            return [True, "Success", cursor]
        except Exception:
            return [False, "Internal Error"]

    def create_document(self, document):
        try:
            self.client.insert_one(document)
            return [True, "Success"]
        except errors.DuplicateKeyError:
            return [False, "The config name already exists"]

    def get_document(self, doc):
        try:
            cursor = self.client.find_one({"name": doc}, {'_id': False})
            if cursor is not None:
                return [True, "Success", cursor]
            else:
                return [False, "No document found"]
        except Exception:
            return [False, "Internal Error"]

    def update_document(self, config_name, data):
        try:
            result = self.client.replace_one({"name": config_name}, {
                "name": config_name,
                "data": data
            })
            if result.acknowledged:
                return [True, "Success", result]
            else:
                raise Exception
        except Exception:
            return [False, "Internal Error"]

    def purge_document(self, document):
        try:
            cursor = self.client.delete_one({"name": document})
            return [True, "Success", cursor.deleted_count]
        except Exception:
            return [False, "Internal Error"]

    def query(self, key, val, config_name=None):
        if config_name is None:
            query = '[{{"$match": {{}}}}, {{"$unwind":"$data"}}, {{"$match":{{"data.{0}": "{1}"}}}}]'
            replaced_query = json.loads(query.format(key, val))
        else:
            query = '[{{"$match": {{"name": "{0}"}}}}, {{"$unwind":"$data"}}, {{"$match":{{"data.{1}": "{2}"}}}}]'
            replaced_query = json.loads(query.format(config_name, key, val))
        try:
            cursor = self.client.aggregate(replaced_query)
            if cursor is not None:
                return [True, "Success", cursor]
            else:
                return [False, "No document found"]
        except Exception as e:
            return [False, str(e)]
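# Usage sketch for Database.query(): the method builds the aggregation pipeline
# as a JSON string and parses it with json.loads, so key and val must be plain
# strings. Assumes connect() succeeded and documents look like
# {"name": ..., "data": [{...}, ...]}; the concrete names here are illustrative.
def _config_query_example():
    db = Database()
    ok, msg = db.connect()
    if ok:
        ok, msg, cursor = db.query("env", "prod", config_name="nginx")
        for doc in cursor:
            print(doc)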
        '$gte': start_num,
        '$lt': end_num
    },
    "shop.name": {'$not': regex},
    'source': {'$ne': 'lazycat'}
})
pool.map(move_express, [e_cursor])

# ==> migrate expr_trace
pipe = [{
    "$match": {
        "expr_num": {"$gte": start_num, '$lt': end_num}
    }
}, {
    "$group": {
        '_id': '$expr_num',
        "expr_num": {'$first': '$expr_num'}
    }
}]
t_cursor = mc_trace_old.aggregate(pipeline=pipe)
pool.map(move_trace, [t_cursor])
import re


def dta_select_parser(in_file, small=False, get_tax=False, check_peptides=False,
                      get_hashes=False, return_reverse=True, taxDB=None, protDB=None):
    """
    Steps through and parses a DTASelect-filter.txt file (generator function).

    :param in_file: path to DTASelect-filter.txt file
    :param small: get rid of keys: 'loci' and 'peptides'
    :param get_tax: look up taxonomy information for each protDB ID
    :param check_peptides: check that all peptide sequences are in all protein sequences in 'forward_loci'
    :param get_hashes:
    :param return_reverse: include reverse loci
    :param taxDB: mongo collection for the taxDB (containing protdbID -> taxid mapping) or None
    :param protDB: mongo collection for protDB (protdbID information). If protDB is given,
                   a better protein['name'] will be attempted
    :type in_file: str

    get_forward_loci is removed. noProtDB is removed. If the locus contains a "||",
    the part before the pipes is taken to be the protID.

    protein is a dict with (possible) fields:
        loci: list of locus dicts, described below
        peptides: list of peptide dicts, described below
        reverse: boolean. True if all loci for a protein are reverse
        peptide_seq: set. All peptide amino acid sequences
        forward_loci: list. 'Locus' fields in loci for forward loci
        all_loci: list. 'Locus' fields in loci
        name: string. The "representative" locus (the largest one by AA length)
        tax_id: list. Unique list of taxonomy IDs for all forward loci
        lca: int or None. Lowest common ancestor of tax_ids
        hashes: list. MD5sums of protein sequences for forward_loci.
                len(hashes) gives the number of unique proteins matching

    loci: dict parsed from Loci lines in file. Mostly unchanged.
        fields from file: Locus, Sequence Count, Spectrum Count, Sequence Coverage,
            Length, MolWt, pI, Validation Status, NSAF, EMPAI, Descriptive Name
        fields added:
            reverse: boolean. True if `Locus` starts with "Reverse_"
            description: part after '||', if it exists

    peptides: dict parsed from peptide lines in file. Mostly unchanged.
        fields from file: Unique, FileName, XCorr, DeltCN, Conf%, M+H+, CalcM+H+,
            TotalIntensity, SpR, SpScore, IonProportion, Redundancy, Sequence
        fields added:
            aa_sequence: `Sequence` with the left and right flanking residues stripped
            is_modified: boolean. True if the peptide has PTMs
            unmod_peptide: peptide sequence without PTMs
            diff_mass: mass difference of PTMs
            mods: list of tuples: (AA (amino acid that is modified),
                pos (1-based position within peptide), mass (mass of this PTM))
            lc_step, scan, charge_state: parsed from `FileName`
    """

    def per_to_float(x):
        # '50%' -> 0.50
        return float(x[:-1]) / 100

    if get_tax:
        from ..analysis import taxonomy
        from pymongo import MongoClient
        if not taxDB:
            taxDB = MongoClient('wl-cmadmin.scripps.edu', 27017).TaxDB_072114.TaxDB_072114
        t = taxonomy.Taxonomy(host='wl-cmadmin.scripps.edu', port=27017)
    if check_peptides:
        protDB = MongoClient('wl-cmadmin.scripps.edu', 27018).ProtDB_072114.ProtDB_072114
    if get_hashes:
        client = MongoClient('wl-cmadmin.scripps.edu', 27017)
        redunDB = client.redunDB.redunDB

    locus_types = [str, int, int, per_to_float, int, int, float, str, float, float, str]
    peptide_types = [str, str, float, float, float, float, float, float, int, float,
                     float, int, str]
    peptide_types_ppm = [str, str, float, float, float, float, float, float, float,
                         int, float, float, int, str]

    with open(in_file) as f:
        line = next(f)  # Skip header.
        # TODO: parse it
        while not line.startswith('Locus'):
            line = next(f)
            if "--dm" in line:
                peptide_types = peptide_types_ppm
        # Read locus header
        locus_columns = line.strip().split('\t')
        # Read peptide header
        line = next(f)
        peptide_columns = line.rstrip().split('\t')
        line = next(f)

        # Read the rest of the file
        while line != '\tProteins\tPeptide IDs\tSpectra\n':
            protein = dict()
            # Read loci for a protein
            loci = []
            while not line.startswith('\t') and not line.startswith('*'):
                # While it starts with a number or "Reverse_"
                try:
                    loci.append(dict(zip(
                        locus_columns,
                        [x(y) for x, y in zip(locus_types, line.strip().split('\t'))])))
                except Exception:
                    print("parsing line failed: " + line)
                line = next(f)
            # Read peptides
            peptides = []
            while line[0] in ['\t', '*'] and line != '\tProteins\tPeptide IDs\tSpectra\n':
                peptides.append(dict(zip(
                    peptide_columns,
                    [x(y) for x, y in zip(peptide_types, line.rstrip().split('\t'))])))
                line = next(f)
            protein['peptides'] = peptides
            protein['loci'] = loci

            # Parse out reverse loci
            for l in protein['loci']:
                if l['Locus'].startswith('Reverse_'):
                    l['reverse'] = True
                    l['Locus'] = l['Locus'][8:]
                else:
                    l['reverse'] = False
                if l['Locus'].count("||"):
                    locus_split = l['Locus'].split('||')
                    l['description'] = '||'.join(locus_split[1:])
                    l['Locus'] = int(locus_split[0])
                else:
                    l['description'] = l['Locus']
                    l['Locus'] = int(l['Locus'])

            # Are all loci for a protein reverse?
            ## - shouldn't this logic be changed to: 'are any loci reverse?' and if so, set 'Reverse' to True?
            ##   (because then we can't distinguish this peptide match from a fictional protein from a real protein)
            ## - There aren't typically many overlapping peptides between forward and reverse proteins anyway (~1%)
            if all(x['reverse'] for x in protein['loci']):
                protein['reverse'] = True
            else:
                protein['reverse'] = False

            if not return_reverse and protein['reverse']:
                # skip this one if we are skipping reverse loci
                continue

            # Pull out peptide sequences
            for p in protein['peptides']:
                p['aa_sequence'] = re.findall(r'\.(.*)\.', p['Sequence'])[0]
                p['is_modified'] = True if ')' in p['aa_sequence'] else False
                if p['is_modified']:
                    seq, mods = get_unmod_seq(p['aa_sequence'])
                    p.update(mods)
                else:
                    p['unmod_peptide'] = p['aa_sequence']
                # MudPIT salt step (chromatography method from Xcalibur)
                p['lc_step'] = get_lcstep(p['FileName'])
                # Scan number from instrument (unique per salt step) - from MS2 / SQT file
                p['scan'] = int(p['FileName'].split('.')[1])
                # predicted ion charge from instrument - from MS2 / SQT file
                p['charge_state'] = int(p['FileName'].split('.')[3])
                # To try to not break things
                p['LCStep'] = p['lc_step']
                p['Scan'] = p['scan']
                p['ChargeState'] = p['charge_state']
                p['AA_Sequence'] = p['aa_sequence']
                p['isModified'] = p['is_modified']

            protein['peptide_seq'] = list(set(x['aa_sequence'] for x in protein['peptides']))
            protein['unmod_peptide_seq'] = list(set(x['unmod_peptide'] for x in protein['peptides']))
            protein['quantification'] = sum(x['Redundancy'] for x in protein['peptides'])
            protein['forward_loci'] = [l['Locus'] for l in protein['loci'] if not l['reverse']]
            protein['all_loci'] = [l['Locus'] for l in protein['loci']]

            def is_good_db(s):
                s = s.lower()
                return "refseq" in s or "uniprot" in s or s == "hmp_reference_genomes"

            # get a "representative" locus
            if protDB:
                # pick the largest protein in one of dbs: ['RefSeq', 'UniProt*', 'HMP_Reference_Genomes']
                p_result = [x for x in protDB.find({'_id': {'$in': protein['forward_loci']}})
                            if is_good_db(x['r'])]
                if p_result:
                    protein['name'] = max([(len(p['s']), p['d']) for p in p_result],
                                          key=lambda x: x[0])[1]
            if 'name' not in protein:
                # the largest one in any db
                max_length = max(l['Length'] for l in protein['loci'])
                protein['name'] = [l['description'] for l in protein['loci']
                                   if l['Length'] == max_length][0]

            if get_tax:
                # get all possible taxIDs
                protDB_ids = protein['forward_loci']
                assert all(isinstance(x, int) for x in protDB_ids)
                taxIDs_doc = list(taxDB.aggregate(
                    [{'$match': {'_id': {'$in': protDB_ids}}},
                     {'$group': {'_id': None, 'taxid': {'$addToSet': '$taxid'}}}]))
                if taxIDs_doc:
                    protein['tax_id'] = [x for x in taxIDs_doc[0]['taxid'] if x]
                    protein['lca'] = t.LCA(taxIDs_doc[0]['taxid'])
                else:
                    protein['tax_id'] = []
                    protein['lca'] = None
                # To try to not break things
                protein['LCA'] = protein['lca']

            if check_peptides:
                # Are all peptides found within the fasta sequences for all possible forward_loci?
                # Skip reverse loci. May want to change this to use all loci, regardless of forward or reverse,
                # to avoid some proteins not having these entries.
                # Keeping like this for now to keep compatibility with get_forward_loci lookup
                if protein['forward_loci']:
                    protein['protDB'] = list(protDB.find({'_id': {'$in': protein['forward_loci']}}))
                    defline, seq = zip(*[(x['d'], x['s']) for x in protein['protDB']])
                    protein['all_peptides_in_proteins'] = all(
                        all(p in s for p in protein['peptide_seq']) for s in seq)
                    if not protein['all_peptides_in_proteins']:
                        print('not all peptides in proteins' + str(protein['forward_loci'][0]))

            if get_hashes:
                protein['hashes'] = [x['_id'] for x in
                                     redunDB.find({'pID': {'$in': protein['forward_loci']}})]

            if small:
                protein['descriptions'] = [l['description'] for l in protein['loci']]
                del protein['loci']
                del protein['peptides']

            yield protein
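# Usage sketch for dta_select_parser: it is a generator, so proteins stream one
# dict at a time; the file path below is illustrative.
def _dta_select_example():
    for protein in dta_select_parser('DTASelect-filter.txt', small=True):
        print(protein['name'], protein['quantification'], protein['forward_loci'])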
# 3rd: query mongodb cluster
# sql = "select * from `analytics.1` limit 2"
#
# pipeline = [
#     {
#         '$sql': {
#             'statement': sql,
#             'format': "jdbc",
#             'dialect': "mysql",
#         }
#     }
# ]
#
# r = conn.aggregate(pipeline)
# pprint(list(r))

# 4th: query S3 via atlas data lake
sql = "select * from `clickstream` limit 2"
pipeline = [{
    '$sql': {
        'statement': sql,
        'format': "jdbc",
        'dialect': "mysql",
    }
}]
r = conn.aggregate(pipeline)
pprint(list(r))
from pymongo import MongoClient
import pprint
from pymongo import ASCENDING

db = MongoClient().get_database("DATA").get_collection("Twitter_Breixt_9month")

# Group tweets by their "id" field and collect the _ids of duplicates.
cur = db.aggregate([{
    "$group": {
        "_id": {"id": "$id"},
        "uniqueIds": {"$addToSet": "$_id"},
        "count": {"$sum": 1}
    }
}, {
    "$match": {"count": {"$gt": 1}}
}], allowDiskUse=True)

duplicateIds = list(cur)
pprint.pprint(duplicateIds)
raw_input("Any button to remove")

for doc in duplicateIds:
    # Keep the first _id of each group and delete the rest.
    index = 1
    print doc["uniqueIds"]
    while index < len(doc["uniqueIds"]):
        db.delete_one({"_id": doc["uniqueIds"][index]})
        index += 1
        print index
    print

print db.create_index([("id", ASCENDING)], unique=True)
print "Done"
    }
}, {
    "$match": {"numPublications": {"$gt": 1}}
}, {
    "$sort": {"numPublications": 1}
}, {
    "$limit": 2
}]
pprint.pprint(list(article.aggregate(pipeline_abstracts)))

# journal ranking by number of publications
print("Nombre de publications par journal")
pipeline_journal = [{
    "$group": {
        "_id": {"journal": "$journal"},
        "numPublications": {"$sum": 1}
    }
}, {
    "$match": {
indices_file = "indices_" + brexit + ".json" graph = "g_" + brexit + ".graphml" if False: limit = 2000 hashtag_key = Status.SCHEMA_MAP[schema_id]["hashtags"] top_user_query = [ {"$match": {Status.SCHEMA_MAP[schema_id]["retweeted_status"]: {"$exists": False}, "lang": "en"}}, {"$unwind": "$" + hashtag_key}, {"$group": {"_id": {"$toLower": '$' + hashtag_key + '.' + 'text'}, "count": {"$sum": 1}}}, {"$sort": {"count": -1}}, {"$limit": limit}] top_hastags = db_col.aggregate(top_user_query, allowDiskUse=True) a = list(top_hastags) print a with open(top_hashtag_file, "w") as f: json.dump(a, f) elif False: top_hasthag_count = json.load(open(top_hashtag_file)) top_hasthag_count = sorted(top_hasthag_count, key=lambda x: x["count"], reverse=True) top_hashtags = set([i["_id"] for i in top_hasthag_count if i["_id"]]) # not in ["ivoted", "brexit"]]) print top_hasthag_count print top_hashtags n = len(top_hashtags) coocurences_array = np.zeros([n,n], dtype=int)
'''The first argument is the name of the field that gets displayed. The second
is the name of the field taken from the incoming collection.'''
doc.add('year', 'year')
doc.add('make', 'make')
group1 = GroupDocument(doc)
group1.addsum('numberOfProducedModels', 1)

doc2 = Document()
doc2.add('year', '_id.year')
doc2.add('numberOfProducedModels', 'numberOfProducedModels')
group2 = GroupDocument(doc2)
group2.addpush('makers', '_id.make')

sort = SortDocument()
limit = LimitDocument(10)
sort.addfield('_id.numberOfProducedModels')
out = OutDocument('outcollpycharmsortedlimit')

agg.append(group1)
agg.append(group2)
agg.append(limit)
agg.append(sort)
agg.append(out)

print agg
print json.dumps(coll.aggregate(agg), indent=4)
'''
from pymongo import MongoClient

if __name__ == '__main__':
    c = MongoClient('mongodb://localhost:27017').logs.nginx
    print(f'{c.count_documents({})} logs')
    print('Methods:')
    print(f'\tmethod GET: {c.count_documents({"method": "GET"})}')
    print(f'\tmethod POST: {c.count_documents({"method": "POST"})}')
    print(f'\tmethod PUT: {c.count_documents({"method": "PUT"})}')
    print(f'\tmethod PATCH: {c.count_documents({"method": "PATCH"})}')
    print(f'\tmethod DELETE: {c.count_documents({"method": "DELETE"})}')
    print(f'{c.count_documents({"path": "/status"})} status check')
    print('IPs:')
    ips = c.aggregate([
        {'$group': {'_id': '$ip', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
        {'$limit': 10},
    ])
    for ip in ips:
        print(f'\t{ip.get("_id")}: {ip.get("count")}')
tomorrow = tomorrow - dt.timedelta(days=1)
today = tomorrow - dt.timedelta(days=1)
name = f'news/{today.year}-{0 if today.month < 10 else ""}{today.month}-{0 if today.day < 10 else ""}{today.day}.npy'
if not os.path.exists(name):
    print(today, tomorrow)
    # ms = messages.find({'_date': {'$gte': today, '$lt': tomorrow},
    #                     'elmo': {'$exists': True}})
    ms = messages.aggregate([
        {
            '$match': {
                '_date': {'$gte': today, '$lt': tomorrow},
                'elmo': {'$exists': True}
            }
        },
        # {'$match': {'elmo': {'$exists': True}}},
        {'$sample': {'size': 64}}
    ])
    elmo = [pickle.loads(m['elmo']) for m in ms]
    if not elmo:
        continue
    elmo = np.stack(elmo).astype(np.float32)
    np.save(name, elmo)

# t0 = time.time()
import re


def dta_select_parser(in_file, small=False, get_tax=False, check_peptides=False,
                      get_hashes=False, return_reverse=True, taxDB=None, protDB=None):
    """
    Steps through and parses a DTASelect-filter.txt file (generator function).

    :param in_file: path to DTASelect-filter.txt file
    :param small: get rid of keys: 'loci' and 'peptides'
    :param get_tax: look up taxonomy information for each protDB ID
    :param check_peptides: check that all peptide sequences are in all protein sequences in 'forward_loci'
    :param get_hashes:
    :param return_reverse: include reverse loci
    :param taxDB: mongo collection for the taxDB (containing protdbID -> taxid mapping) or None
    :param protDB: mongo collection for protDB (protdbID information). If protDB is given,
                   a better protein['name'] will be attempted
    :type in_file: str

    get_forward_loci is removed. noProtDB is removed. If the locus contains a "||",
    the part before the pipes is taken to be the protID.

    protein is a dict with (possible) fields:
        loci: list of locus dicts, described below
        peptides: list of peptide dicts, described below
        reverse: boolean. True if all loci for a protein are reverse
        peptide_seq: set. All peptide amino acid sequences
        forward_loci: list. 'Locus' fields in loci for forward loci
        all_loci: list. 'Locus' fields in loci
        name: string. The "representative" locus (the largest one by AA length)
        tax_id: list. Unique list of taxonomy IDs for all forward loci
        lca: int or None. Lowest common ancestor of tax_ids
        hashes: list. MD5sums of protein sequences for forward_loci.
                len(hashes) gives the number of unique proteins matching

    loci: dict parsed from Loci lines in file. Mostly unchanged.
        fields from file: Locus, Sequence Count, Spectrum Count, Sequence Coverage,
            Length, MolWt, pI, Validation Status, NSAF, EMPAI, Descriptive Name
        fields added:
            reverse: boolean. True if `Locus` starts with "Reverse_"
            description: part after '||', if it exists

    peptides: dict parsed from peptide lines in file. Mostly unchanged.
        fields from file: Unique, FileName, XCorr, DeltCN, Conf%, M+H+, CalcM+H+,
            TotalIntensity, SpR, SpScore, IonProportion, Redundancy, Sequence
        fields added:
            aa_sequence: `Sequence` with the left and right flanking residues stripped
            is_modified: boolean. True if the peptide has PTMs
            unmod_peptide: peptide sequence without PTMs
            diff_mass: mass difference of PTMs
            mods: list of tuples: (AA (amino acid that is modified),
                pos (1-based position within peptide), mass (mass of this PTM))
            lc_step, scan, charge_state: parsed from `FileName`
    """

    def per_to_float(x):
        # '50%' -> 0.50
        return float(x[:-1]) / 100

    if get_tax:
        from ..analysis import taxonomy
        from pymongo import MongoClient
        if not taxDB:
            taxDB = MongoClient('wl-cmadmin.scripps.edu', 27017).TaxDB_072114.TaxDB_072114
        t = taxonomy.Taxonomy(host='wl-cmadmin.scripps.edu', port=27017)
    if check_peptides:
        protDB = MongoClient('wl-cmadmin.scripps.edu', 27018).ProtDB_072114.ProtDB_072114
    if get_hashes:
        client = MongoClient('wl-cmadmin.scripps.edu', 27017)
        redunDB = client.redunDB.redunDB

    locus_types = [str, int, int, per_to_float, int, int, float, str, float, float, str]
    peptide_types = [str, str, float, float, float, float, float, float, int, float,
                     float, int, str]
    peptide_types_ppm = [str, str, float, float, float, float, float, float, float,
                         int, float, float, int, str]
    peptide_types_pI = [str, str, float, float, float, float, float, float, int,
                        float, float, float, int, str]
    peptide_types_ppm_pI = [str, str, float, float, float, float, float, float, float,
                            int, float, float, float, int, str]

    with open(in_file) as f:
        line = next(f)  # Skip header.
        # TODO: parse it
        while not line.startswith('Locus'):
            line = next(f)
            if "--dm" in line and "--pI" in line:
                peptide_types = peptide_types_ppm_pI
            elif "--dm" in line:
                peptide_types = peptide_types_ppm
            elif "--pI" in line:
                peptide_types = peptide_types_pI
        # Read locus header
        locus_columns = line.strip().split('\t')
        # Read peptide header
        line = next(f)
        peptide_columns = line.rstrip().split('\t')
        line = next(f)

        # Read the rest of the file
        while line != '\tProteins\tPeptide IDs\tSpectra\n':
            protein = dict()
            # Read loci for a protein
            loci = []
            while not line.startswith('\t') and not line.startswith('*'):
                # While it starts with a number or "Reverse_"
                try:
                    loci.append(dict(zip(
                        locus_columns,
                        [x(y) for x, y in zip(locus_types, line.strip().split('\t'))])))
                except Exception:
                    print("parsing line failed: " + line)
                line = next(f)
            # Read peptides
            peptides = []
            while line[0] in ['\t', '*'] and line != '\tProteins\tPeptide IDs\tSpectra\n':
                peptides.append(dict(zip(
                    peptide_columns,
                    [x(y) for x, y in zip(peptide_types, line.rstrip().split('\t'))])))
                line = next(f)
            protein['peptides'] = peptides
            protein['loci'] = loci

            # Parse out reverse loci
            for l in protein['loci']:
                if l['Locus'].startswith('Reverse_'):
                    l['reverse'] = True
                    l['Locus'] = l['Locus'][8:]
                else:
                    l['reverse'] = False
                if l['Locus'].count("||"):
                    locus_split = l['Locus'].split('||')
                    l['description'] = '||'.join(locus_split[1:])
                    l['Locus'] = int(locus_split[0])
                else:
                    l['description'] = l['Locus']
                    try:
                        l['Locus'] = int(l['Locus'])
                    except ValueError:
                        l['Locus'] = 0

            # Are all loci for a protein reverse?
            ## - shouldn't this logic be changed to: 'are any loci reverse?' and if so, set 'Reverse' to True?
            ##   (because then we can't distinguish this peptide match from a fictional protein from a real protein)
            ## - There aren't typically many overlapping peptides between forward and reverse proteins anyway (~1%)
            if all(x['reverse'] for x in protein['loci']):
                protein['reverse'] = True
            else:
                protein['reverse'] = False

            if not return_reverse and protein['reverse']:
                # skip this one if we are skipping reverse loci
                continue

            # Pull out peptide sequences
            for p in protein['peptides']:
                p['aa_sequence'] = re.findall(r'\.(.*)\.', p['Sequence'])[0]
                p['is_modified'] = True if ')' in p['aa_sequence'] else False
                if p['is_modified']:
                    seq, mods = get_unmod_seq(p['aa_sequence'])
                    p.update(mods)
                else:
                    p['unmod_peptide'] = p['aa_sequence']
                # MudPIT salt step (chromatography method from Xcalibur)
                p['lc_step'] = get_lcstep(p['FileName'])
                # Scan number from instrument (unique per salt step) - from MS2 / SQT file
                p['scan'] = int(p['FileName'].split('.')[1])
                # predicted ion charge from instrument - from MS2 / SQT file
                p['charge_state'] = int(p['FileName'].split('.')[3])
                # To try to not break things
                p['LCStep'] = p['lc_step']
                p['Scan'] = p['scan']
                p['ChargeState'] = p['charge_state']
                p['AA_Sequence'] = p['aa_sequence']
                p['isModified'] = p['is_modified']

            protein['peptide_seq'] = list(set(x['aa_sequence'] for x in protein['peptides']))
            protein['unmod_peptide_seq'] = list(set(x['unmod_peptide'] for x in protein['peptides']))
            protein['quantification'] = sum(x['Redundancy'] for x in protein['peptides'])
            protein['forward_loci'] = [l['Locus'] for l in protein['loci'] if not l['reverse']]
            protein['all_loci'] = [l['Locus'] for l in protein['loci']]

            def is_good_db(s):
                s = s.lower()
                return "refseq" in s or "uniprot" in s or s == "hmp_reference_genomes"

            # get a "representative" locus
            if protDB:
                # pick the largest protein in one of dbs: ['RefSeq', 'UniProt*', 'HMP_Reference_Genomes']
                p_result = [x for x in protDB.find({'_id': {'$in': protein['forward_loci']}})
                            if is_good_db(x['r'])]
                if p_result:
                    protein['name'] = max([(len(p['s']), p['d']) for p in p_result],
                                          key=lambda x: x[0])[1]
            if 'name' not in protein:
                # the largest one in any db
                max_length = max(l['Length'] for l in protein['loci'])
                protein['name'] = [l['description'] for l in protein['loci']
                                   if l['Length'] == max_length][0]

            if get_tax:
                # get all possible taxIDs
                protDB_ids = protein['forward_loci']
                assert all(isinstance(x, int) for x in protDB_ids)
                taxIDs_doc = list(taxDB.aggregate([
                    {'$match': {'_id': {'$in': protDB_ids}}},
                    {'$group': {'_id': None, 'taxid': {'$addToSet': '$taxid'}}}]))
                if taxIDs_doc:
                    protein['tax_id'] = [x for x in taxIDs_doc[0]['taxid'] if x]
                    protein['lca'] = t.LCA(taxIDs_doc[0]['taxid'])
                else:
                    protein['tax_id'] = []
                    protein['lca'] = None
                # To try to not break things
                protein['LCA'] = protein['lca']

            if check_peptides:
                # Are all peptides found within the fasta sequences for all possible forward_loci?
                # Skip reverse loci. May want to change this to use all loci, regardless of forward or reverse,
                # to avoid some proteins not having these entries.
                # Keeping like this for now to keep compatibility with get_forward_loci lookup
                if protein['forward_loci']:
                    protein['protDB'] = list(protDB.find({'_id': {'$in': protein['forward_loci']}}))
                    defline, seq = zip(*[(x['d'], x['s']) for x in protein['protDB']])
                    protein['all_peptides_in_proteins'] = all(
                        all(p in s for p in protein['peptide_seq']) for s in seq)
                    if not protein['all_peptides_in_proteins']:
                        print('not all peptides in proteins' + str(protein['forward_loci'][0]))

            if get_hashes:
                protein['hashes'] = [x['_id'] for x in
                                     redunDB.find({'pID': {'$in': protein['forward_loci']}})]

            if small:
                protein['descriptions'] = [l['description'] for l in protein['loci']]
                del protein['loci']
                del protein['peptides']

            yield protein
from datetime import timedelta

from pymongo import MongoClient

db_col = MongoClient().get_database("DATA").get_collection("Brexit_old")

query = [
    {"$match": {"retweetedStatus": {"$exists": True}}},
    {"$group": {"_id": {"id": "$retweetedStatus.id",
                        "date": "$retweetedStatus.createdAt",
                        "name": "$retweetedStatus.user.screenName"},
                "retweets": {"$push": "$createdAt"}}}
]
# {"$match": {"retweets.length": {"$gte": 2}}}

cursor = db_col.aggregate(query, allowDiskUse=True)
l = {}
#
for i, c in enumerate(cursor):
    retweets = c["retweets"]
    if len(retweets) < 10:
        continue
    ll = []
    bb = {}
    l[c["_id"]["id"]] = bb
    original_date = c["_id"]["date"]
    for d in retweets:
        dif = d - original_date
        # ll.append(dif)
        dif_seconds = (divmod(dif.seconds, 60)[0])
Created on Fri Jun 3 15:23:06 2016

@author: xinruyue
"""
from pymongo import MongoClient
import sys

reload(sys)
sys.setdefaultencoding('utf8')

userAttr = MongoClient("10.8.8.111:27017")['cache']['userAttr']

# use "from" to split the data by platform
platform = ['teacher', 'mobile', 'ios', 'pc', 'android']

# get loc data
loc_data = []
for each in platform:
    pipeline = [
        {"$match": {"from": each}},
        {"$group": {"_id": "None", "location": {"$push": "$location"}}}]
    loc_data += list(userAttr.aggregate(pipeline))[0]['location']
print len(loc_data)

# save data as csv file
with open("loc_data.csv", 'w') as ld:
    for each in loc_data:
        if each is not None:
            ld.write(each + '\n')
ld.close()
count = get_entity_count(argv[2])
current_highest_count = get_highest_count()
if count == current_highest_count:
    # Refuse to decrement the only entity that holds the current highest count.
    matching = list(db.aggregate([
        {'$match': {'entities.count': current_highest_count}},
        {'$project': {
            'entities': {
                '$filter': {
                    'input': '$entities',
                    'as': 'entity',
                    'cond': {'$eq': ['$$entity.count', current_highest_count]}
                }
            },
            '_id': 0
        }}
    ]))
    if len(matching[0]['entities']) == 1:
        print("I can't let you do this, mortal.")
        exit()
db.update_one({"entities.name": argv[2]}, {'$inc': {
    VIPUsers = list(users.aggregate(pipeline))[0]['user']
    return VIPUsers


startTime = datetime.datetime(2016, 6, 9, 16)
userId = vip_users(startTime)
print len(userId)

location = []
for each in userId:
    pipeline = [
        {"$match": {"user": each}},
        {"$group": {"_id": None, "loc": {"$push": "$location"}}}]
    result = list(userAttr.aggregate(pipeline))
    if len(result) == 0:
        print each
    else:
        location += result[0]['loc']
print len(location)

csvfile = file('VIPUser_map_1.csv', 'wb')
writer = csv.writer(csvfile)
location.sort()
location_1 = []
map_data = {}
writer.writerow(['loc', 'num'])
doc = Document()
'''The first argument is the name of the field that gets displayed. The second
is the name of the field taken from the incoming collection.'''
doc.add('year', 'year')
doc.add('make', 'make')
group1 = GroupDocument(doc)
group1.addsum('numberOfProducedModels', 1)

doc2 = Document()
doc2.add('year', '_id.year')
doc2.add('numberOfProducedModels', 'numberOfProducedModels')
group2 = GroupDocument(doc2)
group2.addpush('makers', '_id.make')

sort = SortDocument()
limit = LimitDocument(10)
sort.addfield('_id.numberOfProducedModels')
out = OutDocument('outcollpycharmsortedlimit')

agg.append(group1)
agg.append(group2)
agg.append(limit)
agg.append(sort)
agg.append(out)

print agg
print json.dumps(coll.aggregate(agg), indent=4)