Example #1
class MongoCorpus(SimpleCorpus):
    """
        Corpus wrapper around a MongoDB collection.
        Subset corpus by setting a query. If "aggregate" is used,
        this will override "query". In this case use "$match" in
        aggregation method.
    """
    def __init__(self, db, collection, aggregate=None, query=None):
        # Avoid mutable default arguments; default to "no pipeline" / "match everything".
        self.client = MongoClient()[db][collection]
        self.aggregate_arg = aggregate if aggregate is not None else []
        self.find_arg = query if query is not None else {}

    def __iter__(self):
        """
            _obj_ is a dictionary: you can filter the right
            key to feed only docs text.
        """
        collection = self.client.find(self.find_arg, no_cursor_timeout=True) \
                        if len(self.aggregate_arg) == 0 \
                else self.client.aggregate(self.aggregate_arg)
        for doc in collection:
            yield doc

        collection.close()

    def __len__(self):
        if len(self.aggregate_arg) == 0:
            # Cursor.count() was removed from recent PyMongo; count_documents is the supported call.
            return self.client.count_documents(self.find_arg)
        else:
            d = next(self.client.aggregate(self.aggregate_arg +
                        [{"$group": {"_id": "null", "count": {"$sum": 1}}}]))
            return d['count']
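A minimal usage sketch for the class above, assuming a local MongoDB; the database, collection, and field names below are placeholders, not part of the original example.

# Hypothetical usage of MongoCorpus; "mydb", "docs" and the "text" field are assumptions.
corpus = MongoCorpus("mydb", "docs", query={"lang": "en"})
print(len(corpus))                      # number of matching documents
for doc in corpus:
    text = doc.get("text", "")          # feed only the text field downstream
    # ... tokenize / vectorize `text` here ...

# With an aggregation pipeline, "query" is ignored; use a "$match" stage instead.
corpus = MongoCorpus("mydb", "docs", aggregate=[{"$match": {"lang": "en"}}])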
Example #2
class RemoteIO:
    def __init__(self):
        time_counter(print_to_console=False)
        print("初始化 RemoteIO")
        self.db = MongoClient('192.168.68.11', 20000).get_database(
            "tokenizer_qiao").get_collection('splited_sentences')
        self.sentence_size = self.db.find().count()
        self.step = self.sentence_size
        self.skip = 0
        time_counter("初始化完毕")

    def read_sentence_randomly(self):
        while self.skip + self.step >= self.sentence_size:
            print("skip:%d, step:%d, size:%d" %
                  (self.skip, self.step, self.sentence_size))
            if self.step == 0:
                return None
            self.skip = 0
            self.step = int(self.step / 2)
        if self.step + self.skip < self.sentence_size:
            random_step = random.randint(0, self.step)
            # print("获取 skip:%d" % self.skip+random_step)
            pipeline = [{"$skip": self.skip + random_step}, {"$limit": 1}]
            self.skip += random_step
            docs = list(self.db.aggregate(pipeline))
            doc = docs[0] if len(docs) > 0 else None
            if doc is None:
                return None
            # Collection.update() is deprecated; update_one covers this single-document case.
            self.db.update_one({"_id": doc["_id"]}, {"$inc": {"analysed": 1}})
            time_counter("Fetched one sentence")
            return doc
        else:
            return None

    def read_sentence_from_remote(self):
        db = self.db
        return db.find()
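A rough driving loop for the class above; the "sentence" field name is an assumption about the documents in the hard-coded collection.

# Hypothetical driver loop for RemoteIO.
io = RemoteIO()
doc = io.read_sentence_randomly()
while doc is not None:
    print(doc.get("sentence"))          # "sentence" is an assumed field name
    doc = io.read_sentence_randomly()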
Example #3
class Count(object):
    """
    code field count
    """
    def __init__(self, code, year=None, type=None, factor=None):
        self.code = code
        self.year = year
        self.type = type
        self.factor = factor
        self.collection = MongoClient()["db5"]["values"]

    def aggregation(self):
        query = [{
            "$match": {
                "key": self.code,
                "year": self.year,
                "type": self.type
            }
        }, {
            "$group": {
                "_id": {
                    "code": "$key",
                    "year": "$year"
                },
                "total": {
                    "$sum": 1
                }
            }
        }]
        result = self.collection.aggregate(pipeline=query)
        return result

    def find(self):
        pass
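A usage sketch for the aggregation above; results come back grouped by code and year. The code, year, and type values are placeholders.

# Hypothetical usage; the code/year/type values are placeholders.
counter = Count(code="GDP", year=2019, type="country")
for row in counter.aggregation():
    print(row["_id"]["code"], row["_id"]["year"], row["total"])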
Example #4
class MongoDbCardStorage(BaseCardStorage):
    def __init__(self, database, **kwargs):
        self._cards = MongoClient(**kwargs)[database].cards

    def sample(self, k):
        return list(
            self._cards.aggregate([{
                '$sample': {
                    'size': k
                }
            }, {
                '$project': {
                    '_id': 0
                }
            }]))

    def insert(self, card):
        return self._cards.insert_one(card)

    def lookup(self, filter, projection=None):
        if projection is None:
            projection = {}
        projection['_id'] = False
        result = self._cards.find_one(filter, projection=projection)
        return result
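A short sketch of how the storage class might be used, assuming BaseCardStorage adds no required constructor arguments; the database name and card fields are placeholders.

# Hypothetical usage of MongoDbCardStorage.
storage = MongoDbCardStorage("cards_db", host="localhost", port=27017)
storage.insert({"name": "Lightning Bolt", "cost": 1})
print(storage.lookup({"name": "Lightning Bolt"}))   # _id is projected away
print(storage.sample(3))                            # up to 3 random cards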
Example #5
def check_perms_for_patch(resource, request, lookup):
    """
    Checks the object in the database to ensure that the user has permission to modify the existing data.
    Does not check user's permissions for new data - use pre-insert methods to check new data.
    """
    rsc = resource[:-6]  # take off "_write" from the end of the string
    current_app.logger.info(
        'Checking permissions for user {} to update {} object.'.format(
            g.username, rsc))
    oid = request.url.split('{}/'.format(resource))[1]

    updates = json.loads(request.data)

    pipeline = []
    pipeline.append({"$match": {"_id": ObjectId(oid=oid)}})
    pipeline = redact_field('', pipeline)  # Security at the top level
    sec_enabled_fields = get_security_enabled_fields(rsc)
    for key in updates.keys():
        if key in sec_enabled_fields:
            pipeline = redact_field(key, pipeline)

    # Get object from DB, confirm user has permissions to update the fields in the PATCH
    coll = MongoClient(current_app.config.get('MONGO_HOST'),
                       27017)[current_app.config.get('MONGO_DBNAME')][rsc]

    agg_result = list(coll.aggregate(pipeline))
    if len(agg_result) > 0:
        agg_result = agg_result[0]
    else:
        agg_result = {}

    # Check document level
    if "false" in agg_result.get('cat_matches') or "false" in agg_result.get(
            'diss_matches'):
        current_app.logger.info(
            'User {} has insufficient permissions to modify data in the {} object'
            .format(g.username, rsc))
        abort(403)

    # Check field level for all requested fields
    for key in updates.keys():
        val = agg_result.get(key)
        if type(val) == dict:
            if "false" in val.get('cat_matches', []):
                current_app.logger.info(
                    'User {} has insufficient permissions to modify data in the {} object'
                    .format(g.username, rsc))
                abort(403)
            if "false" in val.get('diss_matches', []):
                current_app.logger.info(
                    'User {} has insufficient permissions to modify data in the {} object'
                    .format(g.username, rsc))
                abort(403)
    current_app.logger.info(
        'User {} has sufficient permissions to modify data in the {} object.'.
        format(g.username, rsc))
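The helpers redact_field and get_security_enabled_fields are not shown here. Below is a hedged sketch of what redact_field could look like, consistent with how the aggregation result is checked above; the g.categories and g.dissemination attributes and the categories/dissemination document fields are assumptions, not part of the original code.

# Hypothetical sketch of redact_field -- NOT the original helper.
# It appends an $addFields stage that writes "<key>.cat_matches" /
# "<key>.diss_matches" arrays of "true"/"false" strings, which the
# document-level and field-level checks above then inspect.
def redact_field(key, pipeline):
    prefix = '{}.'.format(key) if key else ''
    pipeline.append({
        "$addFields": {
            prefix + "cat_matches": {
                "$map": {
                    "input": "$" + prefix + "categories",       # assumed field
                    "as": "cat",
                    "in": {"$cond": [{"$in": ["$$cat", list(g.categories)]},
                                     "true", "false"]},
                }
            },
            prefix + "diss_matches": {
                "$map": {
                    "input": "$" + prefix + "dissemination",    # assumed field
                    "as": "d",
                    "in": {"$cond": [{"$in": ["$$d", list(g.dissemination)]},
                                     "true", "false"]},
                }
            },
        }
    })
    return pipeline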
Example #6
class Database:
    def __init__(self):
        self.phrases = MongoClient(
            config.environ['database_clear']).bots.phrases

    def answer(self, text):
        answers = []
        for word in text.split():
            answers += self.phrases.aggregate([{
                '$match': {
                    'question': {
                        '$regex': word
                    }
                }
            }, {
                '$sample': {
                    'size': 1
                }
            }])
        if not answers:
            return 'Не понимаю тебя'  # "I don't understand you"
        return random.choice(list(answers))['message']

    @staticmethod
    def is_triggered(text):
        if random.randint(1, 100) == 1:
            return True
        for name in {'loshadkin', 'пасюк', 'лошадкин'}:
            if name in text.lower():
                return True
        return False

    def process_message(self, m):
        try:
            # sort() takes (key, direction) pairs; take the highest existing _id and add 1
            _id = self.phrases.find_one(sort=[('_id', -1)])['_id'] + 1
            question = m.reply_to_message.text if m.reply_to_message else m.text
            doc = {
                '_class': 'Phrase',
                '_id': _id,
                'question': question,
                'message': m.text
            }
            self.phrases.insert_one(doc)
        except:
            pass
def main():

    coll = MongoClient().db.taxi11
    agg = coll.aggregate([{'$sample': {'size': 500}}])
    i = 0
    for sample in agg:
        # print sample
        lat = sample['pickupLatitude']
        lon = sample['pickupLongitude']
        print lat
        print lon

        res = coll.find({
            'pickupLatitude': {
                '$gt': lat - 0.0001,
                '$lt': lat + 0.0001
            },
            'pickupLongitude': {
                '$gt': lon - 0.0001,
                '$lt': lon + 0.0001
            }
        })
        print res.count()
        if res.count() >= 10:
            file = open('data2/dat{}'.format(i), 'w')
            l = []
            bf = False
            for r in res:
                if r['pickupLatitude'] < 30 or r['pickupLongitude'] > -60 or r[
                        'dropoffLatitude'] < 30 or r['dropoffLongitude'] > -60:
                    bf = True
                    break
                l.append(r)
            if bf:
                print 'break'
                continue
            pickle.dump(l, file)
            i += 1
            file.close()
class DatabaseController:
    """
    A class that wraps a MongoDB collection of funding-platform posts, with
    methods to upsert new posts and aggregate raised amounts from the database.
    def __init__(self):
        self.post = MongoClient('localhost', 27017).client.fundingplatforms.posts
    def update(self, args):
        new_posts = [ { 'platform' : arg[0],
                        'title' : arg[1],
                        'summary' : arg[2],
                        'link' : arg[3],
                        'raised' : arg[4],
                        'pct_raised' : arg[5],
                        'days_remain' : arg[6] }
                    for arg in args ]
        for doc in new_posts:
            self.post.update_one({ 'title' : doc['title'] },{ '$set' : doc }, upsert=True)
    
    def raised(self, group_by=None, min_days = 10):
        if group_by == None:
            pipeline = [
                        { "$match" : { "days_remain" : { "$gte" : min_days }}},
                        {"$group": {"_id": None, "total" : {"$sum": "$raised"}}}
                        ]
        elif group_by == 'platform':
            pipeline = [
                        { "$match" : { "days_remain" : { "$gte" : min_days }}},
                        {"$group": {"_id": "$platform", "total" : {"$sum": "$raised"}}}
                        ]
        
        else: 
            raise ValueError("group_by must be None or 'platform'")
            
        agg = list(self.post.aggregate(pipeline))
        return { agg[i]['_id'] : agg[i]['total'] for i in range(len(agg)) }
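A usage sketch for the aggregations above; platform names and amounts are placeholder data.

# Hypothetical usage of DatabaseController with placeholder data.
dbc = DatabaseController()
dbc.update([("kickstarter", "Gadget", "A gadget", "http://example.com", 1200, 0.6, 15)])
print(dbc.raised())                      # {None: total raised across all platforms}
print(dbc.raised(group_by='platform'))   # {'kickstarter': 1200, ...}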
Example #9
    def __init__(self, server: str, db: str, collection: str, text_field,
                 **kwargs):
        fields = {'text': ('text', text_field)}
        col = MongoClient(f'mongodb://{server}/')[db][collection]
        cursor_text = col.aggregate([{
            '$unwind': '$text'
        }, {
            '$project': {
                'text': 1,
                '_id': 0
            }
        }])
        examples = [make_example(i, fields) for i in cursor_text]

        if isinstance(fields, dict):
            fields, field_dict = [], fields
            for field in field_dict.values():
                if isinstance(field, list):
                    fields.extend(field)
                else:
                    fields.append(field)

        super(Yelp2019, self).__init__(examples, fields, **kwargs)
        self.max_len = max([len(f.text) for f in self.examples])
Example #10
class Database:
    def __init__(self):
        self.connect_url = "mongodb://{}:{}@{}:{}/".format(
            USERNAME, PASSWORD, MONGODB_HOST, MONGODB_PORT)
        self.client = None

    def connect(self):
        try:
            self.client = MongoClient(self.connect_url)
            self.client = self.client[DATABASE_NAME]  # Selecting DB
            self.client = self.client[COLLECTION_NAME]  # Selecting Collection
            return [True, "Success"]
        except errors.ServerSelectionTimeoutError:
            return [False, "Failed to Connect DB"]
        except errors.ConfigurationError:
            return [False, "Configurarion Error"]
        except errors.ConnectionFailure:
            return [False, "Connection Failure"]

    def list_documents(self):
        try:
            cursor = self.client.find({}, {'_id': False})
            return [True, "Success", cursor]
        except Exception:
            return [False, "Internal Error"]

    def create_document(self, document):
        try:
            self.client.insert_one(document)
            return [True, "Success"]
        except errors.DuplicateKeyError:
            return [False, "The config name is already exist"]

    def get_document(self, doc):
        try:
            cursor = self.client.find_one({"name": doc}, {'_id': False})
            if cursor is not None:
                return [True, "Success", cursor]
            else:
                return [False, "No document found"]
        except Exception:
            return [False, "Internal Error"]

    def update_document(self, config_name, data):
        try:
            result = self.client.replace_one({"name": config_name}, {
                "name": config_name,
                "data": data
            })
            if result.acknowledged:
                return [True, "Success", result]
            else:
                raise Exception
        except Exception:
            return [False, "Internal Error"]

    def purge_document(self, document):
        try:
            cursor = self.client.delete_one({"name": document})
            return [True, "Success", cursor.deleted_count]
        except Exception:
            return [False, "Internal Error"]

    def query(self, key, val, config_name=None):
        if config_name is None:
            query = '[{{"$match": {{}}}}, {{"$unwind":"$data"}}, {{"$match":{{"data.{0}": "{1}"}}}}]'
            replaced_query = json.loads(query.format(key, val))
        else:
            query = '[{{"$match": {{"name": "{0}"}}}}, {{"$unwind":"$data"}}, {{"$match":{{"data.{1}": "{2}"}}}}]'
            replaced_query = json.loads(query.format(config_name, key, val))
        try:
            cursor = self.client.aggregate(replaced_query)
            if cursor is not None:
                return [True, "Success", cursor]
            else:
                return [False, "No document found"]
        except Exception as e:
            return [False, str(e)]
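A sketch of how this wrapper might be driven, assuming the module-level constants (USERNAME, PASSWORD, MONGODB_HOST, MONGODB_PORT, DATABASE_NAME, COLLECTION_NAME) are configured; the document contents are placeholders.

# Hypothetical usage of the Database wrapper.
db = Database()
ok, msg = db.connect()
if ok:
    db.create_document({"name": "web-01", "data": [{"env": "prod"}]})
    result = db.query("env", "prod")
    if result[0]:
        for doc in result[2]:
            print(doc)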
            '$gte': start_num,
            '$lt': end_num
        },
        "shop.name": {
            '$not': regex
        },
        'source': {
            '$ne': 'lazycat'
        }
    })
    pool.map(move_express, [e_cursor])

    # ==> migrate expr_trace
    pipe = [{
        "$match": {
            "expr_num": {
                "$gte": start_num,
                '$lt': end_num
            }
        }
    }, {
        "$group": {
            '_id': '$expr_num',
            "expr_num": {
                '$first': '$expr_num'
            }
        }
    }]
    t_cursor = mc_trace_old.aggregate(pipeline=pipe)
    pool.map(move_trace, [t_cursor])
Example #12
def dta_select_parser(in_file, small=False, get_tax=False, check_peptides=False, get_hashes=False, 
                      return_reverse=True, taxDB=None, protDB=None):
    """ steps through and parses a DTASelect-filter.txt file (generator function)
    :param in_file: path to DTASelect-filter.txt file
    :param small: get rid of keys: 'loci' and 'peptides'
    :param get_tax: Look up taxonomy information for each protDB ID
    :param check_peptides: Check if all peptide sequences are in all proteins sequences in 'forward_loci'
    :param get_hashes:
    :param return_reverse: include reverse loci
    :param taxDB: mongo collection to the taxDB (containing protdbID -> taxid mapping) or None
    :param protDB: mongo collection to protDB (protdbID information). If protDB is given, a better protein['name'] will be attempted
    :type in_file: str

        get_forward_loci is removed
        noProtDB is removed. If the locus contains a "||", the part before the pipes is determined to be the protID

    protein is a dict with (possible) fields:
        loci: list of locus dicts. described below
        peptides: list of peptide dicts. described below
        reverse: boolean. True if all loci for a protein are reverse
        peptide_seq: set. All peptide amino acid sequences
        forward_loci: list. 'Locus' fields in loci for forward loci
        all_loci: list. 'Locus' fields in loci
        name: string. The "representative" locus (the largest one by AA length)
        tax_id: list. Unique list of taxonomy IDs for all forward loci
        lca: Int or None. Lowest common ancestor of tax_ids
        hashes: list. MD5sums of protein sequences for forward_loci. len(hashes) gives the number of unique proteins matching

    loci: dict parsed from Loci lines in file. Mostly unchanged
        fields from file: Locus, Sequence Count, Spectrum Count, Sequence Coverage, Length, MolWt, pI, Validation Status, NSAF, EMPAI, Descriptive Name
        fields added:
            reverse: boolean. True if `locus` starts with "Reverse_"
            description: part after '||', if it exists
    peptides: dict parsed from peptide lines in file. Mostly unchanged.
        fields from file: Unique, FileName, XCorr, DeltCN, Conf%, M+H+, CalcM+H+, TotalIntensity, SpR, SpScore, IonProportion, Redundancy, Sequence
        fields added:
            aa_sequence: `Sequence` with the left and right sequence stripped
            is_modified: boolean. True if the peptide has PTMs
            unmod_peptide: peptide sequence without PTMs
            diff_mass: mass difference of PTMs
            mods: list of tuples: (AA (amino acid that is modified), pos (1-based position within peptide), mass (mass of this PTM))
            lc_step, scan, charge_state:  parsed from `FileName`

    """

    def per_to_float(x):
        # '50%' -> 0.50
        return float(x[:-1]) / 100

    if get_tax:
        from ..analysis import taxonomy
        from pymongo import MongoClient
        if not taxDB:
            taxDB = MongoClient('wl-cmadmin.scripps.edu', 27017).TaxDB_072114.TaxDB_072114
        t = taxonomy.Taxonomy(host='wl-cmadmin.scripps.edu', port=27017)

    if check_peptides:
        protDB = MongoClient('wl-cmadmin.scripps.edu', 27018).ProtDB_072114.ProtDB_072114

    if get_hashes:
        client = MongoClient('wl-cmadmin.scripps.edu', 27017)
        redunDB = client.redunDB.redunDB
        
        
    locus_types = [str, int, int, per_to_float, int, int, float, str, float, float, str]
    peptide_types = [str, str, float, float, float, float, float, float, int, float, float, int, str]
    peptide_types_ppm = [str, str, float, float, float, float, float, float, float, int, float, float, int, str]
    with open(in_file) as f:
        line = next(f)

        # Skip header.
        # TODO: parse it
        while not line.startswith('Locus'):
            line = next(f)
            if "--dm" in line:
                peptide_types = peptide_types_ppm

        # Read locus header
        locus_columns = line.strip().split('\t')

        # Read peptide header
        line = next(f)
        peptide_columns = line.rstrip().split('\t')
        line = next(f)

        # Read the rest of the file
        while line != '\tProteins\tPeptide IDs\tSpectra\n':
            protein = dict()
            # Read loci for a protein
            loci = []
            while not line.startswith('\t') and not line.startswith('*'):  # while the line starts with a locus (a number or "Reverse_...")
                try:
                    loci.append(dict(zip(locus_columns, [x(y) for x, y in zip(locus_types, line.strip().split('\t'))])))
                except:
                    print("parsing line failed: " + line)
                line = next(f)

            # Read peptides
            peptides = []
            while line[0] in ['\t', '*'] and line != '\tProteins\tPeptide IDs\tSpectra\n':
                peptides.append(
                    dict(zip(peptide_columns, [x(y) for x, y in zip(peptide_types, line.rstrip().split('\t'))])))
                line = next(f)

            protein['peptides'] = peptides
            protein['loci'] = loci

            # Parse out reverse loci
            for l in protein['loci']:
                if l['Locus'].startswith('Reverse_'):
                    l['reverse'] = True
                    l['Locus'] = l['Locus'][8:]
                else:
                    l['reverse'] = False
                if l['Locus'].count("||"):
                    locus_split = l['Locus'].split('||')
                    l['description'] = '||'.join(locus_split[1:])
                    l['Locus'] = int(locus_split[0])
                else:
                    l['description'] = l['Locus']
                    l['Locus'] = int(l['Locus'])

            # Are all loci for a protein reverse?
            ## - shouldn't this logic be changed to: 'are any loci reverse?' and if so, set 'Reverse' to True?
            ## (because then we can't distinguish this peptide match from a fictional protein from a real protein)
            ## - There aren't typically many overlapping peptides between forward and reverse proteins anyway (~1%)
            if all([l['reverse'] for l in protein['loci']]):
                protein['reverse'] = True
            else:
                protein['reverse'] = False

            if not return_reverse and protein['reverse']:  # skip this one if we are skipping reverse loci
                continue

            # Pull out peptide sequences
            for p in protein['peptides']:
                p['aa_sequence'] = re.findall(r'\.(.*)\.', p['Sequence'])[0]
                p['is_modified'] = True if ')' in p['aa_sequence'] else False
                if p['is_modified']:
                    seq, mods = get_unmod_seq(p['aa_sequence'])                 
                    p.update(mods)
                else:
                    p['unmod_peptide'] = p['aa_sequence']

                # MudPIT salt step (chromatography method from Xcalibur)
                p['lc_step'] = get_lcstep(p['FileName'])

                # Scan number from instrument (unique per salt step) - from MS2 / SQT file
                p['scan'] = int(p['FileName'].split('.')[1])

                # predicted ion charge from instrument - from MS2 / SQT file
                p['charge_state'] = int(p['FileName'].split('.')[3])

                # To try to not break things
                p['LCStep'] = p['lc_step']
                p['Scan'] = p['scan']
                p['ChargeState'] = p['charge_state']
                p['AA_Sequence'] = p['aa_sequence']
                p['isModified'] = p['is_modified']

            protein['peptide_seq'] = list(set((x['aa_sequence'] for x in protein['peptides'])))
            protein['unmod_peptide_seq'] = list(set((x['unmod_peptide'] for x in protein['peptides'])))
            protein['quantification'] = sum([x['Redundancy'] for x in protein['peptides']])
            protein['forward_loci'] = [l['Locus'] for l in protein['loci'] if not l['reverse']]
            protein['all_loci'] = [l['Locus'] for l in protein['loci']]
            
            def is_good_db(s):
                s = s.lower()
                if "refseq" in s or "uniprot" in s or s=="hmp_reference_genomes":
                    return True
                else:
                    return False

            # get a "representative" locus
            if protDB:
                # pick the largest protein in one of dbs: ['RefSeq','UniProt*', 'HMP_Reference_Genomes']
                p_result = [x for x in protDB.find({'_id':{'$in':protein['forward_loci']}}) if is_good_db(x['r'])]
                if p_result:
                    protein['name'] = max([(len(p['s']),p['d']) for p in p_result], key=lambda x:x[0])[1]
            if 'name' not in protein:                    
                # the largest one in any db
                max_length = max(l['Length'] for l in protein['loci'])
                protein['name'] = [l['description'] for l in protein['loci'] if l['Length'] == max_length][0]
            
            if get_tax:
                # get all possible taxIDs
                protDB_ids = protein['forward_loci']
                assert all(isinstance(x, int) for x in protDB_ids)
                taxIDs_doc = list(taxDB.aggregate(
                    [{'$match': {'_id': {'$in': protDB_ids}}},
                     {'$group': {'_id': None, 'taxid': {'$addToSet': '$taxid'}}}]))
                if taxIDs_doc:
                    protein['tax_id'] = [x for x in taxIDs_doc[0]['taxid'] if x]
                    protein['lca'] = t.LCA(taxIDs_doc[0]['taxid'])
                else:
                    protein['tax_id'] = []
                    protein['lca'] = None
                # To try to not break things
                protein['LCA'] = protein['lca']

            if check_peptides:
                # Are all peptides found within the fasta sequences for all possible forward_loci ?
                # Skip reverse loci. May want to change this to use all loci, regardless of forward or reverse
                # to avoid some proteins not having these entries.
                # Keeping like this for now to keep compatibility with get_forward_loci lookup
                if protein['forward_loci']:
                    protein['protDB'] = list(protDB.find({'_id': {'$in': protein['forward_loci']}}))
                    defline, seq = zip(*[(x['d'], x['s']) for x in protein['protDB']])
                    protein['all_peptides_in_proteins'] = all(
                        [all([p in s for p in protein['peptide_seq']]) for s in seq])
                    if not protein['all_peptides_in_proteins']:
                        print('not all peptides in proteins' + str(protein['forward_loci'][0]))

            if get_hashes:
                protein['hashes'] = [x['_id'] for x in redunDB.find({'pID': {'$in': protein['forward_loci']}})]

            if small:
                protein['descriptions'] = [l['description'] for l in protein['loci']]
                del protein['loci']
                del protein['peptides']

            yield protein
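A minimal sketch of driving the generator above; 'DTASelect-filter.txt' is a placeholder path, and the optional MongoDB-backed lookups stay disabled.

# Hypothetical usage; the input path is a placeholder.
for protein in dta_select_parser('DTASelect-filter.txt', small=True, return_reverse=False):
    print(protein['name'], protein['quantification'], len(protein['peptide_seq']))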
Example #13
# 3rd: query mongodb cluster
# sql = "select * from `analytics.1` limit 2"
#
# pipeline = [
#     {
#         '$sql': {
#             'statement': sql,
#             'format': "jdbc",
#             'dialect': "mysql",
#         }
#     }
# ]
#
# r = conn.aggregate(pipeline)
# pprint(list(r))

# 4th: query S3 via atlas data lake
sql = "select * from `clickstream` limit 2"

pipeline = [{
    '$sql': {
        'statement': sql,
        'format': "jdbc",
        'dialect': "mysql",
    }
}]

r = conn.aggregate(pipeline)
pprint(list(r))
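For context, conn above is a PyMongo collection handle; a hedged sketch of how it might be created, assuming an Atlas Data Lake / Atlas SQL endpoint (the connection string, database, and collection names are placeholders).

# Hypothetical setup; the endpoint, database, and collection names are placeholders.
from pprint import pprint
from pymongo import MongoClient

DATA_LAKE_URI = "mongodb://user:password@your-data-lake-endpoint.example.net/?tls=true"
client = MongoClient(DATA_LAKE_URI)
conn = client["datalake_db"]["clickstream"]   # the $sql stages above run against this handle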
Example #14
from pymongo import MongoClient
import pprint
from pymongo import ASCENDING
db = MongoClient().get_database("DATA").get_collection("Twitter_Breixt_9month")

cur = db.aggregate([{ "$group": { "_id": { "id": "$id" },
                            "uniqueIds": { "$addToSet": "$_id" },
                            "count": { "$sum": 1 } } },
              { "$match": { "count": { "$gt": 1 } } }], allowDiskUse=True)

duplicateIds = list(cur)

pprint.pprint(duplicateIds)
raw_input("Any button to remove")

for doc in duplicateIds:
    index = 1
    print doc["uniqueIds"]
    while index < doc["uniqueIds"].length:
        db.delete_one(doc["uniqueIds"][index])
        index += 1
    print index
    print


print db.createIndex({"id":ASCENDING},unique=True)
print "Done"
    }
}, {
    "$match": {
        "numPublications": {
            "$gt": 1
        }
    }
}, {
    "$sort": {
        "numPublications": 1
    }
}, {
    "$limit": 2
}]

pprint.pprint(list(article.aggregate(pipeline_abstracts)))

# journal ranking by number of publications

print("Nombre de publications par journal")
pipeline_journal = [{
    "$group": {
        "_id": {
            "journal": "$journal"
        },
        "numPublications": {
            "$sum": 1
        }
    }
}, {
    "$match": {
Example #16
    indices_file = "indices_" + brexit + ".json"
    graph = "g_" + brexit + ".graphml"


if False:
    limit = 2000

    hashtag_key = Status.SCHEMA_MAP[schema_id]["hashtags"]
    top_user_query = [
        {"$match": {Status.SCHEMA_MAP[schema_id]["retweeted_status"]: {"$exists": False}, "lang": "en"}},
        {"$unwind": "$" + hashtag_key},
        {"$group": {"_id": {"$toLower": '$' + hashtag_key + '.' + 'text'}, "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": limit}]

    top_hastags = db_col.aggregate(top_user_query, allowDiskUse=True)
    a = list(top_hastags)
    print a

    with open(top_hashtag_file, "w") as f:
        json.dump(a, f)
elif False:
    top_hasthag_count = json.load(open(top_hashtag_file))
    top_hasthag_count = sorted(top_hasthag_count, key=lambda x: x["count"], reverse=True)
    top_hashtags = set([i["_id"] for i in top_hasthag_count if i["_id"]])
    # not in ["ivoted", "brexit"]])
    print top_hasthag_count
    print top_hashtags

    n = len(top_hashtags)
    coocurences_array = np.zeros([n,n], dtype=int)
Example #17
'''The first is the name of the field that is displayed. The second is the name
of the field taken from the incoming collection.'''

doc.add('year', 'year')
doc.add('make', 'make')
group1 = GroupDocument(doc)
group1.addsum('numberOfProducedModels', 1)

doc2 = Document()
doc2.add('year', '_id.year')
doc2.add('numberOfProducedModels', 'numberOfProducedModels')
group2 = GroupDocument(doc2)
group2.addpush('makers', '_id.make')

sort = SortDocument()
limit = LimitDocument(10)
sort.addfield('_id.numberOfProducedModels')

out = OutDocument('outcollpycharmsortedlimit')

agg.append(group1)
agg.append(group2)
agg.append(limit)
agg.append(sort)
agg.append(out)


print agg

print json.dumps(coll.aggregate(agg), indent=4)
'''
from pymongo import MongoClient

if __name__ == '__main__':
    c = MongoClient('mongodb://localhost:27017').logs.nginx
    print(f'{c.count_documents({})} logs')
    print('Methods:')
    print(f'\tmethod GET: {c.count_documents({"method": "GET"})}')
    print(f'\tmethod POST: {c.count_documents({"method": "POST"})}')
    print(f'\tmethod PUT: {c.count_documents({"method": "PUT"})}')
    print(f'\tmethod PATCH: {c.count_documents({"method": "PATCH"})}')
    print(f'\tmethod DELETE: {c.count_documents({"method": "DELETE"})}')
    print(f'{c.count_documents({"path": "/status"})} status check')
    print('IPs:')
    ips = c.aggregate([{
        '$group': {
            '_id': '$ip',
            'count': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            'count': -1
        }
    }, {
        '$limit': 10
    }])
    for ip in ips:
        print(f'\t{ip.get("_id")}: {ip.get("count")}')
     tomorrow = tomorrow - dt.timedelta(days=1)
     today = tomorrow - dt.timedelta(days=1)
     name = f'news/{today.year}-{0 if today.month < 10 else ""}{today.month}-{0 if today.day < 10 else ""}{today.day}.npy'
     if not os.path.exists(name):
         print(today, tomorrow)
         # ms = messages.find({'_date': {'$gte': today, '$lt': tomorrow}, 'elmo': {'$exists': True}})
         ms = messages.aggregate([
             {
                 '$match': {
                     '_date': {
                         '$gte': today,
                         '$lt': tomorrow
                     },
                     'elmo': {
                         '$exists': True
                     }
                 }
             },
             # {'$match': {'elmo': {'$exists': True}}},
             {
                 '$sample': {
                     'size': 64
                 }
             }
         ])
         elmo = [pickle.loads(m['elmo']) for m in ms]
         if not elmo:
             continue
         elmo = np.stack(elmo).astype(np.float32)
         np.save(name, elmo)
 # t0 = time.time()
Example #20
def dta_select_parser(in_file,
                      small=False,
                      get_tax=False,
                      check_peptides=False,
                      get_hashes=False,
                      return_reverse=True,
                      taxDB=None,
                      protDB=None):
    """ steps through and parses a DTASelect-filter.txt file (generator function)
    :param in_file: path to DTASelect-filter.txt file
    :param small: get rid of keys: 'loci' and 'peptides'
    :param get_tax: Look up taxonomy information for each protDB ID
    :param check_peptides: Check if all peptide sequences are in all proteins sequences in 'forward_loci'
    :param get_hashes:
    :param return_reverse: include reverse loci
    :param taxDB: mongo collection to the taxDB (containing protdbID -> taxid mapping) or None
    :param protDB: mongo collection to protDB (protdbID information). If protDB is given, a better protein['name'] will be attempted
    :type in_file: str

        get_forward_loci is removed
        noProtDB is removed. If the locus contains a "||", the part before the pipes is determined to be the protID

    protein is a dict with (possible) fields:
        loci: list of locus dicts. described below
        peptides: list of peptide dicts. described below
        reverse: boolean. True if all loci for a protein are reverse
        peptide_seq: set. All peptide amino acid sequences
        forward_loci: list. 'Locus' fields in loci for forward loci
        all_loci: list. 'Locus' fields in loci
        name: string. The "representative" locus (the largest one by AA length)
        tax_id: list. Unique list of taxonomy IDs for all forward loci
        lca: Int or None. Lowest common ancestor of tax_ids
        hashes: list. MD5sums of protein sequences for forward_loci. len(hashes) gives the number of unique proteins matching

    loci: dict parsed from Loci lines in file. Mostly unchanged
        fields from file: Locus, Sequence Count, Spectrum Count, Sequence Coverage, Length, MolWt, pI, Validation Status, NSAF, EMPAI, Descriptive Name
        fields added:
            reverse: boolean. True if `locus` starts with "Reverse_"
            description: part after '||', if it exists
    peptides: dict parsed from peptide lines in file. Mostly unchanged.
        fields from file: Unique, FileName, XCorr, DeltCN, Conf%, M+H+, CalcM+H+, TotalIntensity, SpR, SpScore, IonProportion, Redundancy, Sequence
        fields added:
            aa_sequence: `Sequence` with the left and right sequence stripped
            is_modified: boolean. True if the peptide has PTMs
            unmod_peptide: peptide sequence without PTMs
            diff_mass: mass difference of PTMs
            mods: list of tuples: (AA (amino acid that is modified), pos (1-based position within peptide), mass (mass of this PTM))
            lc_step, scan, charge_state:  parsed from `FileName`

    """
    def per_to_float(x):
        # '50%' -> 0.50
        return float(x[:-1]) / 100

    if get_tax:
        from ..analysis import taxonomy
        from pymongo import MongoClient
        if not taxDB:
            taxDB = MongoClient('wl-cmadmin.scripps.edu',
                                27017).TaxDB_072114.TaxDB_072114
        t = taxonomy.Taxonomy(host='wl-cmadmin.scripps.edu', port=27017)

    if check_peptides:
        protDB = MongoClient('wl-cmadmin.scripps.edu',
                             27018).ProtDB_072114.ProtDB_072114

    if get_hashes:
        client = MongoClient('wl-cmadmin.scripps.edu', 27017)
        redunDB = client.redunDB.redunDB

    locus_types = [
        str, int, int, per_to_float, int, int, float, str, float, float, str
    ]
    peptide_types = [
        str, str, float, float, float, float, float, float, int, float, float,
        int, str
    ]
    peptide_types_ppm = [
        str, str, float, float, float, float, float, float, float, int, float,
        float, int, str
    ]
    peptide_types_pI = [
        str, str, float, float, float, float, float, float, int, float, float,
        float, int, str
    ]
    peptide_types_ppm_pI = [
        str, str, float, float, float, float, float, float, float, int, float,
        float, float, int, str
    ]
    with open(in_file) as f:
        line = next(f)

        # Skip header.
        # TODO: parse it

        while not line.startswith('Locus'):
            line = next(f)
            if "--dm" in line and "--pI" in line:
                peptide_types = peptide_types_ppm_pI
            elif "--dm" in line:
                peptide_types = peptide_types_ppm
            elif "--pI" in line:
                peptide_types = peptide_types_pI

        # Read locus header
        locus_columns = line.strip().split('\t')

        # Read peptide header
        line = next(f)
        peptide_columns = line.rstrip().split('\t')
        line = next(f)

        # Read the rest of the file
        while line != '\tProteins\tPeptide IDs\tSpectra\n':
            protein = dict()
            # Read loci for a protein
            loci = []
            while not line.startswith('\t') and not line.startswith(
                    '*'):  # while the line starts with a locus (a number or "Reverse_...")
                try:
                    loci.append(
                        dict(
                            zip(locus_columns, [
                                x(y) for x, y in zip(locus_types,
                                                     line.strip().split('\t'))
                            ])))
                except:
                    print("parsing line failed: " + line)
                line = next(f)

            # Read peptides
            peptides = []
            while line[0] in [
                    '\t', '*'
            ] and line != '\tProteins\tPeptide IDs\tSpectra\n':
                peptides.append(
                    dict(
                        zip(peptide_columns, [
                            x(y) for x, y in zip(peptide_types,
                                                 line.rstrip().split('\t'))
                        ])))
                line = next(f)

            protein['peptides'] = peptides
            protein['loci'] = loci

            # Parse out reverse loci
            for l in protein['loci']:
                if l['Locus'].startswith('Reverse_'):
                    l['reverse'] = True
                    l['Locus'] = l['Locus'][8:]
                else:
                    l['reverse'] = False
                if l['Locus'].count("||"):
                    locus_split = l['Locus'].split('||')
                    l['description'] = '||'.join(locus_split[1:])
                    l['Locus'] = int(locus_split[0])
                else:
                    l['description'] = l['Locus']
                    try:
                        l['Locus'] = int(l['Locus'])
                    except ValueError:
                        l['Locus'] = 0

            # Are all loci for a protein reverse?
            ## - shouldn't this logic be changed to: 'are any loci reverse?' and if so, set 'Reverse' to True?
            ## (because then we can't distinguish this peptide match from a fictional protein from a real protein)
            ## - There aren't typically many overlapping peptides between forward and reverse proteins anyway (~1%)
            if all([l['reverse'] for l in protein['loci']]):
                protein['reverse'] = True
            else:
                protein['reverse'] = False

            if not return_reverse and protein[
                    'reverse']:  # skip this one if we are skipping reverse loci
                continue

            # Pull out peptide sequences
            for p in protein['peptides']:
                p['aa_sequence'] = re.findall(r'\.(.*)\.', p['Sequence'])[0]
                p['is_modified'] = True if ')' in p['aa_sequence'] else False
                if p['is_modified']:
                    seq, mods = get_unmod_seq(p['aa_sequence'])
                    p.update(mods)
                else:
                    p['unmod_peptide'] = p['aa_sequence']

                # MudPIT salt step (chromatography method from Xcalibur)
                p['lc_step'] = get_lcstep(p['FileName'])

                # Scan number from instrument (unique per salt step) - from MS2 / SQT file
                p['scan'] = int(p['FileName'].split('.')[1])

                # predicted ion charge from instrument - from MS2 / SQT file
                p['charge_state'] = int(p['FileName'].split('.')[3])

                # To try to not break things
                p['LCStep'] = p['lc_step']
                p['Scan'] = p['scan']
                p['ChargeState'] = p['charge_state']
                p['AA_Sequence'] = p['aa_sequence']
                p['isModified'] = p['is_modified']

            protein['peptide_seq'] = list(
                set((x['aa_sequence'] for x in protein['peptides'])))
            protein['unmod_peptide_seq'] = list(
                set((x['unmod_peptide'] for x in protein['peptides'])))
            protein['quantification'] = sum(
                [x['Redundancy'] for x in protein['peptides']])
            protein['forward_loci'] = [
                l['Locus'] for l in protein['loci'] if not l['reverse']
            ]
            protein['all_loci'] = [l['Locus'] for l in protein['loci']]

            def is_good_db(s):
                s = s.lower()
                if "refseq" in s or "uniprot" in s or s == "hmp_reference_genomes":
                    return True
                else:
                    return False

            # get a "representative" locus
            if protDB:
                # pick the largest protein in one of dbs: ['RefSeq','UniProt*', 'HMP_Reference_Genomes']
                p_result = [
                    x for x in protDB.find(
                        {'_id': {
                            '$in': protein['forward_loci']
                        }}) if is_good_db(x['r'])
                ]
                if p_result:
                    protein['name'] = max([(len(p['s']), p['d'])
                                           for p in p_result],
                                          key=lambda x: x[0])[1]
            if 'name' not in protein:
                # the largest one in any db
                max_length = max(l['Length'] for l in protein['loci'])
                protein['name'] = [
                    l['description'] for l in protein['loci']
                    if l['Length'] == max_length
                ][0]

            if get_tax:
                # get all possible taxIDs
                protDB_ids = protein['forward_loci']
                assert all(isinstance(x, int) for x in protDB_ids)
                taxIDs_doc = list(
                    taxDB.aggregate([{
                        '$match': {
                            '_id': {
                                '$in': protDB_ids
                            }
                        }
                    }, {
                        '$group': {
                            '_id': None,
                            'taxid': {
                                '$addToSet': '$taxid'
                            }
                        }
                    }]))
                if taxIDs_doc:
                    protein['tax_id'] = [
                        x for x in taxIDs_doc[0]['taxid'] if x
                    ]
                    protein['lca'] = t.LCA(taxIDs_doc[0]['taxid'])
                else:
                    protein['tax_id'] = []
                    protein['lca'] = None
                # To try to not break things
                protein['LCA'] = protein['lca']

            if check_peptides:
                # Are all peptides found within the fasta sequences for all possible forward_loci ?
                # Skip reverse loci. May want to change this to use all loci, regardless of forward or reverse
                # to avoid some proteins not having these entries.
                # Keeping like this for now to keep compatibility with get_forward_loci lookup
                if protein['forward_loci']:
                    protein['protDB'] = list(
                        protDB.find({'_id': {
                            '$in': protein['forward_loci']
                        }}))
                    defline, seq = zip(*[(x['d'], x['s'])
                                         for x in protein['protDB']])
                    protein['all_peptides_in_proteins'] = all([
                        all([p in s for p in protein['peptide_seq']])
                        for s in seq
                    ])
                    if not protein['all_peptides_in_proteins']:
                        print('not all peptides in proteins' +
                              str(protein['forward_loci'][0]))

            if get_hashes:
                protein['hashes'] = [
                    x['_id'] for x in redunDB.find(
                        {'pID': {
                            '$in': protein['forward_loci']
                        }})
                ]

            if small:
                protein['descriptions'] = [
                    l['description'] for l in protein['loci']
                ]
                del protein['loci']
                del protein['peptides']

            yield protein
Example #21
from datetime import timedelta

db_col = MongoClient().get_database("DATA").get_collection("Brexit_old")


query = [
    {"$match":{"retweetedStatus":{"$exists":True}}},
    {"$group":{"_id":{"id":"$retweetedStatus.id","date":"$retweetedStatus.createdAt",
                      "name":"$retweetedStatus.user.screenName"},
               "retweets":{"$push":"$createdAt"}}}
]
# {"$match":{"retweets.length":{"$gte":2}}}


cursor = db_col.aggregate(query, allowDiskUse = True)
l = {}

#
for i,c in enumerate(cursor):
    retweets = c["retweets"]
    if len(retweets) < 10:
        continue
    ll = []
    bb = {}
    l[c["_id"]["id"]] = bb
    original_date = c["_id"]["date"]
    for d in retweets:
        dif = d - original_date
        # ll.append(dif)
        dif_seconds =(divmod(dif.seconds, 60)[0])
Example #22
Created on Fri Jun  3 15:23:06 2016

@author: xinruyue
"""
from pymongo import MongoClient
import sys

reload(sys)
sys.setdefaultencoding('utf8')

userAttr = MongoClient("10.8.8.111:27017")['cache']['userAttr']

#user "from" to split data 
platform = ['teacher','mobile','ios','pc','android']
#get loc data
loc_data = []
for each in platform:
    pipeline = [
    {"$match":{"from":each}},
    {"$group":{"_id":"None","location":{"$push":"$location"}}}]
    
    loc_data += list(userAttr.aggregate(pipeline))[0]['location']
print len(loc_data)

#save data as csvfile
with open ("loc_data.csv",'w') as ld:
    for each in loc_data:
        if each != None:
            ld.write(each + '\n')
Example #23
        count = get_entity_count(argv[2])
        current_highest_count = get_highest_count()

        if count == current_highest_count:
            if list(
                    db.aggregate([{
                        '$match': {
                            'entities.count': current_highest_count
                        }
                    }, {
                        '$project': {
                            "entities": {
                                '$filter': {
                                    "input": "$entities",
                                    "as": "entity",
                                    "cond": {
                                        '$eq': [
                                            '$$entity.count',
                                            current_highest_count
                                        ]
                                    }
                                }
                            },
                            '_id': 0
                        }
                    }]))[0]['entities'].__len__() == 1:
                print("I can't let you do this, mortal.")
                exit()

        db.update_one({"entities.name": argv[2]},
                      {'$inc': {
    VIPUsers = list(users.aggregate(pipeline))[0]['user']
    return VIPUsers


startTime = datetime.datetime(2016,6,9,16)
userId = vip_users(startTime)

print len(userId)

location = []
for each in userId:
    pipeline = [
    {"$match":{"user":each}},
    {"$group":{"_id":None,"loc":{"$push":"$location"}}}]
    
    result = list(userAttr.aggregate(pipeline))
    if len(result) == 0:
        print each
    else:
        location += result[0]['loc']
        
print len(location)

csvfile = file('VIPUser_map_1.csv','wb')
writer = csv.writer(csvfile)

location.sort()

location_1 = []
map_data = {}
writer.writerow(['loc','num'])
Example #25
doc = Document()
'''The first is the name of the field that is displayed. The second is the name
of the field taken from the incoming collection.'''

doc.add('year', 'year')
doc.add('make', 'make')
group1 = GroupDocument(doc)
group1.addsum('numberOfProducedModels', 1)

doc2 = Document()
doc2.add('year', '_id.year')
doc2.add('numberOfProducedModels', 'numberOfProducedModels')
group2 = GroupDocument(doc2)
group2.addpush('makers', '_id.make')

sort = SortDocument()
limit = LimitDocument(10)
sort.addfield('_id.numberOfProducedModels')

out = OutDocument('outcollpycharmsortedlimit')

agg.append(group1)
agg.append(group2)
agg.append(limit)
agg.append(sort)
agg.append(out)

print agg

print json.dumps(coll.aggregate(agg), indent=4)