Example #1
def merge(mongo_collection=None, drop=True):
    ## merge docs
    if mongo_collection:
        mydisease = mongo_collection
    else:
        client = MongoClient()
        mydisease = client.mydisease.mydisease
    if drop:
        mydisease.drop()

    g = build_id_graph()

    # seed the merged collection with all Disease Ontology (DOID) docs
    db = MongoClient().mydisease.disease_ontology
    d = [{'_id': doc['_id'], 'disease_ontology': doc} for doc in db.find()]
    mydisease.insert_many(d)

    # fill in from other sources
    for db_name in tqdm(set(db_names) - {'disease_ontology'}):
        print(db_name)
        db = MongoClient().mydisease[db_name]
        if db.count() == 0:
            print("Warning: {} is empty".format(db))
        for doc in db.find():
            doids = get_equiv_doid(g, doc['_id'])
            for doid in doids:
                mydisease.update_one({'_id': doid}, {'$push': {db_name: doc}}, upsert=True)
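Note: these examples target the legacy PyMongo 2.x/3.x API. Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0; a minimal sketch of the same empty-collection check on the modern API (the source collection name here is hypothetical):

from pymongo import MongoClient

coll = MongoClient().mydisease["some_source"]  # hypothetical source collection
if coll.count_documents({}) == 0:  # count_documents() replaces count()
    print("Warning: {} is empty".format(coll.name))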
Example #2
 def __init__(self, page, username):
     posts = MongoClient().blog.Aritical.find({
         'username': username
     }).sort('issuing_time', DESCENDING)
     self.total = posts.count()
     self.pages = int(self.total / 20)
     if self.total % 20 != 0:
         self.pages += 1
     if page == 1:
         self.has_prev = False
     else:
         self.has_prev = True
     if page == self.pages:
         self.has_next = False
     else:
         self.has_next = True
     self.next_num = page + 1
     self.page = page
     self.per_page = 20
     self.prev_num = page - 1
     self.current_num = self.total - (20 * (page - 1))
     if self.current_num > 20:
         self.current_num = 20
     self.item = []
     for i in range(self.current_num):
         self.item.append(posts[self.prev_num * 20 + i])
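Note: the page-count arithmetic above is ceiling division (integer division, plus one when there is a remainder). On Python 3 the same computation can be written in one step; a minimal sketch:

import math

total = 41  # hypothetical post count
pages = math.ceil(total / 20)  # 3, same as int(total / 20) plus one for the remainder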
Example #3
def run(host=None, db=None, coll=None, node=None, outgoing="true", incoming="true", undirected="true"):
    # Connect to the mongo collection.
    graph = MongoClient(host)[db][coll]

    outgoing = json.loads(outgoing)
    incoming = json.loads(incoming)
    undirected = json.loads(undirected)

    # Construct the query according to the given options.
    query = {"type": "link"}
    clauses = []
    oid = ObjectId(node)
    if outgoing or incoming:
        dirclauses = []
        orclause = {"$or": [{"undirected": {"$not": {"$exists": 1}}},
                            {"undirected": False}]}
        if outgoing:
            dirclauses.append({"source": oid})

        if incoming:
            dirclauses.append({"target": oid})

        clauses.append({"$and": [orclause, {"$or": dirclauses}]})

    if undirected:
        clauses.append({"$and": [{"undirected": True},
                                 {"$or": [{"source": oid},
                                          {"target": oid}]}]})

    query["$or"] = clauses

    return json.dumps(graph.count(query))
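Note: the final graph.count(query) call relies on Collection.count(), which was removed in PyMongo 4. A sketch of the same filtered count with count_documents(), wrapped in a hypothetical helper:

from pymongo import MongoClient

def count_links(host, db, coll, query):
    # hypothetical helper mirroring run()'s last line on PyMongo >= 3.7
    graph = MongoClient(host)[db][coll]
    return graph.count_documents(query)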
Example #4
class MongoIterator(object):

    def __init__(self, uri, db, collection, skip=0, limit=0, filter=None):
        self._collection = MongoClient(uri)[db][collection]
        self._skip = skip
        self._limit = limit
        self._filter = filter

    def __iter__(self):
        return self.stream()

    def stream(self, conditions=None, projection=None, skip=None, limit=None):
        proj = {k: 1 for k in projection} if projection else {}

        if proj:
            proj.update({'_id': False})  # skip internal id

        return self._collection.find(conditions or self._filter, proj or None, skip=skip or self._skip, limit=limit or self._limit)

    def size(self):
        return self._collection.count() if not self._filter else self._collection.find(self._filter).count()

    @property
    def filter(self):
        return self._filter

    @filter.setter
    def filter(self, conditions):
        self._filter = conditions
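Note: size() above uses both Collection.count() and the chained Cursor.count(), neither of which exists in PyMongo 4. A sketch of a drop-in replacement for that method, assuming the same _filter attribute:

    def size(self):
        # count_documents() requires a filter document; {} counts everything
        return self._collection.count_documents(self._filter or {})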
Example #5
def upload(source_json,
           source,
           db_name=DB_NAME,
           coll_name=VERBS,
           drop=False,
           indices=(VERB, PARADIGM)):
    target = MongoClient(LOCALHOST, PORT)[db_name][coll_name]
    if drop:
        target.drop()
    print('Initially,', target.count(), 'entries')
    count = counter()
    for line in read_json_lines(source_json):
        next(count)
        line[SOURCE] = source
        target.insert(line)
    add_indices(target, indices)
    print('\nCurrently,', target.count(), 'entries')
Example #6
class TvrainData:
    def __init__(self):
        """
        Just load data from Mongo.
        """
        self.sequences = MongoClient(
            os.environ['MONGODB_URL']).tvrain.sequences
        self.collection = MongoClient(
            os.environ['MONGODB_URL']).tvrain.articles
        self.collection.create_index("time")

    def get_random_articles(self, n):
        """Returns N of topics for index.html"""
        articles = self.collection.find().sort("time", 1).skip(
            random.randint(0, self.collection.count())).limit(n)
        return list(articles)

    def get_article_id(self, url):
        """Get id by url"""
        return self.collection.find_one({'url': url})['_id']

    def get_articles_data(self, articles_urls):
        """
        Get data from MongoDB for articles urls
        :param articles_urls: ['article_url', ...]
        :return: list of MongoDB documents
        """
        articles = []
        for url in articles_urls:
            articles.append(self.collection.find_one({'url': url}))
        return articles

    def iterate_articles(self,
                         except_articles,
                         skip=0,
                         limit=None,
                         query=None):
        """
        Iterate through all articles, skipping ids listed in except_articles
        :param except_articles: list of ids
        :return:
        """
        if query is None:
            query = {}
        if limit is None:
            data = self.collection.find(query).skip(skip)
        else:
            data = self.collection.find(query).skip(skip).limit(limit)

        for value in data:
            if value['_id'] not in except_articles:
                yield value

    def get_sequences(self):
        """Return all sequences for train"""
        return list(self.sequences.find().limit(-1))
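Note: a negative limit tells the server to return at most that many documents in a single batch and then close the cursor, so find().limit(-1) in get_sequences() yields at most one document, not "no limit". A sketch that actually fetches all sequences, assuming they fit in memory:

    def get_sequences(self):
        """Return all sequences for training."""
        return list(self.sequences.find())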
Example #7
def merge_one(db_name):
    mydisease = MongoClient().mydisease.mydisease
    g = build_id_graph()
    db = MongoClient().mydisease[db_name]
    if db.count() == 0:
        print("Warning: {} is empty".format(db))
    for doc in db.find():
        doids = get_equiv_doid(g, doc['_id'])
        for doid in doids:
            mydisease.update_one({'_id': doid}, {'$push': {db_name: doc}}, upsert=True)
Example #8
File: G8.py Project: lum4chi/IR
class BigramsCorpus:
    def __init__(self, db, collection):
        self.client = MongoClient()[db][collection]

    def __iter__(self):
        for doc in self.client.find():
            yield [doc['_id']]

    def __len__(self):
        return self.client.count()
Example #9
 def __init__(self, page, show_follow):
     if show_follow == 0:
         posts = MongoClient().blog.Aritical.find().sort(
             'issuing_time', DESCENDING)
         self.total = posts.count()
         self.posts = posts
     if show_follow == 1:
         self.posts = []
         following = MongoClient().blog.User.find_one({
             'username':
             current_user.username
         }).get('following')
         artical = MongoClient().blog.Aritical.find().sort(
             'issuing_time', DESCENDING)
         # following.append([current_user.username, 'date'])
         for i in range(len(following)):
             for x in range(artical.count()):
                 if following[i][0] == artical[x].get('username'):
                     self.posts.append(artical[x])
                     self.posts.sort(key=lambda x: x.get('issuing_time'),
                                     reverse=True)
         self.total = len(self.posts)
     self.pages = int(self.total / 20)
     if self.total % 20 != 0:
         self.pages += 1
     if page == 1:
         self.has_prev = False
     else:
         self.has_prev = True
     if page == self.pages:
         self.has_next = False
     else:
         self.has_next = True
     self.next_num = page + 1
     self.page = page
     self.per_page = 20
     self.prev_num = page - 1
     self.current_num = self.total - (20 * (page - 1))
     if self.current_num > 20:
         self.current_num = 20
     self.item = []
     for i in range(self.current_num):
         self.item.append(self.posts[self.prev_num * 20 + i])
Example #10
def run(host=None,
        db=None,
        coll=None,
        node=None,
        outgoing="true",
        incoming="true",
        undirected="true"):
    # Connect to the mongo collection.
    graph = MongoClient(host)[db][coll]

    outgoing = json.loads(outgoing)
    incoming = json.loads(incoming)
    undirected = json.loads(undirected)

    # Construct the query according to the given options.
    query = {"type": "link"}
    clauses = []
    oid = ObjectId(node)
    if outgoing or incoming:
        dirclauses = []
        orclause = {"$or": [{"undirected": {"$not": {"$exists": 1}}},
                            {"undirected": False}]}
        if outgoing:
            dirclauses.append({"source": oid})

        if incoming:
            dirclauses.append({"target": oid})

        clauses.append({"$and": [orclause, {"$or": dirclauses}]})

    if undirected:
        clauses.append({"$and": [{"undirected": True},
                                 {"$or": [{"source": oid},
                                          {"target": oid}]}]})

    query["$or"] = clauses

    return json.dumps(graph.count(query))
Example #11
class Stat:
    def __init__(self, config_file):
        self.docker = dockerGuest(config_file)
        self.config_file = config_file
        self.collection = MongoClient(os.environ['DB_PORT_27017_TCP_ADDR'],
                                      27017)['test'][self.get_collection()]

    def get_collection(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(dir_path, self.config_file)
        return json.load(open(file_path))["collection"]

    def get_config_file(self):
        return self.config_file

    @staticmethod
    def get_time():
        time = datetime.now()
        return time
        #return time.strftime('%H:%M:%S')

    def data_to_save(self):
        stats = self.docker.get_stats()
        if stats is None:
            return None
        global_stat_dict = {}
        global_stat_dict["time"] = Stat.get_time()
        global_stat_dict["stats"] = stats
        return global_stat_dict

    def save(self, cap=60):
        new_data = self.data_to_save()
        if self.collection.count() >= cap:
            top_doc_time = min(doc['time'] for doc in self.collection.find())
            self.collection.delete_one({'time': top_doc_time})
        self.collection.insert_one(new_data)
        logger.info("Saved in DB...")

    def save_to_db(self):
        data = self.data_to_save()

        if data is not None:
            if self.is_db_full():
                self.make_space_db()
            logger.info('DB Save')
            self.collection.insert_one(data)

    def make_space_db(self):
        logger.info('Making space')
        self.collection.delete_one({'_id': self.collection.find()[0]['_id']})

    def is_db_full(self):
        return self.collection.find({}).count() >= 60
Example #12
class GraphData(object):
    _instance = None
    _instance_lock = Lock()

    host = '101.132.40.25'
    port = 27017

    def __init__(self):
        self.table = MongoClient(host=GraphData.host,
                                 port=GraphData.port).get_database(
                                     'judging').get_collection('graph')

    def __new__(cls, *args, **kwargs):
        """
        singleton

        > Multiple `GraphData()` usage will return the same instance
        :param args:
        :param kwargs:
        :return:
        """
        if GraphData._instance is None:
            with GraphData._instance_lock:
                if GraphData._instance is None:
                    GraphData._instance = object.__new__(cls)
        return GraphData._instance

    def exists(self, graph_name: str) -> bool:
        return self.table.count({'_id': graph_name}) > 0

    def save(self, graph: dict) -> bool:
        """
        **Attention: this method will override the exist graph**
        :param graph:
        :return:
        """
        self.table.save({'_id': graph['名称'], **graph})
        return True

    def fetch(self, graph_name: str) -> dict:
        return self.table.find_one({'_id': graph_name})

    def get_graph_list(self) -> List[str]:
        graph_list = []
        for i in self.table.find({}, {'名称': 1, '_id': 0}):
            graph_list.append(i['名称'])
        return graph_list

    def remove_graph(self, graph_name: str):
        self.table.remove({'_id': graph_name})  # remove by _id
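Note: Collection.save() and Collection.remove() were removed in PyMongo 4. Sketches of save and remove_graph on the modern API, keeping the same _id scheme:

    def save(self, graph: dict) -> bool:
        # replace_one() with upsert=True reproduces save()'s insert-or-overwrite behavior
        self.table.replace_one({'_id': graph['名称']},
                               {'_id': graph['名称'], **graph},
                               upsert=True)
        return True

    def remove_graph(self, graph_name: str):
        self.table.delete_one({'_id': graph_name})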
Example #13
class TvrainData:
    def __init__(self):
        """
        Just load data from Mongo.
        """
        self.sequences = MongoClient(os.environ['MONGODB_URL']).tvrain.sequences
        self.collection = MongoClient(os.environ['MONGODB_URL']).tvrain.articles
        self.collection.create_index("time")

    def get_random_articles(self, n):
        """Returns N of topics for index.html"""
        articles = self.collection.find().sort("time", 1).skip(random.randint(0, self.collection.count())).limit(n)
        return list(articles)

    def get_article_id(self, url):
        """Get id by url"""
        return self.collection.find_one({'url': url})['_id']

    def get_articles_data(self, articles_urls):
        """
        Get data from MongoDB for articles urls
        :param articles_urls: ['article_url', ...]
        :return: list of MongoDB documents
        """
        articles = []
        for url in articles_urls:
            articles.append(self.collection.find_one({'url': url}))
        return articles

    def iterate_articles(self, except_articles, skip=0, limit=None, query=None):
        """
        Iterate through all articles, skipping ids listed in except_articles
        :param except_articles: list of ids
        :return:
        """
        if query is None:
            query = {}
        if limit is None:
            data = self.collection.find(query).skip(skip)
        else:
            data = self.collection.find(query).skip(skip).limit(limit)

        for value in data:
            if value['_id'] not in except_articles:
                yield value

    def get_sequences(self):
        """Return all sequences for train"""
        return list(self.sequences.find().limit(-1))
Example #14
def remove_morphologically_abnormal_verbs():
    abnormal_count = 0
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count())
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            if not (verb.endswith('ω') or verb.endswith('ώ')
                    or verb.endswith('αι')):
                coll.delete_one({VERB: verb})
                abnormal_count += 1
    print("\nRemoved {} abnormal verbs".format(abnormal_count))
Example #15
 def setUpDb(self, host, port, db, collection):
     try:
         mongo_host = os.environ.get(
             host, os.environ.get("MONGO_HOST", "localhost"))
         mongo_port = int(os.environ.get(port, 27017))  # env values are strings
         mongo_database = os.environ.get(
             db, "twitter_database")
         client = MongoClient(mongo_host, mongo_port)[
             mongo_database][collection]
         if collection == "twitter_collection-"+self.owner and client.count() == 0:
             raise Exception(
                 "There is no data in the source database: " + collection)
         return client
     except Exception as err:
         print("Error when connecting to SOURCE database: " + str(err))
         exit(2)
Example #16
def print_verbs(fieldname, fltr, func=None, dbname=DB_NAME, collname=VERBS):
    match = MongoClient(LOCALHOST, PORT)[dbname][collname].find(fltr)
    total = match.count()
    if func:
        res = list()
        count = counter(total)
        for entry in match:
            next(count)
            if func(entry):
                res.append(entry[fieldname])
        print("\n{} matching items".format(len(res)))
    else:
        print(total, "matching entries")
        res = [entry[fieldname] for entry in match]
    for item in res:
        print(item)
Example #17
class TextIO:
    def __init__(self):
        self.db = MongoClient('localhost', 20000).get_database('chinese').get_collection('train')

    def get_mongo_size(self):
        size = self.db.count()
        # print("size: %d" % size)
        return size

    def get_text_from_mongo(self, skip=0, limit=1, isRandom=True):
        size = self.get_mongo_size()
        if isRandom:
            skip = random.randint(0, size - limit)

        cursor = self.db.find().skip(skip).limit(limit)
        for doc in cursor:
            yield doc['text']
Example #18
def collect_duplicates():
    visited = set()
    duplicates = set()
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count())
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            if verb in visited:
                duplicates.add(verb)
            else:
                visited.add(verb)
    print("\nDumping {} duplicates".format(len(duplicates)))
    dump_utf_json(sorted(list(duplicates)), DUPLICATES_JSON)
Example #19
class Mongodb:
    def __init__(self):
        self.collection = MongoClient()['db_name']['collection_name']

    def count(self):
        return self.collection.count()

    def find_page(self, pager, query=None):
        if pager.is_pre_half:
            result = list(
                self.collection.find(query).skip(pager.offset).limit(
                    pager.page_size))
        else:
            result = list(self.collection.find(query) \
                          .sort([('_id', -1)]).skip(0 if pager.is_last else pager.residue) \
                          .limit(min(pager.page_size, pager.residue if pager.is_last else pager.page_size)))[::-1]
        return result
Example #20
def status(client, db, cell, example):
    '''
    \b
    - list cells, num_entries
    - verbose: find_one() in each cell but truncate sequence field before print
    - include .zoo metadata in the report in the future

    Example:

    \b
    zoo status --db diff --cell mock --example
    '''
    c = MongoClient(client)[db][cell]
    print(c.count(), 'documents.\n')
    if example:
        print('Example:')
        print(json.dumps(c.find_one(), indent=2))
        print()
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("twitterUser", type=str, help="Twitter ID")
    parser.add_argument("-l", "--limit", type=int,
                        help="Limit of tweet that have to be scraped, tweet are retrieved in batches of 20, default: 20", default=20)
    args = parser.parse_args()
    mongo_host = os.environ.get("MONGO_HOST", "localhost")
    mongo_port = os.environ.get("MONGO_PORT", 27017)
    mongo_database = os.environ.get(
        "MONGO_TWITTER_DATABASE", "twitter_database")
    mongo_collection = "twitter_collection-" + args.twitterUser
    client = MongoClient(mongo_host, mongo_port)[
        mongo_database][mongo_collection]
    fetch({
        "twitterUser": args.twitterUser,
        "limit": args.limit,
        "mongoClient": client
    })  # Possibly return the collection of tweet in python format readable
    print("Number of tweets inserted in " +
          mongo_collection + ": " + str(client.count()))
Example #22
class MongoKVStorage(KVStorage):
    def __init__(self, config):
        super().__init__(config)

        mongo_host = config['host']
        mongo_port = config['port']
        mongo_db_name = config['db']
        mongo_collection = config['collection']

        self._collection = MongoClient(host=mongo_host, port=mongo_port)[mongo_db_name][mongo_collection]

    def get(self, key):
        found_val = self._collection.find_one({'key': key})

        return found_val

    def set(self, key, value):
        self._collection.update_one({'key': key}, {'$set': value}, upsert=True)

    def exists(self, key):
        return self._collection.count({'key': key}) > 0
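Note: on PyMongo 4.x the count() call in exists() is gone; count_documents() accepts a limit option, which lets the check stop at the first match:

    def exists(self, key):
        return self._collection.count_documents({'key': key}, limit=1) > 0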
Example #23
def main():
    model = Doc2Vec.load(Settings.MODEL_PATH + "doc2vec.model")
    """test_similarity([("right", "wrong"),
                     ("refresh", "cache")], model)"""

    reports_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.DOC2VEC_REPORTS_DATABASE][Settings.REPORTS_COLLECTION]
    duplicate_reports_collection = MongoClient(
        Settings.MONGO_CONNECTION_STRING)[Settings.DOC2VEC_REPORTS_DATABASE][
            Settings.DUPLICATE_REPORTS_COLLECTION]

    # get a random target text that has a duplicate
    index = random.randint(0, duplicate_reports_collection.count() - 1)
    txt = duplicate_reports_collection.find_one({"reId": index})["dups"][0]
    txt = normalize_text(txt)
    print "Target Text: %s" % txt

    vec = model.infer_vector(txt)

    # print most similar documents
    for reId, p in model.docvecs.most_similar([vec], topn=10):
        print "%0.3f: \"%s\"" % (p, reports_collection.find_one({"reId": reId
                                                                 })["text"])
Example #24
import json, sys
import progressbar
from collections import defaultdict
from pymongo import MongoClient
from gazouilloire.web.export import format_csv

with open('config.json') as confile:
    conf = json.loads(confile.read())

db = MongoClient(conf['mongo']['host'],
                 conf['mongo']['port'])[conf['mongo']['db']]['tweets']

langs = defaultdict(int)
query = {}
print "Counting matching results..."
count = db.count(query)

print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"lang": 1, "_id": 0})):
    l = t.get("lang", "")
    langs[l] += 1

print "Sorting and storing csv data..."
with open("langs.csv", "w") as f:
    print >> f, "langs,count"
    bar = progressbar.ProgressBar(max_value=len(langs))
    for l, ct in bar(sorted(langs.items(), key=lambda x: -x[1])):
        print >> f, '%s,%s' % (l, ct)
Example #25
if len(sys.argv) == 2:
    if '{' in sys.argv[1]:
        try:
            query = eval(sys.argv[1])
            if only_selected:
                query = {"$and": [query, {SELECTED_FIELD: True}]}
        except Exception as e:
            sys.stderr.write("WARNING: query wrongly formatted: %s\n" % sys.argv[1])
            sys.exit("%s: %s\n" % (type(e), e))
    elif os.path.exists(sys.argv[1]):
        with open(sys.argv[1]) as f:
            ids = sorted([t.get("id", t.get("_id")) for t in csv.DictReader(f)])
        if include_threads:
            ids = get_thread_ids_from_ids(ids, mongodb)
        query = {"_id": {"$in": ids}}
    else:
        query["text"] = re.compile(sys.argv[1].replace(' ', '\s+'), re.I)
elif len(sys.argv) > 2:
    query["$or"] = []
    for arg in sys.argv[1:]:
        query["$or"].append({"text": re.compile(arg.replace(' ', '\s+'), re.I)})

count = mongodb.count(query)
iterator = yield_csv(mongodb.find(query, sort=[("timestamp", 1)], limit=count), extra_fields=EXTRA_FIELDS)
if verbose:
    import progressbar
    bar = progressbar.ProgressBar(max_value=count)
    iterator = bar(iterator)
for t in iterator:
    print t
Example #26
    parser.add_argument('--fastrun', dest='fastrun', action='store_true')
    parser.add_argument('--no-fastrun', dest='fastrun', action='store_false')
    parser.set_defaults(fastrun=True)
    args = parser.parse_args()
    log_dir = args.log_dir if args.log_dir else "./logs"
    run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    __metadata__['run_id'] = run_id
    taxon = args.taxon
    fast_run = args.fastrun
    coll = MongoClient(args.mongo_uri)[args.mongo_db]["mygene"]

    # get metadata about sources
    # this should be stored in the same db under the collection: mygene_sources
    metadata_coll = MongoClient(
        args.mongo_uri)[args.mongo_db]["mygene_sources"]
    assert metadata_coll.count() == 1
    metadata = metadata_coll.find_one()

    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        log_name=log_name,
                                        header=json.dumps(__metadata__),
                                        logger_name='gene{}'.format(taxon))

    if "microbe" in taxon:
        microbe_taxa = get_all_taxa()
        taxon = taxon.replace("microbe", ','.join(map(str, microbe_taxa)))

    for taxon1 in taxon.split(","):
Example #27
from __future__ import division
from pymongo import MongoClient
from settings import Settings
import operator

# Initialize all collections
userscore_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TOPICS_DATABASE][Settings.USERSCORE_COLLECTION]
businessscore_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TOPICS_DATABASE][Settings.BUSINESSSCORE_COLLECTION]
reco_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.TOPICS_DATABASE][Settings.RECOMMENDATION_COLLECTION]


# Go through each user and compute the top 20 businesses for each one
print userscore_collection.count()
print businessscore_collection.count()
userScoreCollection = userscore_collection.find()


bulk = reco_collection.initialize_unordered_bulk_op()
counter=0
bulkCounter = 0

for user in userScoreCollection:
    
    userTopics = user["userscore"]
    #print "length of user topics " + str(len(userTopics))
    
    ratings= { }
    businessScoreCollection = businessscore_collection.find()
    for business in businessScoreCollection:
        businessTopics = business["businessscore"]
        #print "length of Business topics " + str(len(businessTopics))
Example #28
                    type=str)
parser.add_argument('protdbcoll_name',
                    help='MongoDB ProtDB Collection name',
                    type=str)
parser.add_argument('--host', help='MongoDB host (mongod or mongos)', type=str)
parser.add_argument('--port', help='MongoDB port (mongod or mongos)', type=int)
args = parser.parse_args()

if args.host:
    host = args.host
else:
    host = 'localhost'

if args.port:
    port = args.port
else:
    port = 27017

ProtColl = MongoClient(host, port)[args.protdb_name][args.protdbcoll_name]
half = int(ProtColl.count() / 2)

#half = 82817736 # in indexDB / ComPIL
f = sys.stdin
for protID in f:
    protID = int(protID)
    print(protID)
    if protID <= half:
        print(protID + half)
    elif protID > half:
        print(protID - half)
Example #29
import json
from pymongo import MongoClient

try:
    with open(os.path.join(os.path.dirname(__file__), '..', 'config.json')) as confile:
         conf = json.loads(confile.read())
except Exception as e:
    sys.stderr.write("ERROR: Impossible to read config.json: %s %s\n" % (type(e), e))
    exit(1)

try:
    db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['links']
except Exception as e:
    sys.stderr.write("ERROR: Could not initiate connection to MongoDB: %s %s\n" % (type(e), e))
    exit(1)

verbose = True
if len(sys.argv) > 1 and "--quiet" in sys.argv:
    sys.argv.remove("--quiet")
    verbose = False

count = db.count()
iterator = db.find()
if verbose:
    import progressbar
    bar = progressbar.ProgressBar(max_value=count)
    iterator = bar(iterator)
print "url\tresolved"
for t in iterator:
    print ('%s\t%s' % (t["_id"], t["real"])).encode('utf-8')
Example #30
    def get_data(table="cmnt"):
        '''
            table: table (collection)
        '''
        limit = request.args.get("limit", 10, type=int)
        page = request.args.get("page", 1, type=int)
        _db = MongoClient().safe_protocol[table]
        # data = _db.find().sort("time", -1).skip(limit * (page - 1)).limit(limit)
        if table == "alert":
            data_list = []
            alerts = _db.find().sort("time",
                                     -1).skip(limit * (page - 1)).limit(limit)
            total = _db.count()
            for alert in alerts:
                alert = {
                    'time': alert.get('time'),
                    'protocol_type': alert.get('type'),
                    'message': alert.get('message')
                }
                data_list.append(alert)

            protocol_type = request.args.get("type")
            if protocol_type:
                data_list = list(
                    filter(lambda x: x['protocol_type'] == protocol_type,
                           data_list))
                total = len(data_list)
            return {'data': data_list, 'total': total}

        elif table == "user":
            data_list = []
            users = _db.find().sort("create_time",
                                    -1).skip(limit * (page - 1)).limit(limit)
            total = _db.count()
            for user in users:
                data_list.append({
                    'user_id': user.get('_id'),
                    'username': user.get('name'),
                    'level': user.get('level'),
                    'create_time': user.get('create_time')
                })

            return {'data': data_list, 'total': total}

        elif table == "oper":
            data_list = []
            opers = _db.find().sort("time",
                                    -1).skip(limit * (page - 1)).limit(limit)
            total = _db.count()
            for oper in opers:
                oper = {
                    'user_id': oper.get('user_id'),
                    'username': oper.get('user_name'),
                    'time': oper.get('time'),
                    'protocol_type': oper.get('protocol_type'),
                    'oper': oper.get('oper')
                }
                data_list.append(oper)
            return {'data': data_list, 'total': total}

        elif table == "cmnt":
            data_list = []
            cmnts = _db.find().sort("time",
                                    -1).skip(limit * (page - 1)).limit(limit)
            total = _db.count()
            for cmnt in cmnts:
                cmnt = {
                    'time': cmnt.get('time'),
                    'buffer': cmnt.get('buffer'),
                    'ip': cmnt.get('ip')
                }
                data_list.append(cmnt)
            return {'data': data_list, 'total': total}
Example #31
class Stat:
    '''
    This class is used for:
        1. Building the stat list corresponding to a specific timestamp
        2. Saving it in MongoDB
    '''
    
    def __init__(self,config_file):
        self.docker = dockerGuest(config_file)
        self.config_file = config_file
        self.collection = MongoClient()['test'][self.get_collection()] # connecting to mongodb
        

    def get_collection(self):
        '''
        Get the mongodb collection (table) name from the config file.
        (This method might be removed later.)
        '''
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(dir_path,self.config_file)
        return json.load(open(file_path))["collection"]

    
    def get_config_file(self):
        return self.config_file
        
    @staticmethod
    def get_time():
        '''Getting current timestamp
      
           Storing python datetime object in mongodb.
           This might be changed later.
        '''        
        time = datetime.now()
        return time
    
    def data_to_save(self):
        '''
           Formats data to be saved in mongodb.
           JSON:
                {
                   time: timestamp,
                   stats: [{container1 stat },{ container2 stat } ...]
                }
        '''
        stats = self.docker.get_stats()
        if stats is None:  # no stats means there are no containers; return None
            return None
        global_stat_dict = {}
        global_stat_dict["time"] = Stat.get_time()
        global_stat_dict["stats"] = stats
        return global_stat_dict

    def save(self, cap=60):
        ''' Saving in DB '''
        new_doc = self.data_to_save()
        if self.collection.count() >= cap:
            # If the cap is reached, delete the document with the oldest
            # timestamp before inserting the new one. We cannot simply delete
            # "the first" document because insertion order in a MongoDB
            # collection is not guaranteed.
            top_doc_time = min(doc['time'] for doc in self.collection.find())  # oldest timestamp; simple since datetime objects are stored
            self.collection.delete_one({'time': top_doc_time})  # delete oldest timestamp
            logger.info("Deleted timestamp is...{}".format(top_doc_time))
        self.collection.insert_one(new_doc)  # insert new data
        logger.info("Saved in DB...{}".format(new_doc["time"]))


    def save_data(self):
        ''' method not used '''
        data = self.data_to_save()
        if data is not None:
            if self.is_db_full():
                self.make_space_db()
            self.collection.insert_one(data)
            logger.info('Saved in DB...')
Example #32
import unittest.mock
Example #33
                           'config.json')) as confile:
        conf = json.loads(confile.read())
except Exception as e:
    sys.stderr.write("ERROR: Impossible to read config.json: %s %s\n" %
                     (type(e), e))
    exit(1)

try:
    db = MongoClient(conf['mongo']['host'],
                     conf['mongo']['port'])[conf['mongo']['db']]['links']
except Exception as e:
    sys.stderr.write(
        "ERROR: Could not initiate connection to MongoDB: %s %s\n" %
        (type(e), e))
    exit(1)

verbose = True
if len(sys.argv) > 1 and "--quiet" in sys.argv:
    sys.argv.remove("--quiet")
    verbose = False

count = db.count()
iterator = db.find()
if verbose:
    import progressbar
    bar = progressbar.ProgressBar(max_value=count)
    iterator = bar(iterator)
print "url\tresolved"
for t in iterator:
    print('%s\t%s' % (t["_id"], t["real"])).encode('utf-8')
Example #34
class MongodbUtil(object):
    """
    - MYSQL_PASSWD must be set in .bashrc or .bash_profile.
    """

    def __init__(self, mongo_url, db_name, collection_name, auto_connect=False):
        """
        :param mongo_url: host, port, username, password, auth db
        :param db_name: database name
        :param collection_name: collection name
        :param auto_connect: default do not connect for multiprocessing (http://api.mongodb.com/python/current/faq.html#using-pymongo-with-multiprocessing)
        """
        self.mongo_url = mongo_url
        self.db_name = db_name
        self.collection_name = collection_name
        self.auto_connect = auto_connect
        self.collection = MongoClient(mongo_url, socketKeepAlive=True, connect=auto_connect)[db_name][collection_name]

    def __repr__(self):
        return '%s (db_name:%s, collection_name:%s, auto_connect:%s)' % (
            StringUtil.mask_passwd_in_url(self.mongo_url), self.db_name, self.collection_name, self.auto_connect)

    def __str__(self):
        return self.__repr__()

    def find(self, query=None, sort=None, limit=0):
        if query is None:
            query = {}
        if sort is None:
            sort = [('_id', ASCENDING)]

        for row in self.collection.find(query, no_cursor_timeout=True).sort(sort).limit(limit):
            yield row

    def count(self, query=None):
        if query is None:
            query = {}
        return self.collection.count(query)  # no_cursor_timeout is a find() option, not valid for count()

    def find_one(self, query: dict) -> dict:
        # find_one returns a single document (or None), so limit() does not apply
        return self.collection.find_one(query)

    def create_index(self, field_list=None, unique=False):
        if field_list is None:
            field_list = []
        for field in field_list:
            self.collection.create_index([(field, ASCENDING)], background=True, unique=unique)
        return

    def insert(self, row: dict):
        return self.collection.insert_one(row)

    def update_one(self, where_query: dict, update_content: dict, upsert=False):
        return self.collection.update_one(
            where_query,
            update_content,
            upsert=upsert
        )

    def update(self, where_query: dict, update_content: dict, upsert=False):
        return self.collection.update_many(
            where_query,
            update_content,
            upsert=upsert
        )

    def save(self, row):
        return self.collection.save(row)

    def delete(self, where_query: dict):
        result = self.collection.delete_one(where_query)
        if result:
            return result.deleted_count
        return 0

    def drop(self):
        return self.collection.drop()
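Note: MongodbUtil also leans on APIs that are gone in PyMongo 4 (count(), save(), and the socketKeepAlive client option). Sketches of the two affected methods for the modern API:

    def count(self, query=None):
        return self.collection.count_documents(query or {})

    def save(self, row):
        # save() upserted on _id; replace_one() reproduces that when _id is present
        if '_id' in row:
            return self.collection.replace_one({'_id': row['_id']}, row, upsert=True)
        return self.collection.insert_one(row)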
Example #35
	reviews = corpus_collection.find({'business_id': rest['_id']})
	if reviews.count() > 9:
		print i
		list.append((i,reviews.count()))
		f.write (str(i))
		f.write('\t')
		f.write(str(reviews.count()))
		f.write('\n')

print len(list)
'''	
a = [4.5,4.3,5]
b = [3.1,4.3,5]
rest_rating = []
print 'Cor: ' 
pr =  pearsonr(a,b)[0]
pr = round(pr*100,2)
print '%r %%'% pr

print restaurant_cursor.count()
print corpus_collection.count()

for i in range(2):
	print i
	rest =restaurant_cursor.__getitem__(i)
	reviews = corpus_collection.find({'business_id': rest['_id']})
#print (reviews.count())
	#print ("Restaurant : %s" % rest['name'])
	#print ("Restaurant stars: %s" % rest['stars'])
	rest_rating.append(rest['stars'])
print (rest_rating)
Example #36
import json, sys
import progressbar
from pymongo import MongoClient
from gazouilloire.web.export import format_csv

with open('config.json') as confile:
    conf = json.loads(confile.read())

db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = {}
query = {}
#query["langs"] = "fr"
print "Counting matching results..."
count = db.count(query)

print "Querying and hashing results..."
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"links": 1, "proper_links": 1})):
    for l in t.get("proper_links", t["links"]):
        if l not in urls:
            urls[l] = 0
        urls[l] += 1

print "Sorting and storing csv data..."
with open("shared_urls.csv", "w") as f:
    print >> f, "url,shares"
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key = lambda x: -x[1])):
        print >> f, '%s,%s' % (format_csv(link), shares)
Example #37
    elif os.path.exists(sys.argv[1]):
        with open(sys.argv[1]) as f:
            ids = sorted(
                [t.get("id", t.get("_id")) for t in csv.DictReader(f)])
        if include_threads:
            ids = get_thread_ids_from_ids(ids, mongodb)
        query = {"_id": {"$in": ids}}
    else:
        query["text"] = re.compile(sys.argv[1].replace(' ', '\s+'), re.I)
elif len(sys.argv) > 2:
    query["$or"] = []
    for arg in sys.argv[1:]:
        query["$or"].append(
            {"text": re.compile(arg.replace(' ', '\s+'), re.I)})

if limit:
    total = limit
elif count and verbose:
    total = mongodb.count(query)
else:
    total = mongodb.count()
iterator = yield_csv(mongodb.find(query, sort=[("timestamp", 1)], limit=total),
                     extra_fields=EXTRA_FIELDS)

if verbose:
    import progressbar
    bar = progressbar.ProgressBar(max_value=total)
    iterator = bar(iterator)
for t in iterator:
    print t
Example #38
    with open(os.path.join(os.path.dirname(__file__), '..',
                           'config.json')) as confile:
        conf = json.loads(confile.read())
except Exception as e:
    sys.stderr.write("ERROR: Impossible to read config.json: %s %s" %
                     (type(e), e))
    exit(1)

try:
    mongodb = MongoClient(conf['mongo']['host'],
                          conf['mongo']['port'])[conf['mongo']['db']]['tweets']
except Exception as e:
    sys.stderr.write("ERROR: Could not initiate connection to MongoDB: %s %s" %
                     (type(e), e))
    exit(1)

verbose = True
if len(sys.argv) > 1 and "--quiet" in sys.argv:
    sys.argv.remove("--quiet")
    verbose = False

print "id"
iterator = mongodb.find(projection=["_id"])
if verbose:
    import progressbar
    count = mongodb.count()
    bar = progressbar.ProgressBar(max_value=count)
    iterator = bar(iterator)
for t in iterator:
    print t["_id"]
Example #39
from pymongo import MongoClient
import time

while True:
    client = MongoClient()['spider']['1024']
    print(client.count())
    time.sleep(2)
Example #40
import os
import json
from random import randint
from flask import Flask, jsonify, request
from fighter import Arena, Fighter
from pymongo import MongoClient

app = Flask(__name__)
arena = Arena(8)

repos = MongoClient().githunt.repos
repos_count = repos.count()

fights = MongoClient().gitfighter.fights

def get_full_name(repo):
    return '%s/%s' % (repo['owner']['login'], repo['name'])

@app.route("/arena")
def arena_json():
    print(arena.json)
    return jsonify(arena.json)

@app.route("/fight", methods=['POST'])
def fight():
    fighters_stats = request.json
    print fighters_stats
    arena.set_fighters(fighters_stats)
    arena.start()
    fight = {"log": arena.log}
    response = jsonify(fight)