def combine_files():
    """
    Combine all reviews of every restaurant and dump them as JSON lines.

    For each document in ``yelp.restaurants`` the ``text`` of every
    ``yelp.review2`` document with the same ``business_id`` is concatenated
    (without separator).  When the combined text is non-empty, a
    ``Business(business_id, text, categories)`` object is serialised with
    ``json.dumps(vars(obj))`` and written as one line of
    ``RestaurantCombinedReviews.json``.

    :return: None
    """
    # Both collections live in the same database, so one client is enough.
    yelp_db = MongoClient('localhost', 29017).yelp
    restaurant_collection = yelp_db.restaurants
    review2_collection = yelp_db.review2

    line = 0
    # The context manager flushes and closes the file even when a query or
    # the serialisation raises half-way through the export.
    with open("RestaurantCombinedReviews.json", 'w') as output_file:
        for entry in restaurant_collection.find():
            business_id = entry["business_id"]
            category = entry["categories"]
            reviews = review2_collection.find({"business_id": business_id})
            # join() avoids the quadratic cost of repeated "+" concatenation.
            review_text = "".join(review["text"] for review in reviews)
            if not review_text:
                continue
            line += 1
            obj = Business(business_id, review_text, category)
            output_file.write(json.dumps(vars(obj)) + "\n")
            # Report progress only when the counter actually advanced, so the
            # same number is not printed again for restaurants without reviews.
            if line % 100 == 0:
                print(line)
Exemple #2
0
def main():
    """
    Count word bigrams and trigrams of a text file in MongoDB and dump them.

    The input file path is read from ``sys.argv[1]`` (UTF-8).  Every pair of
    consecutive words of a line increments the ``grade`` of the matching
    document in ``db.words``; every triple of consecutive words does the same
    in ``db.words2``.  N-grams never span two lines.  Afterwards the full
    content of both collections is written to ``output.txt``.
    """
    database = MongoClient().db
    words = database.words
    words2 = database.words2

    with open(sys.argv[1], encoding='utf-8') as source:
        for line in source:
            prev = None
            pprev = None
            for word in line.split():
                if prev is not None:
                    # A single upsert replaces the former find_one + update /
                    # insert sequence: it increments an existing counter or
                    # creates the document with grade 1, in one atomic call.
                    words.update_one(
                        {"first": prev, "second": word,
                         "grade": {"$exists": True}},
                        {"$inc": {"grade": 1}},
                        upsert=True)
                    if pprev is not None:
                        words2.update_one(
                            {"first": pprev, "second": prev, "third": word,
                             "grade": {"$exists": True}},
                            {"$inc": {"grade": 1}},
                            upsert=True)
                    pprev = prev
                prev = word

    with open("output.txt", "w", encoding='utf-8') as output:
        for i in words.find():
            output.write("first: " + str(i["first"]) +
                         "  second: " + str(i["second"]) +
                         "  grade: " + str(i["grade"]) + "\n")
        for i in words2.find():
            output.write("first: " + str(i["first"]) +
                         "  second: " + str(i["second"]) +
                         "  third: " + str(i["third"]) +
                         "  grade: " + str(i["grade"]) + "\n")
Exemple #3
0
class MongoCorpus(SimpleCorpus):
    """
        Corpus wrapper around a MongoDB collection.
        Subset corpus by setting a query. If "aggregate" is used,
        this will override "query". In this case use "$match" in
        aggregation method.
    """
    def __init__(self, db, collection, aggregate=None, query=None):
        """
            :param db: name of the database
            :param collection: name of the collection inside ``db``
            :param aggregate: optional aggregation pipeline (list of stages);
                              when non-empty it is used instead of ``query``
            :param query: optional filter document passed to ``find``
        """
        self.client = MongoClient()[db][collection]
        # None defaults avoid sharing one mutable list/dict between instances.
        self.aggregate_arg = [] if aggregate is None else aggregate
        self.find_arg = {} if query is None else query

    def __iter__(self):
        """
            Yield every matching document.
            _obj_ is a dictionary: you can filter the right
            key to feed only docs text.
        """
        if self.aggregate_arg:
            cursor = self.client.aggregate(self.aggregate_arg)
        else:
            cursor = self.client.find(self.find_arg, no_cursor_timeout=True)
        try:
            for doc in cursor:
                yield doc
        finally:
            # Also runs when the consumer stops early or an error occurs, so
            # the no-timeout cursor is not left open on the server.
            cursor.close()

    def __len__(self):
        """
            Number of documents the iteration would yield.
        """
        if not self.aggregate_arg:
            return self.client.find(self.find_arg).count()
        counting_pipeline = self.aggregate_arg + [
            {"$group": {"_id": "null", "count": {"$sum": 1}}}]
        # An empty result set produces no group document at all.
        summary = next(self.client.aggregate(counting_pipeline), None)
        return summary['count'] if summary else 0
def pull_mongo_data(db, col):
    """
    Fetch up to ``stories_per_agency`` stories for every source in a collection.

    Relies on the module-level names ``stories_per_agency``, ``predict`` and
    ``alignment_6``.

    :param db: name of the MongoDB database
    :param col: name of the collection inside ``db``
    :return: list of story documents; when ``predict == "aln6"`` only stories
             whose ``source`` is in ``alignment_6`` are kept
    """
    collection = MongoClient()[db][col]

    # Let the server compute the unique sources instead of downloading every
    # document just to deduplicate one field on the client.
    all_sources = collection.distinct("source")

    stories = []
    for source in all_sources:
        # Query the DB for all results that match the given source
        db_results = collection.find({"source": source}).limit(stories_per_agency)
        # NOTE(review): Cursor.count() ignores limit() unless asked otherwise,
        # so this presumably prints the total per source - confirm intent.
        print(source + " " + str(db_results.count()))
        stories.extend(db_results)

    # Filter results based on alignment if using 'aln_6'
    if predict == "aln6":
        stories = [story for story in stories if story["source"] in alignment_6]

    return stories
def get_restaurant_reviews():
    """
    Export every review that belongs to a restaurant as JSON lines.

    For each ``business_id`` in ``yelp.restaurants`` all matching documents of
    ``yelp.review2`` are read; every review with a non-empty ``text`` is
    wrapped in ``Review(business_id, text, stars)`` and written as one line of
    ``RestaurantReviews.json``.  (The original author expected 49876 reviews.)

    :return: None
    """
    yelp_db = MongoClient('localhost', 29017).yelp
    review_collection = yelp_db.review2
    restaurant_cursor = yelp_db.restaurants.find({}, {"business_id": 1, "_id": 0})

    line = 0
    # The context manager guarantees the export is flushed and closed; the
    # file was previously left open when the function returned.
    with open("RestaurantReviews.json", "w") as output_file:
        for res_entry in restaurant_cursor:
            reviews = review_collection.find({"business_id": res_entry["business_id"]})
            for review_entry in reviews:
                business_id = review_entry["business_id"]
                text = review_entry["text"]
                stars = review_entry["stars"]
                if not text:
                    continue
                line += 1
                obj = Review(business_id, text, stars)
                output_file.write(json.dumps(vars(obj)) + "\n")
                # Print progress only when the counter advanced, so the same
                # value is not repeated for reviews without text.
                if line % 100 == 0:
                    print(line)
class TestBucketIntegration(unittest.TestCase):
    """Integration tests that store records through a Bucket and inspect MongoDB directly."""

    def setUp(self):
        self.db = database.Database(HOST, PORT, DB_NAME)
        self.bucket = bucket.Bucket(self.db, BUCKET)
        client = MongoClient(HOST, PORT)
        self.mongo_collection = client[DB_NAME][BUCKET]

    def setup__timestamp_data(self):
        """Save three documents, deliberately out of chronological order."""
        fixtures = (
            ('last', d_tz(2013, 3, 1), d_tz(2013, 2, 25)),
            ('first', d_tz(2013, 1, 1), d_tz(2012, 12, 31)),
            ('second', d_tz(2013, 2, 1), d_tz(2013, 1, 28)),
        )
        for doc_id, timestamp, week_start in fixtures:
            self.mongo_collection.save({
                "_id": doc_id,
                "_timestamp": timestamp,
                "_week_start_at": week_start
            })

    def tearDown(self):
        # Start every test from an empty collection.
        self.mongo_collection.drop()

    def test_that_records_get_sent_to_mongo_correctly(self):
        self.bucket.store(Record({'foo': 'bar'}))

        stored = list(self.mongo_collection.find())
        assert_that(stored, only_contains(has_entries({"foo": "bar"})))

    def test_that_a_list_of_records_get_sent_to_mongo_correctly(self):
        names = ('Groucho', 'Harpo', 'Chico')
        my_records = [Record({'name': name}) for name in names]

        self.bucket.store(my_records)

        stored = list(self.mongo_collection.find())
        assert_that(stored, only_contains(
            has_entries({'name': 'Groucho'}),
            has_entries({'name': 'Harpo'}),
            has_entries({'name': 'Chico'})
        ))

    def test_period_queries_get_sorted_by__week_start_at(self):
        self.setup__timestamp_data()
        weekly = Query.create(period="week")
        result = weekly.execute(self.bucket.repository)
        # Expected in ascending order of the week start.
        assert_that(result.data(), contains(
            has_entry('_start_at', d_tz(2012, 12, 31)),
            has_entry('_start_at', d_tz(2013, 1, 28)),
            has_entry('_start_at', d_tz(2013, 2, 25))
        ))
Exemple #7
0
class MongoStore(Store):
    """Store implementation backed by a single MongoDB collection."""

    def __init__(self, db, collection, url='mongodb://localhost'):
        """
        :param db: database name
        :param collection: collection name inside ``db``
        :param url: MongoDB connection URI
        """
        self.collection = MongoClient(url)[db][collection]

    def fetch(self, oid):
        """Return the document whose ``_id`` is ``oid``, or None."""
        return self.collection.find_one({'_id': oid})

    def fetch_all(self):
        """Return a cursor over every document of the collection."""
        return self.collection.find()

    def iter_ids(self):
        """Yield the ``_id`` of every document."""
        for obj in self.collection.find({}, {'_id': True}):
            yield obj['_id']

    def save(self, obj):
        """Insert ``obj``, or replace the stored document with the same ``_id``."""
        # Collection.save() was deprecated in PyMongo 3 and removed in 4; this
        # is its documented equivalent.
        if '_id' in obj:
            self.collection.replace_one({'_id': obj['_id']}, obj, upsert=True)
        else:
            self.collection.insert_one(obj)

    def save_many(self, obj_iter):
        """Insert every document produced by ``obj_iter``; a no-op when empty."""
        documents = list(obj_iter)
        # insert_many() rejects an empty batch, so skip the call instead.
        if documents:
            self.collection.insert_many(documents)

    def flush(self):
        """Drop the whole collection."""
        self.collection.drop()

    def delete(self, oid):
        """Delete the document whose ``_id`` is ``oid``."""
        self.collection.delete_one({'_id': oid})
 def POST(self):
     """
     Register a new diagram version and copy the previous version's content.

     Reads ``project_name``, ``viewpoint``, ``diagram_name`` and ``date`` from
     the JSON request body.  The new version number is the number of
     documents in ``diagram_versions`` plus one.  When an earlier version
     exists, its non-deleted elements and connections are duplicated under
     the new version number.

     :return: ``build_response(token=None)`` on success, otherwise a response
              carrying ``EXCEPTION_PROCESSING_ERROR``
     """
     error_message = None
     try:
         data = cherrypy.request.json
         client = MongoClient(DATABASE_ADDRESS, DATABASE_PORT)
         db = client[data['project_name']]
         version = db.diagram_versions.count() + 1
         db.diagram_versions.insert({'viewpoint': data['viewpoint'], 'diagram_name': data['diagram_name'], 'diagram_version': str(version), 'date': data['date'], 'deleted': False})
         if version > 1:
             # Insert elements from previous version in new one.
             # The original referenced an undefined name here, so this branch
             # always failed.  The version is stored as str above but as int
             # on the copies below, hence both representations are matched.
             previous = version - 1
             # NOTE(review): content is read from the default server's
             # "editor" database, not from the project database - confirm.
             diagram_elements = MongoClient().editor.diagram_elements
             diagram_connections = MongoClient().editor.diagram_connections
             previous_content = {'viewpoint': data['viewpoint'], 'diagram_name': data['diagram_name'], 'diagram_version': {'$in': [previous, str(previous)]}, 'deleted': False}
             elems = list(diagram_elements.find(previous_content))
             cons = list(diagram_connections.find(previous_content))
             for elem in elems:
                 # Drop the old _id so the copy is stored as a new document
                 # instead of colliding with the one it was read from.
                 elem.pop('_id', None)
                 elem['diagram_version'] = version
                 diagram_elements.insert(elem)
             for conn in cons:
                 conn.pop('_id', None)
                 conn['diagram_version'] = version
                 diagram_connections.insert(conn)
         return build_response(token=None)
     except Exception:
         if not error_message:
             error_message = EXCEPTION_PROCESSING_ERROR
         return build_response(error_message=error_message, token=None)
class MongoIterator(object):
    """Iterable view over a MongoDB collection with default filter, skip and limit."""

    def __init__(self, uri, db, collection, skip=0, limit=0, filter=None):
        database = MongoClient(uri)[db]
        self._collection = database[collection]
        self._filter = filter
        self._skip = skip
        self._limit = limit

    def __iter__(self):
        # Iterating the object streams with the stored defaults.
        return self.stream()

    def stream(self, conditions=None, projection=None, skip=None, limit=None):
        """
        Return a cursor over the collection.

        Each falsy argument falls back to the value stored on the instance.
        When ``projection`` (an iterable of field names) is given, only those
        fields are requested and ``_id`` is excluded.
        """
        fields = dict.fromkeys(projection or (), 1)
        if fields:
            fields['_id'] = False  # skip internal id

        query = conditions or self._filter
        offset = skip or self._skip
        maximum = limit or self._limit
        return self._collection.find(query, fields or None, skip=offset, limit=maximum)

    def size(self):
        """Count the documents matching the stored filter (all documents when none is set)."""
        if self._filter:
            return self._collection.find(self._filter).count()
        return self._collection.count()

    @property
    def filter(self):
        return self._filter

    @filter.setter
    def filter(self, conditions):
        self._filter = conditions
Exemple #10
0
class halo:
    """
    Build "halo" documents for URIs by querying the DBpedia SPARQL endpoint
    and caching the results in the ``halo.halos`` MongoDB collection.
    """

    def __init__(self):
        # Close the config file right after parsing it.
        with open("../config/config.json") as config_file:
            self.config = json.load(config_file)
        self.sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        mongo = MongoClient()
        self.termDB = mongo["semantified"]["terms"]
        self.halodb = mongo["halo"]["halos"]

    def run(self, query):
        """
        Execute a SPARQL query and return its result bindings.

        A failed attempt prints the query, waits 60 seconds and tries again,
        indefinitely.  The retry is a loop rather than recursion so a long
        outage cannot exhaust the recursion limit, and only ``Exception`` is
        caught so Ctrl-C still interrupts the wait.
        """
        while True:
            try:
                self.sparql.setQuery(query)
                self.sparql.setReturnFormat(JSON)
                result = self.sparql.query()
                # Drop non-ASCII bytes while decoding the raw response body.
                fixed_body = result.response.read().decode("ascii", "ignore")
                result = jsonlayer.decode(fixed_body)
                return result["results"]["bindings"]
            except Exception:
                print(query)
                time.sleep(60)

    def makeQuery(self, uri, querykey):
        """Fill ``uri`` into the query template stored under ``querykey`` in the config."""
        return self.config[querykey] % (uri)

    def insert(self, obj):
        """Upsert ``obj`` into the halo collection, keyed by its ``_id``."""
        self.halodb.update({"_id": obj["_id"]}, obj, True)

    def isprocessed(self, uri):
        """Return True when a halo document for ``uri`` is already stored."""
        return self.halodb.find_one({"_id": uri}) is not None

    def getHalo(self, uri):
        """Compute and store the halo of ``uri`` unless it was processed before."""
        if self.isprocessed(uri):
            print("previously processed uri : " + uri)
            return

        result = self.run(self.makeQuery(uri, "queryone"))
        result.extend(self.run(self.makeQuery(uri, "querytwo")))

        entries = {}
        for each in result:
            halouri = each["aura"]["value"]
            # Keys are stored with "." replaced by "$".
            entries[halouri.replace(".", "$")] = {
                "halouri": halouri,
                "count": each["auraCount"]["value"],
                "label": each["label"]["value"],
            }
        self.insert({"_id": uri, "uri": uri, "halo": entries})
        print("processed halo for : " + uri)

    def getdatadb(self):
        """Return a cursor over all term documents."""
        # NOTE(review): "timeout" is the PyMongo 2 spelling of this option;
        # PyMongo 3+ expects no_cursor_timeout=True - confirm driver version.
        return self.termDB.find(timeout=False)

    def processhalofromdb(self):
        """Compute the halo of every URI listed in ``allURI`` of each term document."""
        for each in self.getdatadb():
            # An explicit loop: map() is lazy on Python 3 and would never
            # call getHalo at all.
            for uri in each["allURI"]:
                self.getHalo(uri)
def most_frequent_pairs(topic_id):
    """
    Compute most frequent stemmed word pairs for the specified Reddit topic id.
    Skip stop words.

    Relies on the module-level names ``fromtime``, ``r`` (regex used to blank
    out unwanted characters), ``st`` (stemmer) and ``stop_words``.

    :param topic_id: topic id (subreddit_id)
    :type topic_id: str
    :return: sorted list of ``((word_a, word_b), count)`` items, from most to
             less frequent
    """
    articles = MongoClient().reddit.articles
    res = articles.find({'subreddit_id': topic_id, 'created_utc': {'$gt': fromtime}},
                        {'_id': 0, 'title': 1})

    # frequency of each stemmed word across the titles
    topwords = {}
    for art in res:
        for raw in r.sub(' ', art['title'].lower()).split():
            if raw in stop_words or len(raw) <= 1:
                continue
            word = st.stem(raw)
            if len(word) > 2:  # stemmed words longer than 2 chars only
                topwords[word] = topwords.get(word, 0) + 1

    # keep words with frequency >= 3; a new dict is built because deleting
    # keys while iterating items() raises RuntimeError on Python 3
    topwords = {word: count for word, count in topwords.items() if count >= 3}

    # for every article hit by the full text search, the set of topwords found
    # NOTE(review): this search is not restricted to topic_id - confirm intent.
    matrix = {}
    for word in topwords:
        res = articles.find({'$text': {'$search': word}}, {'_id': 0, 'id_reddit': 1})
        for item in res:
            matrix.setdefault(item['id_reddit'], set()).add(word)

    # count every pair of topwords that co-occur in an article
    pairs = {}
    for found in matrix.values():
        if len(found) < 2:  # a pair needs at least two words
            continue
        ordered = sorted(found)
        for position, first in enumerate(ordered):
            for second in ordered[position + 1:]:
                key = (first, second)
                pairs[key] = pairs.get(key, 0) + 1

    return sorted(pairs.items(), key=itemgetter(1), reverse=True)  # most frequent first
class TvrainData:
    """Read access to the ``tvrain`` MongoDB database (articles and sequences)."""

    def __init__(self):
        """
        Connect to the server named by the ``MONGODB_URL`` environment
        variable and make sure ``articles`` is indexed on ``time``.
        """
        database = MongoClient(os.environ['MONGODB_URL']).tvrain
        self.sequences = database.sequences
        self.collection = database.articles
        self.collection.create_index("time")

    def get_random_articles(self, n):
        """Returns N consecutive (by time) articles from a random offset, for index.html"""
        total = self.collection.count()
        # randint() includes its upper bound: skipping `total` documents used
        # to return nothing, and an offset close to the end returned fewer
        # than n articles.  Cap the offset so a full window always fits.
        offset = random.randint(0, max(0, total - n))
        articles = self.collection.find().sort("time", 1).skip(offset).limit(n)
        return list(articles)

    def get_article_id(self, url):
        """Get id by url"""
        # NOTE(review): raises TypeError when no article has this url, because
        # find_one() then returns None.
        return self.collection.find_one({'url': url})['_id']

    def get_articles_data(self, articles_urls):
        """
        Get data from MongoDB for articles urls
        :param articles_urls: ['article_url', ...]
        :return: list of MongoDB documents in the order of the urls; an entry
                 is None when no article matches
        """
        return [self.collection.find_one({'url': url}) for url in articles_urls]

    def iterate_articles(self, except_articles, skip=0, limit=None, query=None):
        """
        Iterate through all articles except those whose id is excluded
        :param except_articles: list of ids to leave out
        :param skip: number of matching documents to skip
        :param limit: maximum number of documents to read (None = no limit)
        :param query: optional filter document (None = all articles)
        :return: generator of article documents
        """
        data = self.collection.find({} if query is None else query).skip(skip)
        if limit is not None:
            data = data.limit(limit)

        for value in data:
            if value['_id'] not in except_articles:
                yield value

    def get_sequences(self):
        """Return all sequences for train"""
        # A negative limit does not mean "unlimited": MongoDB treats limit(-1)
        # as "return one document and close the cursor".
        return list(self.sequences.find())
    def __call__(self, pair, frame=False):
        """ returns raw chart data from the mongo database, updates/fills the
        data if needed, the date column is the '_id' of each candle entry, and
        the date column has been removed. Use 'frame' to restrict the amount
        of data returned.
        Example: 'frame=api.YEAR' will return last years data

        :param pair: market pair; the collection used is ``pair + 'chart'``
        :param frame: how far back (in seconds) to return data; a falsy value
                      means ten years
        :return: list of candle documents sorted by '_id'
        """
        if not frame:
            frame = self.api.YEAR * 10
        dbcolName = pair + 'chart'
        # get db connection
        db = MongoClient()['poloniex'][dbcolName]

        # Newest candle stored within the last 20 minutes, if any.  No blanket
        # try/except here: a database error must surface instead of silently
        # triggering a full re-download.
        recent = list(db.find({"_id": {"$gt": time() - 60 * 20}}))
        last = max(recent, key=itemgetter('_id')) if recent else False

        if not last:
            # nothing recent found, get all 5min data from poloniex
            logger.warning('%s collection is empty!', dbcolName)
            start = time() - self.api.YEAR * 13
        else:
            start = int(last['_id'])
        new = self.api.returnChartData(pair, period=60 * 5, start=start)

        # add new candles
        updateSize = len(new)
        logger.info('Updating %s with %s new entrys!',
                    dbcolName, str(updateSize))

        # show the progress
        for position, candle in enumerate(new, 1):
            print("\r%s/%s" % (str(position), str(updateSize)), end=" complete ")
            # the candle's date becomes its '_id' and is removed from the body
            date = candle.pop('date')
            db.update_one({'_id': date}, {"$set": candle}, upsert=True)
        print('')

        logger.debug('Getting chart data from db')
        # return data from db (sorted just in case...)
        return sorted(
            db.find({"_id": {"$gt": time() - frame}}),
            key=itemgetter('_id'))
Exemple #14
0
class DisIO:
    """Helpers around the ``orig.sentences`` MongoDB collection."""

    def __init__(self):
        self.db = MongoClient(
            'localhost',
            27017).get_database('orig').get_collection('sentences')

    def sen_from_mongo(self):
        """
        Concatenate the ``text`` of every sentence document (no separator).

        Progress is printed every 10000 documents.

        :return: one string with all texts
        """
        texts = []
        for count, sen in enumerate(self.db.find({}), 1):
            texts.append(sen['text'])
            if count % 10000 == 0:
                print("mongo" + str(count))
        # A single join is linear; "+=" in a loop re-copies the growing string.
        return "".join(texts)

    def re_to_text(self, cut):
        """
        Print the mean ``jieba_overlap`` and ``thulac_overlap`` of the entries.

        :param cut: sequence of dicts holding both overlap values
        :return: None
        """
        length = len(cut)
        if length == 0:
            print("NO Results")
            return

        jieba_sum = 0.0
        thulac_sum = 0.0
        for i, item in enumerate(cut):
            if i % 10000 == 0:
                print("dis" + str(i))
            jieba_sum += item["jieba_overlap"]
            thulac_sum += item["thulac_overlap"]
        print("jieba:" + str(jieba_sum / length) + "  thulac:" +
              str(thulac_sum / length) + "\n")
Exemple #15
0
def find_users():
    """Search users by their name, surname and birth date.

    Any other query parameter makes the request fail with the error page.
    """

    # http://localhost:8080/find_users?name=Luz
    # http://localhost:8080/find_users?name=Luz&surname=Romero
    # http://localhost:8080/find_users?name=Luz&&surname=Romero&birthdate=2006-08-14
    name = request.query.name
    surname = request.query.surname
    birthdate = request.query.birthdate

    print (name, surname, birthdate)

    # Values of the parameters this endpoint accepts.
    accepted = {"name": name, "surname": surname, "birthdate": birthdate}

    consulta = {}
    for param in dict(request.query):
        if param not in accepted:
            return template('error.tpl', error=("La consulta no admite el parametro " + str(param)))
        consulta[param] = accepted[param]

    usuarios = MongoClient('localhost',27017).giw.usuarios
    matches = usuarios.find(consulta)

    count = make_table_ten_columns(matches, "find_users_result")
    return template('find_users_result.tpl', usuarios = count, name=name, surname=surname, birthdate=birthdate)
Exemple #16
0
class ProductService:
    """CRUD-style access to the ``Store.product`` MongoDB collection."""

    def __init__(self):
        self.schema = Product()

        # Fall back to a local server when MONGODB_URI is unset or empty.
        self.mongodb_uri = os.getenv('MONGODB_URI') or 'localhost:27017'
        self.mongodb_name = 'Store'
        self.mongodb_collection = 'product'

        self.db_connection = MongoClient(self.mongodb_uri).get_database(
            self.mongodb_name).get_collection(self.mongodb_collection)

    def create_product(self, payload):
        """
        Validate ``payload`` with the schema and insert it.

        :return: the ``_id`` of the inserted document
        :raises: whatever ``schema.load`` raises for an invalid payload; it is
                 propagated unchanged
        """
        data = self.schema.load(payload)
        return self.db_connection.insert_one(data).inserted_id

    def get_product(self, product_id):
        """
        Return the serialised enabled product with the given id.

        :param product_id: UUID in string form
        :raises FileNotFoundError: when no enabled product has this id
        """
        product = self.db_connection.find_one({
            '_id': UUID(product_id),
            'enabled': True
        })
        if product:
            return self.schema.dump(product)
        raise FileNotFoundError()

    def get_all_products(self):
        """
        Return all enabled products, serialised by the schema.

        :raises FileNotFoundError: when there is no enabled product
        """
        # A cursor object is always truthy, so the emptiness check only works
        # on the materialised result.
        products = list(self.db_connection.find({'enabled': True}))
        if not products:
            raise FileNotFoundError()
        return self.schema.dumps(products, many=True)
class Ticker(object):
    """Mirror the exchange ticker into the ``poloniex.ticker`` collection on a background thread."""

    def __init__(self, api, interval=1):
        """
        :param api: object providing ``returnTicker()``
        :param interval: seconds to sleep between two updates
        """
        self.api = api
        self.db = MongoClient().poloniex['ticker']
        self.interval = interval
        # Defined up front so stop() is safe even if start() was never called.
        self._running = False
        self._thread = None

    def updateTicker(self):
        """Fetch the ticker once and upsert one document per market."""
        tick = self.api.returnTicker()
        for market in tick:
            self.db.update_one({'_id': market},
                               {'$set': tick[market]},
                               upsert=True)
        logger.info('Ticker updated')

    def __call__(self):
        """Return every stored ticker document as a list."""
        return list(self.db.find())

    def run(self):
        """Update the ticker every ``interval`` seconds until stopped."""
        self._running = True
        while self._running:
            self.updateTicker()
            sleep(self.interval)

    def start(self):
        """Run the update loop in a daemon thread."""
        self._thread = Thread(target=self.run)
        self._thread.daemon = True
        self._thread.start()
        logger.info('Ticker started')

    def stop(self):
        """Ask the update loop to finish and wait for the thread, if one was started."""
        self._running = False
        if self._thread is not None:
            self._thread.join()
        logger.info('Ticker stopped')
Exemple #18
0
class ReadThread(Thread):
    """
    Thread that keeps querying a collection and publishes the measured
    query rate in the module-level ``ReadsPerSec`` until ``ShouldExit`` is set.
    """

    def __init__(self, host, port, db, coll_name, conn_options):
        """
        :param host: MongoDB host
        :param port: MongoDB port
        :param db: database name
        :param coll_name: collection name
        :param conn_options: dict of extra keyword arguments for MongoClient
        """
        Thread.__init__(self)
        self.coll = MongoClient(host, port, **conn_options)[db][coll_name]

    def run(self):
        global ReadsPerSec
        global ShouldExit

        ReadsPerSec = 0
        interval = timedelta(seconds=1.8)
        last = datetime.now()
        things = 0

        while not ShouldExit:
            # Queries run in batches of ten between two clock checks.
            # range() works on Python 2 and 3 alike, xrange() only on 2.
            for _ in range(10):
                list(self.coll.find({'not_a_key': {'$ne': 1}}))
                things += 1

            now = datetime.now()
            elapsed = now - last
            if elapsed >= interval:
                # Publish the rate of the finished window and start a new one.
                ReadsPerSec = things / elapsed.total_seconds()
                things = 0
                last = now
def get_events_ids_by_project_id(project_id):
    """
    Return the ids, as strings, of all events that belong to a project.

    :param project_id: project id in string form (converted to ObjectId)
    :return: list of event id strings
    """
    criteria = {'project_id': ObjectId(project_id)}
    only_id = {'_id': 1}
    client = MongoClient(mongo_ip, mongo_port)
    events = client[mongo_db_name][event_collection_name]

    event_ids = []
    for event in events.find(criteria, only_id):
        event_ids.append(str(event['_id']))
    return event_ids
Exemple #20
0
def main():
    """
    Split the stored pages into batches and process them in parallel.

    The number of pages is divided evenly over ``GlobalSettings.WORKERS``
    processes running ``worker(worker_number, skip, size)``; the last worker
    also takes the remainder.  The exit code of every process is printed once
    it has finished.
    """
    # Download the necessary information needed for gensim
    # nltk.download()

    # Get the count of pages stored in MongoDb
    pages_db = MongoClient(GlobalSettings.MONGO_URI)[
        GlobalSettings.DATABASE_DOT][GlobalSettings.COLLECTION_PAGES]
    count = pages_db.find().count()

    # Integer division on purpose: "/" would hand float skip/size values to
    # the workers on Python 3.
    workers = GlobalSettings.WORKERS
    batch, left = divmod(count, workers)

    # Kick off each worker until all batches have completed processing
    jobs = []
    for i in range(workers):
        size = batch + left if i == workers - 1 else batch
        p = multiprocessing.Process(target=worker,
                                    args=((i + 1), i * batch, size))
        jobs.append(p)
        p.start()

    for j in jobs:
        j.join()
        print('%s.exitcode = %s' % (j.name, j.exitcode))
Exemple #21
0
def insert_questions_from_followed_question():
    """
    Copy the questions followed by users into ``zhihu_network.questions``.

    Every question found in ``zhihu.user_followed_questions`` that is not yet
    known is word-segmented; titles with at least three segments are stored
    as ``{'_id': question id, 'title': space-joined segments}``.
    """
    in_db = MongoClient().zhihu.user_followed_questions
    out_db = MongoClient().zhihu_network.questions
    # Only the ids are needed, so do not download the whole documents.
    existed_question_id = {q['_id'] for q in out_db.find({}, {'_id': 1})}
    segmentor = Segmentor()
    segmentor.load("/Users/sunxiaofei/workspace/ltp_data/cws.model")
    for u in in_db.find():
        for q in u['questions']:
            if q['id'] in existed_question_id:
                continue
            # Marked as seen before the length check, so a too-short title is
            # not segmented again when another user follows the same question.
            existed_question_id.add(q['id'])
            words = segmentor.segment(q['title'].strip().replace(
                '\n', ' ').replace('\r', ' ').replace('\b', ' '))
            if len(words) < 3:
                continue
            out_db.insert({'_id': q['id'], 'title': ' '.join(words)})
Exemple #22
0
class Database:
    """Thin wrapper around one MongoDB collection."""

    DATABASE = db_config.DATABASE
    LINK = db_config.MONGO_LINK.format(os.environ.get('LOGIN'),
                                       os.environ.get('PASSWORD'))

    def __init__(self, collection):
        """
        Connect to ``collection``; the process exits when the server cannot
        be reached.
        """
        from pymongo.errors import ConnectionFailure
        try:
            client = MongoClient(self.LINK)
            # The MongoClient constructor does not connect by itself, so a
            # cheap command is issued to make an unreachable server show up
            # here rather than on the first real query.
            client.admin.command('ping')
            self.db = client[self.DATABASE][collection]
            print('Connect to {} successful'.format(collection))
        except ConnectionFailure:
            print('Connecting to {} error'.format(collection))
            sys.exit()

    def add_doc(self, doc):
        """
        Insert ``doc`` unless a document with the same ``_id`` exists.

        :return: True when the document was inserted, False otherwise
        """
        from pymongo.errors import DuplicateKeyError
        # Inserting and reacting to the duplicate is atomic; the previous
        # "look up, then save" could be raced by another writer.
        try:
            self.db.insert_one(doc)
        except DuplicateKeyError:
            return False
        return True

    def change_doc(self, _id, mode):
        """Apply ``{'$set': mode}`` to the first document matching ``_id``."""
        # NOTE(review): ``_id`` is passed as the filter itself, so callers
        # presumably hand in a filter document rather than a bare id - confirm.
        self.db.find_one_and_update(_id, {'$set': mode})

    def delete_doc(self, _id):
        """Delete the first document matching ``_id`` (used as the filter)."""
        self.db.find_one_and_delete(_id)

    def get_docs(self, query=None):
        """Return a cursor over the documents matching ``query`` (all when None)."""
        if query is None:
            query = {}
        return self.db.find(query)

    def get_one_doc(self, _id):
        """Return the first document matching ``_id``, or None."""
        return self.db.find_one(_id)
Exemple #23
0
def process_cursor(skip_n, limit_n):
    """
    Copy one slice of ``yelp_review.business_data`` from MongoDB to MySQL.

    Only businesses whose categories contain 'Restaurants' and whose state is
    'ON' or 'BC' are inserted into the ``business`` table.

    :param skip_n: number of MongoDB documents to skip
    :param limit_n: maximum number of MongoDB documents to read
    """
    print('Starting process', skip_n // limit_n, '...')
    # Connect to the MongoDB
    collection = MongoClient().yelp_review.business_data
    cursor = collection.find({}).skip(skip_n).limit(limit_n)

    # Connect to the MySQL
    mydb = mysql.connector.connect(user='******',
                                   password='******',
                                   database='yelp_review')
    # Insert query
    sql = 'INSERT INTO business (business_id, name, city, state, stars, review_count) VALUES (%s, %s, %s, %s, %s, %s)'

    # The MySQL cursor and connection are released even when a row fails.
    try:
        mycursor = mydb.cursor()
        try:
            for doc in cursor:
                categories = doc['categories']
                if categories is None:
                    continue
                if 'Restaurants' not in categories or doc['state'] not in ['ON', 'BC']:
                    continue
                # Non-ASCII characters are dropped from the name.
                val = (doc['business_id'],
                       doc['name'].encode("ascii", "ignore").decode(), doc['city'],
                       doc['state'], doc['stars'], doc['review_count'])
                mycursor.execute(sql, val)
                mydb.commit()
        finally:
            mycursor.close()
    finally:
        mydb.close()

    print('Completed process', skip_n // limit_n, '...')
Exemple #24
0
    def feature_click(self):
        """
        Print the 'ttarticle' link found in microblog texts.

        A unique descending index on ``wblogId`` is created on the click
        collection while that collection is still empty.  Afterwards every
        id in ``self.all_wblog`` that is not in ``self.swblog`` is looked up
        in ``wblog.wblog``, and all documents of ``wblog.swblog`` are
        scanned; each text containing 'ttarticle' has its link printed.
        """
        def article_link(text):
            # Rebuild the URL around the first 'ttarticle' occurrence.
            pieces = text.split('ttarticle')
            return ('https:' + pieces[0].split(':')[-1] +
                    'ttarticle' + pieces[1].split('&')[0])

        click_col = self.mdb.click
        if not click_col.find_one():
            logging.info('click为空,设置主键为wblogId')
            click_col.create_index([('wblogId', pymongo.DESCENDING)], unique=True)

        ws = MongoClient().wblog.swblog
        ww = MongoClient().wblog.wblog

        for wblogId in self.all_wblog:
            if wblogId in self.swblog:
                continue
            entry = ww.find_one({'wblogId': str(wblogId)})
            content = entry['json_text']['text']
            if 'ttarticle' in content:
                print(article_link(content))

        for entry in ws.find():
            content = entry['json_text']['text']
            if 'ttarticle' in content:
                print(article_link(content))
def main():
    """
    Run the topic prediction on every test review of one hard-coded business.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    business_id="T-LhjPRqlS7hLGRmSMBbfA"
    reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
    Settings.TEST_COLLECTION]
    # Let MongoDB select the reviews of this business instead of downloading
    # the whole collection and comparing ids client-side.  The unused list of
    # business ids and the unused second cursor were dropped.
    reviews_cursor = reviews_collection.find({'business': business_id}, {'text': 1})

    for reviews in reviews_cursor:
        # NOTE(review): a new Predict is built per review, as before; hoist
        # it out of the loop if construction turns out to be stateless.
        predict=Predict()
        predict.run(reviews["text"])
    """
    def run(self, new_review):
        """
        Infer the topics of a review and store their labels.

        The lemmatized nouns of ``new_review`` are converted to a bag of words
        and passed through the LDA model; every topic id it returns is mapped
        to its label(s) below.  The resulting list is written to the ``topic``
        field of one test-collection document whose ``topic`` is the empty
        string.

        :param new_review: text of the review
        """
        # topic id -> label, or list of labels
        topics = {
            0: "trip", 1: "experience", 2: "location",
            3: ["drinks", "margarita"],
            4: ["dessert", "fancy", "garden"],
            5: "parking", 6: "facility", 7: "con",
            8: ["pizza", "delivery", "pie"],
            9: "appointment",
            10: ["bbq", "lunch", "sandwich", "meat", "american"],
            11: ["seafood", "shrimp", "fish", "salmon"],
            12: ["location", "ice cream"],
            13: ["hotdog", "american"],
            14: ["customer service", "experience"],
            15: ["pasta", "calamari"],
            16: "juice",
            17: ["burritto", "student", "refund"],
            18: ["steak", "meat", "lover"],
            19: ["game", "video"],
            20: "security",
            21: ["cheese", "salad", "vegetables"],
            22: ["parking", "tax"],
            23: ["buffet", "price", "quality"],
            24: "pancake",
            25: ["cake", "cupcake", "bakery"],
            26: ["popcorn", "environment"],
            27: "kid-friendly",
            28: ["breakfast", "brunch", "waffle"],
            29: ["music", "friend", "club", "bar"],
            30: ["hotel", "casino"],
            31: "architecture", 32: "kid-friendly", 33: "view",
            34: ["service", "price", "staff"],
            35: ["chocolate", "candy", "dessert"],
            36: "crepe",
            37: ["time", "service"],
            38: "online", 39: "pricey", 40: "pub",
            41: ["burger", "order", "service"],
            42: ["ambience", "beef"],
            43: ["spicy", "chicken", "meat"],
            44: ["price", "discount"],
            45: ["wine", "service", "dessert"],
            46: ["bar", "night", "bartender"],
            47: ["taco", "mexican"],
            48: ["cafe", "coffee"],
            49: ["party", "birthday"],
        }

        nouns = self.extract_lemmatized_nouns(new_review)
        new_review_bow = self.dictionary.doc2bow(nouns)
        new_review_lda = self.lda[new_review_bow]

        reviews_collection = MongoClient(Settings.MONGO_CONNECTION_STRING)[Settings.REVIEWS_DATABASE][
        Settings.TEST_COLLECTION]

        print (new_review_lda)
        # Each model entry is a pair whose first item is the topic id.
        answer = [topics[topic_id] for topic_id, _ in new_review_lda]
        print (new_review_lda)

        # NOTE(review): the filter matches any document with an empty topic,
        # not specifically the reviewed one - confirm this is intended.
        reviews_collection.update({'topic': ""}, {'$set': {"topic": answer}})
Exemple #27
0
def find_birth_month():
    """Find users born in a given month.

    Reads the ``month`` query parameter (a lowercase Spanish month name, e.g.
    ``/find_birth_month?month=abril``), maps it to its ``-MM-`` fragment and
    selects the users whose ``birthdate`` matches that fragment as a regex,
    sorted by ``birthdate`` ascending.

    :return: the rendered ``find_birth_month_result.tpl`` template, or
        ``error.tpl`` when the query string is not exactly one valid
        ``month`` parameter.
    """

    # http://localhost:8080/find_birth_month?month=abril
    month = request.query.month

    parser = {"enero": "-01-", "febrero": "-02-", "marzo": "-03-", "abril": "-04-", "mayo": "-05-", "junio": "-06-", "julio": "-07-", "agosto": "-08-", "septiembre": "-09-", "octubre": "-10-", "noviembre": "-11-", "diciembre": "-12-"}

    print (month)

    dicc = dict(request.query)

    # Validate up front. The previous per-key loop never ran when the request
    # had no parameters at all, leaving `consulta` unbound (NameError).
    if set(dicc) != {"month"} or month not in parser:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")

    consulta = parser[month]

    print (consulta)

    collection = MongoClient('localhost',27017).giw.usuarios

    # presumably birthdate is stored as a "YYYY-MM-DD" string, so "-MM-"
    # identifies the month -- TODO confirm against the collection schema
    cursor = collection.find({"birthdate" : {"$regex": consulta}}).sort("birthdate", 1)

    count = make_table_ten_columns(cursor, "find_birth_month_result")
    return template('find_birth_month_result.tpl', usuarios = count, month=month)
def get_training_reviews(size_of_training_review=None):
    """
    The function is used to read records from the review2 collection and then create a records in JSON format shown
    below:
        JSON format:
                    [
                        {"text": "Sample Text","label":"1"},
                        {"text": "Sample Text","label":"2"}
                    ]
        The JSON data is written to the file "TrainingReviews_2.json". Reviews with an empty text are skipped.
    :param size_of_training_review: maximum number of reviews to write, example 1000, 2000 etc.
                                    None means "no limit".
    :return: None
    """
    review_collection = MongoClient('localhost', 29017).yelp.review2
    review_cursor = review_collection.find()
    written = 0
    # `with` closes (and flushes) the file even when the cursor or serialisation fails.
    with open("TrainingReviews_2.json", "w") as training_file:
        training_file.write("[\n")
        for entry in review_cursor:
            if size_of_training_review is not None and written >= size_of_training_review:
                break
            text = entry["text"]
            rating = entry["stars"]
            if not text:
                continue
            # The separator is written *before* every record but the first, so the array is
            # valid JSON even when the collection holds fewer records than requested.
            if written:
                training_file.write(",\n")
            obj = TrainReview(text, rating)
            training_file.write("\t" + json.dumps(vars(obj)))
            written += 1
            if written % 100 == 0:
                print(written)
        training_file.write("\n]")
Exemple #29
0
def find_leap_year():
    """Find users born in a leap year whose credit card expires in the given year.

    Reads the ``exp`` query parameter (exactly two characters, e.g.
    ``/find_leap_year?exp=20``) and matches it against
    ``credit_card.expire.year``; the leap-year test runs server-side through a
    ``$where`` JavaScript function over the first four characters of
    ``birthdate``.

    :return: the rendered ``find_leap_year_result.tpl`` template, or
        ``error.tpl`` when the query string is not exactly one two-character
        ``exp`` parameter.
    """

    # http://localhost:8080/find_leap_year?exp=20
    exp = request.query.exp

    dicc = dict(request.query)

    # Validate up front. The previous per-key loop never ran when the request
    # had no parameters at all, leaving `consulta` unbound (NameError).
    if set(dicc) != {"exp"} or len(exp) != 2:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")

    consulta = exp

    bisiesto = """function() {
                        if ("birthdate" in this) {
                                let year = Number(this["birthdate"].substr(0, 4));
                                if (year%4==0 && (!(year%100==0) || (year%400==0)))
                                    return true;
                                else
                                    return false;
                        }
                        else return false
                }"""

    collection = MongoClient('localhost',27017).giw.usuarios

    cursor = collection.find({"credit_card.expire.year":consulta, "$where": bisiesto})

    count = make_table_ten_columns(cursor, "find_leap_year_result")
    return template('find_leap_year_result.tpl', usuarios=count, exp=consulta)
Exemple #30
0
def main():
    """Run topic discovery over the ``ennews.news`` collection and log the result.

    Opens a no-timeout cursor over the whole collection, hands collection and
    cursor to ``FindTopics`` and writes ``str()`` of the matrix returned by
    ``FindTopics.run()`` to ``.\\lda_topics2.log``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Connect to MongoDB and read the documents to be classified.
    corpus_collection = MongoClient(
        "mongodb://39.108.180.114:27017")["ennews"]["news"]
    reviews_cursor = corpus_collection.find(no_cursor_timeout=True)

    try:
        # Classify.
        find_topics = FindTopics(corpus_collection, reviews_cursor)
        topics_matrix = find_topics.run()

        # Output the topic matrix. Writing to the file directly replaces the
        # old sys.stdout swap, which left stdout redirected if anything raised
        # in between. The raw string keeps the same path value without the
        # invalid "\l" escape sequence.
        with open(r".\lda_topics2.log", "w") as log_file:
            log_file.write(str(topics_matrix) + "\n")
    finally:
        # A cursor opened with no_cursor_timeout=True has to be closed
        # explicitly, so do it on the failure path as well.
        reviews_cursor.close()
Exemple #31
0
class RemoteIO:
    """Reads sentences from the remote ``tokenizer_qiao.splited_sentences`` collection.

    ``read_sentence_randomly`` walks forward through the collection in random
    strides: ``skip`` is the current offset and ``step`` the maximum stride.
    Whenever the next stride could run past the end, ``skip`` is reset to 0
    and ``step`` is halved; once ``step`` reaches 0 the reader is exhausted.
    """

    def __init__(self):
        time_counter(print_to_console=False)
        print("初始化 RemoteIO")
        self.db = MongoClient('192.168.68.11', 20000).get_database(
            "tokenizer_qiao").get_collection('splited_sentences')
        self.sentence_size = self.db.find().count()
        self.step = self.sentence_size
        self.skip = 0
        time_counter("初始化完毕")

    def read_sentence_randomly(self):
        """Return one randomly reached document and bump its ``analysed`` counter.

        :return: the document, or ``None`` when the reader is exhausted
            (``step`` reached 0) or the computed offset yields no document.
        """
        while self.skip + self.step >= self.sentence_size:
            print("skip:%d, step:%d, size:%d" %
                  (self.skip, self.step, self.sentence_size))
            if self.step == 0:
                return None
            self.skip = 0
            self.step = int(self.step / 2)
        # Leaving the loop guarantees skip + step < sentence_size, so the
        # former `if ... else: return None` around this code was unreachable.
        random_step = random.randint(0, self.step)
        pipeline = [{"$skip": self.skip + random_step}, {"$limit": 1}]
        self.skip += random_step
        docs = list(self.db.aggregate(pipeline))
        if not docs:
            # The collection may have shrunk since sentence_size was read;
            # previously this path crashed on `None["_id"]`.
            return None
        doc = docs[0]
        self.db.update({"_id": doc["_id"]}, {"$inc": {"analysed": 1}})
        time_counter("已获取到")
        return doc

    def read_sentence_from_remote(self):
        """Return a cursor over every document in the collection."""
        return self.db.find()
Exemple #32
0
def load_tags(size, start):
    """POS-tag reviews and store the tagged words in the tags collection.

    Reads ``size`` reviews beginning at offset ``start`` (in batches of at
    most 1000), lower-cases and sentence-splits each review text, drops
    English stopwords and punctuation tokens, POS-tags what is left with NLTK
    and inserts one document per review with the keys ``review_id``,
    ``business_id``, ``text`` and ``words`` (a list of
    ``{"word": ..., "pos": ...}``).

    :param size: number of reviews to process
    :param start: number of reviews to skip before the first one processed
    :return: None
    """
    database = MongoClient(Settings.MONGO_CONNECTION_STRING)[
        Settings.BUSINESS_DATABASE]
    reviews_collection = database[Settings.REVIEWS_COLLECTION]
    tags_collection = database[Settings.TAGS_REVIEWS_COLLECTION]

    # A set makes the per-token membership test O(1) instead of a list scan.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    puncs = set(string.punctuation)

    batch_size = 1000
    for batch in range(0, size, batch_size):
        # Cap the last batch so no more than `size` reviews are read; a fixed
        # limit(batch_size) over-read whenever size was not a multiple of it.
        current_limit = min(batch_size, size - batch)
        reviews_cursor = reviews_collection.find().skip(start + batch).limit(
            current_limit)
        for review in reviews_cursor:
            words = []
            sentences = nltk.sent_tokenize(review["text"].lower())

            for sentence in sentences:
                tokens = nltk.word_tokenize(sentence)
                content = [
                    token for token in tokens
                    if token not in stopwords and token not in puncs
                ]
                words.extend(
                    {"word": word, "pos": tag}
                    for word, tag in nltk.pos_tag(content))

            tags_collection.insert({
                "review_id": review["review_id"],
                "business_id": review["business_id"],
                "text": review["text"],
                "words": words
            })
def cal_dimensions():  # compute every dimension
    """Derive the location / job / ability dimensions for each LinkedIn profile.

    Iterates over ``linkedin.linkedin_userinfo`` and, for every profile,
    inserts one document into ``shulianxunying.linkedin_dimension`` with the
    keys ``location``, ``job``, ``ability``, ``character`` and ``uid``.
    Missing values are stored as the string ``'null'`` (``0.`` for
    ``ability``). A progress counter is printed every 1000 profiles.
    """
    conn = MongoClient("192.168.4.250", 27017)['linkedin']['linkedin_userinfo']
    write_location = MongoClient("192.168.2.254", 27017)['shulianxunying']['linkedin_dimension']
    count = 0
    # Nothing is ever written to this log; the open is kept only because the
    # original created/truncated the file. `with` closes the handle at once.
    with open("log_linkedin.txt", 'w'):
        pass
    for cv in conn.find(timeout=False):
        post = {}
        count += 1
        if count % 1000 == 0:
            print(count)
        # Reset BOTH highlights for every profile. The original reset the
        # industry highlight twice and never the location one, so a profile
        # without it raised NameError or reused the previous profile's value.
        BasicInfo_location_highlight = ''
        BasicInfo_industry_highlight = ''
        if 'BasicInfo' in cv:
            if "location_highlight" in cv['BasicInfo']:
                BasicInfo_location_highlight = cv['BasicInfo']['location_highlight']
            if "industry_highlight" in cv["BasicInfo"]:
                BasicInfo_industry_highlight = cv['BasicInfo']['industry_highlight']
        positionsMpr = cv.get('positionsMpr', [])
        educationsMpr = cv.get('educationsMpr', [])
        skillsMpr = cv.get('skillsMpr', [])
        honorsMpr = cv.get('honorsMpr', [])
        projectsMpr = cv.get('projectsMpr', [])
        patentsMpr = cv.get('patentsMpr', [])
        # compute the work location
        location = cal_location(BasicInfo_location_highlight, positionsMpr, educationsMpr)
        post['location'] = location if location else 'null'
        # compute the career direction
        job = cal_job(positionsMpr, skillsMpr, BasicInfo_industry_highlight)
        post['job'] = job if job else 'null'
        # compute the professional ability
        ability = major_ability(job, skillsMpr, positionsMpr, educationsMpr, honorsMpr, projectsMpr, patentsMpr)
        post['ability'] = ability if ability else 0.
        # professional character (cannot be characterised for now)
        post['character'] = 'null'
        post['uid'] = cv['_id']
        write_location.insert(post)
def gen_review_coll_with_id():
    """
    The function is used to add a sequential id to the records within the review2 collection. This id was required to
    process the records in parallel when calculating the sentiment. This collection will be used by the code
    "ParallelProcess.py". The column is added and the record is written to a new collection: review_counter

    Each copied record keeps business_id, text, stars, review_id and user_id and gains a 1-based "counter" field.
    :return: None
    """
    # One client serves both collections; the original opened two connections to the same server.
    db = MongoClient('localhost', 29017).yelp
    review2_cursor = db.review2.find()
    review_counter = db.review_counter
    counter = 1
    for entry in review2_cursor:
        _dict = {
            "business_id": entry["business_id"],
            "text": entry["text"],
            "stars": entry["stars"],
            "review_id": entry["review_id"],
            "user_id": entry["user_id"],
            "counter": counter,
        }
        review_counter.insert(_dict)
        counter += 1

        # Progress output; the call form of print works on Python 2 and 3 alike.
        if counter % 100 == 0:
            print(counter)
Exemple #35
0
def get_all_from_mongo(dataset):
    """Return, as a list, every document of ``lSSVM.base`` whose
    ``dataset_name`` equals ``dataset``."""
    experiments = MongoClient('localhost', 27018)['lSSVM']['base']
    return list(experiments.find({'dataset_name': dataset}))
Exemple #36
0
def loop():
    """Hourly maintenance loop: rotate the OSRM process and refresh ``mus``.

    Every iteration starts a new OSRM process on the port selected by
    ``flag`` (which is then toggled), terminates the previous process, reads
    all orders, appends two fresh ``Mu(2)`` objects to ``mus``, trims ``mus``
    two entries at a time while it holds more than two, and sleeps one hour.
    Never returns.
    """
    procs = []
    while True:
        procs.append(run_osrm(osrm_port.value + 1 - flag.value))
        flag.value = 1 - flag.value
        if len(procs) > 1:
            # Only the most recently started process stays alive.
            procs[0].terminate()
            procs = procs[1:]

        orders = MongoClient()[db_name][order['collection']['name']]
        _orders = orders.find({})  # bad smell <-

        receiver_rows = []
        for o in _orders:
            receiver_rows.append([
                *o['receiver']['coordinates'],
                time.mktime(o['timeline']['init']['at'].timetuple())
            ])
        rm = Mu(2)
        if False:  # deliberately disabled in the original
            rm.extend(np.array(receiver_rows))

        # NOTE(review): `_orders` was already consumed by the loop above, so
        # this second pass presumably sees no documents -- confirm before
        # re-enabling the `if False` branches.
        transmitter_rows = []
        for o in _orders:
            transmitter_rows.append([
                *o['transmitter']['coordinates'],
                time.mktime(o['timeline']['init']['at'].timetuple())
            ])
        tm = Mu(2)
        if False:  # deliberately disabled in the original
            tm.extend(np.array(transmitter_rows))

        mus.extend([tm, rm])
        while len(mus) > 2:
            mus.pop(0)
            mus.pop(0)
        time.sleep(60 * 60)
Exemple #37
0
def run(host, database, collection, field=None, value=None):
    """Return, as a BSON-aware JSON string, the documents of
    ``database.collection`` on ``host`` whose ``data.<field>`` matches
    ``value`` (queried through ``$in`` with a one-element list)."""
    graph = MongoClient(host)[database][collection]
    key = "data.%s" % (field)
    matches = graph.find({key: {"$in": [value]}})
    return bson.json_util.dumps(matches)
Exemple #38
0
def save_mongo(item):
    """Insert ``item`` into ``db_newGroup.heneng`` unless a document with the
    same ``source_url`` is already stored, in which case a notice is printed."""
    conn = MongoClient(host='localhost', port=27017)['db_newGroup']['heneng']
    already_stored = conn.find({'source_url': item['source_url']}).count()
    if already_stored != 0:
        print('已存在')
        return
    conn.insert(item)
Exemple #39
0
class DB:
    """Thin wrapper around the ``local.connections`` MongoDB collection."""

    def __init__(self):
        self.collection = MongoClient().local.connections

    def REMOVEALL(self):
        """Delete every document in the collection."""
        self.collection.delete_many({})

    def remove(self, connection, field):
        """Delete the documents matching ``connection`` on the given field.

        :param connection: object exposing ``ip`` and ``hostname``
        :param field: value tested with ``in`` for ``"ip"`` first, then
            ``"hostname"``, to choose the key to match on
        :return: ``True`` when a delete was issued, ``False`` when ``field``
            names neither key
        """
        if "ip" in field:
            self.collection.delete_many({"ip": connection.ip})
        elif "hostname" in field:
            self.collection.delete_many({"hostname": connection.hostname})
        else:
            return False
        return True

    def insert(self, connection):
        """Store the ip, hostname and stringified ``unique`` of ``connection``.

        The socket itself is not stored. The original first ran ``json.dumps``
        on ``connection.socket`` into an unused variable, which raised before
        the insert could happen.
        """
        self.collection.insert_one({
            "ip": connection.ip,
            "hostname": connection.hostname,
            "uniq": str(connection.unique)
        })

    def getAllConnectionsPrint(self):
        """Return every stored document as a list."""
        return list(self.collection.find())

    def getCollection(self):
        """Return the underlying collection object."""
        return self.collection
Exemple #40
0
 def show(self):
     """
     Prepare data for the UI display.

     Down-samples ``all_user`` and ``all_wblog`` to 500 random entries each,
     then rebuilds ``retweet_edge`` for the sampled users from two sources:
     the ``paMid`` / ``orMid`` columns of the SQL ``wblog`` table, and the
     comment authors stored in the Mongo ``comment.comment`` collection for
     every id in ``swblog``.
     :return:
     """
     self.all_user = random.sample(self.all_user, 500)
     self.all_wblog = random.sample(self.all_wblog, 500)

     # Edges taken from the wblog table.
     for uid in self.all_user:
         self.retweet_edge[uid] = []
         rows = self.sqlhelper.select_sql(
             'SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid)
         for row in rows:
             for mid in (str(row[0]), str(row[1])):
                 if mid in self.all_wblog:
                     self.retweet_edge[uid].append(mid)

     # Edges taken from stored comments.
     mdb = MongoClient().comment.comment
     for wblogId in self.swblog:
         for res in mdb.find({'wblogId': wblogId}):
             try:
                 uid = res['json_text']['user']['id']
                 known_user = uid in self.retweet_edge.keys()
                 if known_user and wblogId not in self.retweet_edge[uid]:
                     self.retweet_edge[uid].append(wblogId)
             except Exception as e:
                 logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
Exemple #41
0
def people(bot, update, args):
    """Answer a people search with at most five matching directory entries.

    Joins ``args`` into one text-search string, queries the
    ``jacobs.jpeople`` collection with ``$text`` and sends one message per
    match to the chat the update came from.

    :param bot: object whose ``send_message(chat_id=..., text=...)`` is called
    :param update: incoming update; only ``update.message.chat_id`` is read
    :param args: sequence of search words
    """
    limit = 5
    chat_id = update.message.chat_id

    collection = MongoClient('mongodb://localhost:27017/')['jacobs']['jpeople']

    query_str = " ".join(args)
    print(f'query for people: {query_str}')

    results = collection.find({'$text': {'$search': query_str}})

    msgs = []
    for result in results:
        # Cap on the number of replies actually built. The original compared
        # a counter that was never incremented, so the cap never applied.
        if len(msgs) >= limit:
            break

        first_name = result['firstName']
        last_name = result['lastName']
        email = result['email']
        year = result['year']
        major = result['majorShort']
        college = result['college']
        room = result['room']
        country = result['country']

        reply_str = f'Name: {first_name} {last_name}\n'
        reply_str += f'Email: {email}, \nYear: {year},\nMajor: {major}, \nCountry: {country}\n'
        reply_str += f'College: {college},\nRoom: {room}'
        msgs.append(reply_str)

    for msg in msgs:
        bot.send_message(chat_id=chat_id, text=msg)
 def _getDocById(self, collObj: MongoClient, userId):
     """Return the first document of ``collObj`` whose ``id`` equals ``userId``,
     or an empty dict when no document matches."""
     # At most one match is expected, but every match is still fetched.
     matches = list(collObj.find({"id": userId}))
     if not matches:
         return {}
     return matches[0]
Exemple #43
0
def email_birthdate():
    """Find users born between two dates.

    Reads the ``from`` (inclusive) and ``to`` (exclusive) query parameters,
    e.g. ``/find_email_birthdate?from=1973-01-01&to=1990-12-31``, and selects
    the ``id``, ``email`` and ``birthdate`` of the users whose ``birthdate``
    falls in that range.

    :return: the rendered ``email_birthdate_result.tpl`` template, or
        ``error.tpl`` unless the query string holds exactly ``from`` and
        ``to``.
    """

    # http://localhost:8080/find_email_birthdate?from=1973-01-01&to=1990-12-31

    dicc = dict(request.query)

    # Validate before reading the values. Indexing request.query first raised
    # KeyError (an unhandled server error) when either parameter was missing.
    if set(dicc) != {"from", "to"}:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")

    ini = request.query['from']
    fin = request.query['to']

    print (ini, fin)

    collection = MongoClient('localhost',27017).giw.usuarios

    query = {"birthdate": {"$gte": ini, "$lt": fin}}

    cursor = collection.find(query, {"id":1, "email":1, "birthdate":1})

    count = make_table_three_columns(cursor, "email_birthdate_result")
    return template('email_birthdate_result.tpl', usuarios = count, fecha1 = ini, fecha2 = fin)
class DisIO:
    """I/O helper for the segmentation comparison: reads sentences from the
    ``orig.sentences`` collection and appends comparison reports to a text file."""

    def __init__(self):
        self.db = MongoClient('localhost', 20000).get_database('orig').get_collection('sentences')

    def sen_from_mongo(self):
        """Return the ``text`` of every stored sentence concatenated into one string."""
        # join() avoids shadowing the builtin `str` and repeated string copies.
        return "".join(sen['text'] for sen in self.db.find({}))

    def re_to_text(self, path, cut=None):
        """Append one report entry per item of ``cut`` to ``path``, then a summary line.

        :param path: file to append to (UTF-8)
        :param cut: list of dicts with the keys ``sentence``, ``result``,
            ``jieba``, ``thulac``, ``jieba_overlap`` and ``thulac_overlap``;
            ``None`` (the default) is treated as an empty list
        :return: None. The summary line holds the mean jieba and thulac
            overlaps, or ``n/a`` for both when ``cut`` is empty.
        """
        cut = [] if cut is None else cut
        jieba_sum = 0.0
        thulac_sum = 0.0
        length = len(cut)
        with open(path, 'a', encoding='utf-8') as dis:
            for item in cut:
                jieba_sum += item["jieba_overlap"]
                thulac_sum += item["thulac_overlap"]
                dis.write("origin: " + item["sentence"] + "\n")
                dis.write("result: " + str(item["result"]) + "\n")
                dis.write("jieba:  " + str(item["jieba"]) + "  " + str(item["jieba_overlap"]) + "\n")
                dis.write("thulac: " + str(item["thulac"]) + "  " + str(item["thulac_overlap"]) + "\n\n")
            # Compute each average separately. The original chained two
            # conditional expressions inside one concatenation, which parsed as
            # a single nested conditional and dropped most of the summary line.
            if length == 0:
                jieba_avg = thulac_avg = "n/a"
            else:
                jieba_avg = str(jieba_sum / length)
                thulac_avg = str(thulac_sum / length)
            dis.write("jieba:" + jieba_avg + "  thulac:" + thulac_avg + "\n")
Exemple #45
0
def find_likes_not_ending():
    """Find users none of whose hobbies ends with a given suffix.

    Reads the ``ending`` query parameter (e.g.
    ``/find_likes_not_ending?ending=s``), lower-cases it and selects the users
    whose ``likes`` array has no element ending with that suffix.

    :return: the rendered ``find_likes_not_ending_result.tpl`` template, or
        ``error.tpl`` when the query string is not exactly one ``ending``
        parameter.
    """
    import re

    # http://localhost:8080/find_likes_not_ending?ending=s

    ending = request.query.ending

    print (ending)

    dicc = dict(request.query)

    # Validate up front. The previous per-key loop never ran when the request
    # had no parameters at all, leaving `consulta` unbound (NameError).
    if set(dicc) != {"ending"}:
        return template('error.tpl', error="Los parametros introducidos son incorrectos")

    # The "$" anchor is what makes this an "ends with" test; without it the
    # pattern matched any hobby merely containing the text. The suffix is
    # user input, so it is escaped to be matched literally.
    consulta = ".*" + re.escape(ending.lower()) + "$"

    query = {"likes": {"$not": {"$elemMatch": {"$regex": consulta}}}}

    collection = MongoClient('localhost',27017).giw.usuarios

    cursor = collection.find(query)

    count = make_table_ten_columns(cursor, "find_likes_not_ending_result")
    return template('find_likes_not_ending_result.tpl', usuarios=count, ending=ending)
class CorpusIO:
    """Reads corpus edges from the ``chinese.train_edges`` collection and
    saves / loads a corpus network as a JSON file."""

    def __init__(self):
        self.db = MongoClient(
            'localhost',
            27017).get_database('chinese').get_collection('train_edges')

    def read_from_mongo(self, limit=20):
        """Yield ``(src_name, des_name, weight)`` tuples from the collection.

        :param limit: maximum number of edges to yield; ``None`` yields all.
            A running count is printed every 10000 edges.
        """
        cursor = self.db.find({})
        for cnt, doc in enumerate(cursor, 1):
            # cnt is 1-based, so exactly `limit` edges are yielded; the old
            # pre-increment `cnt > limit` check let limit + 1 through.
            if limit is not None and cnt > limit:
                break
            if cnt % 10000 == 0:
                print(cnt)
            yield (doc['src_name'], doc['des_name'], doc['weight'])

    def save_as_json(self, corpus_json, path):
        """Serialise ``corpus_json`` to ``path`` as UTF-8 encoded JSON."""
        # `with` flushes and closes the file; the handle used to be leaked,
        # so the data on disk could be incomplete until garbage collection.
        with open(path, 'w', encoding='utf-8') as file:
            json.dump(corpus_json, file)
        print('corpus network saved to %s' % path)

    def load_as_json(self, path):
        """Load and return the JSON document stored at ``path``."""
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)
Exemple #47
0
	def test_iteration(self, collection, num_topics, file):
		"""Collect up to 10000 hashtagged tweets from ``twitter.<collection>``.

		Only tweets that have a ``text`` and at least one entry under
		``entities.hashtags`` are kept, as ``(text, _id, hashtags)`` tuples.
		NOTE(review): ``num_topics``, ``file`` and the collected locals are not
		used further in the visible body -- the function looks truncated.
		"""
		dbcollections = MongoClient().twitter[collection]

		tweets = []
		count = 0
		unique_hashtags = set()
		allcount = 0
		for tweet in dbcollections.find({}, {"_id": 1, "entities": 1, "text": 1}):
			allcount += 1
			# `in` replaces dict.has_key(), which no longer exists on Python 3.
			if 'text' not in tweet:
				continue

			if 'entities' in tweet and 'hashtags' in tweet['entities']:
				hashtags = [tag['text'] for tag in tweet['entities']['hashtags']]
				unique_hashtags.update(hashtags)

				if not hashtags:
					continue

				tweets.append((tweet['text'], tweet['_id'], hashtags))

				count += 1
				if count == 10000:
					break
Exemple #48
0
class EventsTestMixin(object):
    """
    Mixin for test cases that inspect emitted events stored in the Mongo
    ``test.events`` collection.
    """
    def setUp(self):
        """Drop the events collection and record the test start time."""
        super(EventsTestMixin, self).setUp()
        self.event_collection = MongoClient()["test"]["events"]
        self.event_collection.drop()
        self.start_time = datetime.now()

    def assert_event_emitted_num_times(self, event_name, event_time, event_user_id, num_times_emitted):
        """
        Assert how many times a particular event was emitted.
        :param event_name: Expected event name (e.g., "edx.course.enrollment.activated")
        :param event_time: Only events with a time strictly after this are counted (e.g., the beginning of the test case)
        :param event_user_id: user_id expected in the event (converted with int())
        :param num_times_emitted: expected number of matching events
        """
        criteria = {
            "name": event_name,
            "time": {"$gt": event_time},
            "event.user_id": int(event_user_id),
        }
        matching = self.event_collection.find(criteria).count()
        self.assertEqual(matching, num_times_emitted)
Exemple #49
0
def find():
    """Dump the dimension fields of every document to "log.txt" as CSV.

    Writes a header line followed by one line per document of
    ``shulianxunying.combine_v4_dimension`` holding ``experience``,
    ``changefreq``, ``worktime``, ``ability``, ``refresh`` and ``uid``.
    """
    # change the source database address here
    conn = MongoClient("192.168.4.249", 27017)['shulianxunying']['combine_v4_dimension']
    numeric_fields = ('experience', 'changefreq', 'worktime', 'ability', 'refresh')
    # `with` closes the file even if a document lacks one of the fields. The
    # unused max_*/min_* locals and the disabled min/max scan were removed.
    with open("log.txt", 'w') as fd:
        fd.write("experience,changefreq,worktime,ability,refresh,uid\n")
        for cv in conn.find(timeout=False):
            values = [str(cv[field]) for field in numeric_fields]
            fd.write(",".join(values) + "," + cv['uid'] + "\n")
Exemple #50
0
def get_sample_reviews(ratings=None, size=None):
    """
    Function used to query the Mongo Collection for a particular rating and number of records
    and append the review text and rating to the JSON-lines file "Ratings.json".

    Runs of whitespace in the review text are collapsed to single spaces before writing.
    Nothing is queried when either argument is missing or falsy (the file is still opened).

    :param ratings: Review rating in the Mongo Collection
    :param size: Number of records to be retrieved
    :return: None
    """
    combined_collection = MongoClient('localhost',
                                      29017).yelp.RestaurantReviews
    file_name = "Ratings.json"
    # `with` closes the file even when the query or the serialisation raises.
    with open(file_name, 'a') as ratings_file:
        if ratings and size:
            combined_cursor = combined_collection.find({
                "stars": ratings
            }, {
                "text": 1,
                "stars": 1,
                "_id": 0
            }).limit(size)
            for entry in combined_cursor:
                text = entry["text"]
                stars = entry["stars"]
                tmp = ' '.join(text.split())
                obj = SampleEntry(tmp, stars)
                ratings_file.write(json.dumps(vars(obj)))
                ratings_file.write("\n")
Exemple #51
0
def player():
    """Queue worker: download and play queued tracks one at a time.

    Writes the process id to ``PID_FN``, then loops forever. When no document
    matches ``FILTER`` it waits ``WAIT_SEC`` seconds. Otherwise it takes the
    matching document with the lowest ``date``, stamps ``start``, downloads
    ``path`` with wget into a temporary ``.mp3`` file, plays it with
    ``PLAY_AUDIO_COMMAND`` and finally sets ``played`` and ``end``. The
    process exits once the iteration counter reaches a non-negative
    ``MAX_ITERATION_NUM`` while the queue is non-empty.
    """
    logging.basicConfig(level=logging.INFO)
    with open(PID_FN, "w") as pid_file:
        pid_file.write(str(getpid()))
    queue = MongoClient()["for-music-player"].queue

    iteration = 0
    while True:
        iteration += 1

        if not queue.count_documents(filter=FILTER):
            print(f"> queue empty. wait {WAIT_SEC} sec...")
            # exit()
            sleep(WAIT_SEC)
            continue

        if MAX_ITERATION_NUM >= 0 and iteration >= MAX_ITERATION_NUM:
            exit()
            continue

        # Oldest queued entry first.
        track = queue.find(filter=FILTER, sort=[("date", 1)])[0]
        fd, tmp_path = mkstemp(suffix=".mp3")
        started = {"$set": {"start": datetime.now()}}
        queue.update_one({"_id": track["_id"]}, started)
        myexec(f"wget \"{track['path']}\" -O \"{tmp_path}\"")
        myexec(f"{PLAY_AUDIO_COMMAND} \"{tmp_path}\"")
        close(fd)
        finished = {"$set": {"played": True, "end": datetime.now()}}
        queue.update_one({"_id": track["_id"]}, finished)
Exemple #52
0
def validate_all_human_protein():
    """Run every human (taxid 9606) mygene record that has an ``entrezgene``
    through ``HelperBot.validate_docs`` and ``HelperBot.tag_mygene_docs``,
    consuming the whole result so that all records are processed."""
    coll = MongoClient().wikidata_src.mygene
    metadata_coll = MongoClient().wikidata_src.mygene_sources
    metadata = metadata_coll.find_one()

    doc_filter = dict(taxid=9606, entrezgene={'$exists': True})
    docs = coll.find(doc_filter)
    total = coll.find(doc_filter).count()
    print("total number of records: {}".format(total))

    validated = HelperBot.validate_docs(docs, 'eukaryotic', 'P351')
    records = HelperBot.tag_mygene_docs(validated, metadata)

    # Materialise the result to force every record through the pipeline.
    list(records)
Exemple #53
0
def main():
	"""Scrape every row of "22nd feb.csv" and store the results in ``tubules.members``.

	For each row ``gather_information`` is called. A successful result is
	inserted directly. A result with ``error_code`` 1 carries a list of
	registration numbers: those not yet stored are fetched with
	``multiple_regs`` and the successful responses inserted in bulk. Any
	other failure only prints its message.
	"""
	col = MongoClient()["tubules"]["members"]

	with open("22nd feb.csv", "r") as in_file:
		for index, row in enumerate(csv.reader(in_file)):
			print("{}/{}".format(index, N_ROWS))
			scraped_data = gather_information(row)

			if scraped_data["success"]:
				r = col.insert_one(scraped_data["data"])
				print(r.inserted_id)
				continue

			if scraped_data["error_code"] != 1:
				print(scraped_data["message"])
				continue

			print(scraped_data["message"])
			print(scraped_data["data"])
			stored = col.find({"Registration Number": {"$in": scraped_data["data"]}}, {"Registration Number": 1})
			in_db = [reg["Registration Number"] for reg in stored]
			print(in_db)

			if len(in_db) >= len(scraped_data["data"]):
				print("None saved to Database")
				continue

			missing = [regs for regs in scraped_data["data"] if regs not in in_db]
			responses = multiple_regs(missing)
			try:
				r = col.insert_many([response["data"] for response in responses if response["success"]])
				print(r.inserted_ids)
			except Exception as e:
				print(e.args)
Exemple #54
0
def parent():
	"""Fork two workers and wait until both have removed their marker document.

	Two marker documents (``_id`` 1 and 2) are inserted before forking. Each
	child runs ``child1`` / ``child2`` and then removes its own marker; the
	parent polls the collection every ``SLEEP`` seconds until it is empty.
	"""
	SLEEP = 10
	p = MongoClient().client["MP"].p
	p.drop(); p.insert_one({"_id": 1}); p.insert_one({"_id": 2})
	isParent = True

	newpid1 = os.fork()
	# We are the child
	if newpid1 == 0:
		isParent = False
		child1()
		# A fresh client is opened after the fork instead of reusing the parent's.
		p = MongoClient().client["MP"].p; p.remove({"_id": 1})
	# We are the parent
	else:
		newpid2 = os.fork()
		# We are the child
		if newpid2 == 0:
			isParent = False
			child2()
			p = MongoClient().client["MP"].p; p.remove({"_id": 2})

	if not isParent:
		# Call-form print works on Python 2 and 3 alike.
		print("PROCESS FINISHED")
	else:
		while True:
			# Count once per poll so the value tested is the value reported.
			remaining = p.find({}).count()
			if remaining == 0:
				break
			print("MAIN PROCESS WAITING: %i" % remaining)
			time.sleep(SLEEP)

		print("MAIN PROCESS FINISHED")
Exemple #55
0
class EventsTestMixin(object):
    """
    Mixin for test cases that inspect emitted events stored in the Mongo
    ``test.events`` collection.
    """
    def setUp(self):
        """Drop the events collection and record the test start time."""
        super(EventsTestMixin, self).setUp()
        self.event_collection = MongoClient()["test"]["events"]
        self.event_collection.drop()
        self.start_time = datetime.now()

    def assert_event_emitted_num_times(self, event_name, event_time, event_user_id, num_times_emitted):
        """
        Assert how many times a particular event was emitted.
        :param event_name: Expected event name (e.g., "edx.course.enrollment.activated")
        :param event_time: Only events with a time strictly after this are counted (e.g., the beginning of the test case)
        :param event_user_id: user_id expected in the event (converted with int())
        :param num_times_emitted: expected number of matching events
        """
        criteria = {
            "name": event_name,
            "time": {"$gt": event_time},
            "event.user_id": int(event_user_id),
        }
        matching = self.event_collection.find(criteria).count()
        self.assertEqual(matching, num_times_emitted)
Exemple #56
0
def run(host=None, db=None, coll=None, node=None, outgoing="true", incoming="true", undirected="true", offset=0, limit=0):
    """Return the links touching ``node`` as a BSON-aware JSON string.

    :param host: MongoDB host
    :param db: database name
    :param coll: collection name
    :param node: id of the node, converted with ``ObjectId``
    :param outgoing: JSON boolean text; include directed links whose source is the node
    :param incoming: JSON boolean text; include directed links whose target is the node
    :param undirected: JSON boolean text; include undirected links touching the node
    :param offset: number of results to skip
    :param limit: maximum number of results (passed to ``find`` as ``limit``)
    :return: JSON array of the matching ``type == "link"`` documents; ``[]``
        when every direction flag is false
    """
    # Connect to the mongo collection.
    graph = MongoClient(host)[db][coll]

    outgoing = json.loads(outgoing)
    incoming = json.loads(incoming)
    undirected = json.loads(undirected)

    offset = int(offset)
    limit = int(limit)

    # Construct the query according to the given options.
    clauses = []
    oid = ObjectId(node)
    if outgoing or incoming:
        dirclauses = []
        # A link counts as directed when "undirected" is absent or False.
        orclause = {"$or": [{"undirected": {"$not": {"$exists": 1}}},
                            {"undirected": False}]}
        if outgoing:
            dirclauses.append({"source": oid})

        if incoming:
            dirclauses.append({"target": oid})

        clauses.append({"$and": [orclause, {"$or": dirclauses}]})

    if undirected:
        clauses.append({"$and": [{"undirected": True},
                                 {"$or": [{"source": oid},
                                          {"target": oid}]}]})

    if not clauses:
        # No link kind was requested. MongoDB rejects an empty "$or" array,
        # so answer with an empty result instead of sending the query.
        return bson.json_util.dumps([])

    query = {"type": "link", "$or": clauses}

    return bson.json_util.dumps(list(graph.find(query, skip=offset, limit=limit)))
Exemple #57
0
def getDoubanBasic(doubanID):
    """Return the first document whose ``id`` equals ``doubanID``, or ``None``
    when there is none."""
    coll = MongoClient()[DB][DoubanBasic]
    # find_one() gives the same "first match or None" result in one round
    # trip and does not rely on Cursor.count().
    return coll.find_one({'id': doubanID})
Exemple #58
0
def progress(db_url, session, tasks_per_stage):
    """Print the progress of the current stage of a session.

    :param db_url: MongoDB URL; when empty it is taken from the
        ``RADICAL_PILOT_DBURL`` environment variable or, failing that, from
        ``<session>/<session>.json``
    :param session: session name, used as the collection name
    :param tasks_per_stage: number of units per stage; ``-1`` means all units
        form a single stage
    """

    # Find the db_url

    if not db_url:
        try:
            db_url = os.environ['RADICAL_PILOT_DBURL']
        except KeyError:
            # `with` closes the session file; the handle used to be leaked.
            with open(os.path.join(session, session+".json")) as session_file:
                rp = json.load(session_file)
            db_url = rp['session']['cfg']['dburl']

    db = db_url.split('/')[-1]

    collection = MongoClient(db_url)[db][session]

    cursor = collection.find()
    # One boolean per unit document: True when the unit is DONE.
    count = [(unit['state'] == 'DONE') for unit in cursor if unit['type'] == 'unit']

    if len(count) == 0:
        click.echo('There are no units in the session.')
        return

    if tasks_per_stage == -1:
        tasks_per_stage = len(count)

    stage, completed = divmod(sum(count), tasks_per_stage)
    percentage = round(completed/tasks_per_stage * 100, 2)

    if sum(count) == len(count):
        # If all the tasks finished then the above gives incorrect result.
        stage -= 1
        completed = tasks_per_stage
        percentage = 100

    click.echo("Stage {} progress: {}/{} ({}%)".format(stage, completed, tasks_per_stage, percentage))
Exemple #59
0
def retrieve_zones():
    """Return a cursor over the (geo)zones that have a `flag` or a `blazon` field."""
    wanted_fields = ('flag', 'blazon')
    criteria = {'$or': [{field: {'$exists': True}} for field in wanted_fields]}
    zones = MongoClient().geozones.geozones
    return zones.find(criteria)
Exemple #60
0
class Ticker(object):
    """Keeps the ``poloniex.ticker`` collection in sync with ``api.returnTicker()``.

    :param api: object exposing ``returnTicker()``, which returns a mapping of
        market name to ticker data
    :param interval: seconds to sleep between two updates in ``run``
    """

    def __init__(self, api, interval=1):
        self.api = api
        self.db = MongoClient().poloniex['ticker']
        self.interval = interval
        # Initialised here so that stop() is safe before start().
        self._running = False
        self._thread = None

    def updateTicker(self):
        """Upsert one document per market, keyed by the market name."""
        tick = self.api.returnTicker()
        for market in tick:
            self.db.update_one({'_id': market}, {'$set': tick[market]},
                               upsert=True)
        logger.info('Ticker updated')

    def __call__(self):
        """Return every stored ticker document as a list."""
        return list(self.db.find())

    def run(self):
        """Update the ticker every ``interval`` seconds until stopped."""
        self._running = True
        while self._running:
            self.updateTicker()
            sleep(self.interval)

    def start(self):
        """Start ``run`` in a daemon thread."""
        self._thread = Thread(target=self.run)
        self._thread.daemon = True
        self._thread.start()
        logger.info('Ticker started')

    def stop(self):
        """Ask the update loop to finish and wait for its thread, if any."""
        self._running = False
        # getattr keeps this safe on instances that never went through
        # start(); previously that raised AttributeError.
        thread = getattr(self, '_thread', None)
        if thread is not None:
            thread.join()
        logger.info('Ticker stopped')