def getListQuery(lis, dbase, coll):
    from pymongo import MongoClient as mc
    from collections import Counter

    check = 0
    client = mc()
    if dbase == "testIndex":
        client = mc("192.168.103.25")
    if dbase == "wiki":
        return wiki.main(lis)
    db = client[dbase]
    results = db[coll].find({"keyword": {"$in": lis}}, {"keyword": 0, "_id": 0})
    listCounters = list(results)
    if dbase == "news" or dbase == "youtube":
        if len(lis) == len(listCounters):
            return list(getNewContent(listCounters))
        else:
            check = 1
    result = SumDict({})
    if check == 1:
        for listCounter in listCounters:
            result.merge(listCounter)
        sor = sorted(result, key=result.get, reverse=True)
        return sor
    else:
        return []
def getNews():
    # `host`, `head`, `requests` and `soup` (BeautifulSoup) are module-level imports/constants
    client = mc(host)
    collection = client['mydb']['nba_news']
    lines = open('links.tmp', 'r').readlines()
    # links.tmp alternates between a title line and a link line
    toggle, title, link = True, None, None
    for l in lines:
        if toggle:
            title = l.strip()
        else:
            link = l.strip()
            req = requests.get('{}/{}'.format(head, link))
            page = soup(req.text, 'html.parser')
            section = page.find('section')
            section = '<html><body>{}</body></html>'.format(str(section))
            article = soup(section, 'html.parser').find_all('p')
            content = ''.join([p.text for p in article])
            print(title, link, content)
            doc = {
                "title": title,
                "link": '{}/{}'.format(head, link),
                "content": content.replace("\"", "\'")
            }
            collection.insert_one(doc)
        toggle = not toggle
    print(collection.count_documents({}))
def conn_db(host, port, db_name):
    connection = mc(host, port)
    db = connection[db_name]
    # MongoDB creates a collection implicitly when the collection is first referenced in a command,
    # so we hardcode a collection name here
    # collection = db["ibmdocs"]
    return db
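A minimal sketch of the implicit-creation behaviour the comment describes, assuming a local mongod on the default port; the `testdb` database name and the inserted document are hypothetical. The collection only appears once something is written to it:

from pymongo import MongoClient as mc  # same alias as above

db = conn_db("localhost", 27017, "testdb")        # hypothetical local database
print("ibmdocs" in db.list_collection_names())    # False: nothing has been created yet
db["ibmdocs"].insert_one({"title": "hello"})      # first write creates the collection
print("ibmdocs" in db.list_collection_names())    # True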
def db_init(self):
    client = mc('127.0.0.1', 27017)
    self.db1 = client['review']['douban']
    # ensure_index/dropDups are deprecated; create a unique index on href instead
    self.db1.create_index([('href', pymongo.ASCENDING)], unique=True)
    self.db2 = client['review']['all_book_id']
def gallery_spider(url, type):
    import urllib.request as fetch
    import re
    from pymongo import MongoClient as mc
    import urllib

    response = fetch.urlopen(url)
    response = response.read().decode('utf8')
    pattern = re.compile('<div class="roomblock.*?">(.*?)</div>', re.S)
    results = re.findall(pattern, response)
    ikea = 'http://www.ikea.com'
    i = 0
    client = mc()
    db = client.IKEA
    gallery_url = db.GalleryUrl
    # reset the crawl flag on every stored gallery URL
    gallery_url.update_many({}, {'$set': {'iscrawled': 0}})
    for room in results:
        # print(room)
        pattern_url = re.compile('<a title=".*?" href="(.*?)">', re.S)
        pattern_img = re.compile('<img alt=".*?" title=".*?" src="(.*?)".*?>')
        result_url = re.findall(pattern_url, room)
        result_img = re.findall(pattern_img, room)
        if result_url:
            room_url = ikea + result_url[0]
            gallery_url.insert_one({'type': type, 'url': room_url, 'iscrawled': 0})
        if result_img:
            room_img = ikea + result_img[0]
            filename = type + '_' + str(i) + ".jpg"
            urllib.request.urlretrieve(room_img, filename)
            i += 1
        if result_url and result_img:
            print(room_url, room_img)
def save_db(self):
    """Save the fiction record to the database."""
    import time
    from pymongo import MongoClient as mc

    conn = mc(host=self.fundb['host'], port=self.fundb['port'])
    conn = conn.get_database(self.fundb['db'])
    conn = conn.get_collection(self.fundb['coll'])
    # Collection.update() is deprecated; use update_one() with upsert
    conn.update_one({'name': self.fiction['name']}, {
        '$set': {
            'name': self.fiction['name'],
            'author': self.fiction['author'],
            'fromsite': self.fiction['fromsite'],
            'url': self.fiction['url'],
            'num': self.fiction['num'],
            'update': int(time.time())
        }
    }, upsert=True)
    for item in self.fiction['chapter']:
        if item['status']:
            conn.update_one({'name': self.fiction['name']}, {
                '$push': {
                    'chapter': {
                        'title': item['title'],
                        'url': item['url'],
                        'status': item['status']
                    }
                }
            })
def __getClient():
    conn = mc("mongodb://localhost:27017/",
              username="******",
              password="******",
              authSource="chatbot_database",
              authMechanism="SCRAM-SHA-1")
    print(conn.list_database_names())
    return conn["db"]
def save_data(self, db, table):
    """Save data to a MongoDB database."""
    try:
        from pymongo import MongoClient as mc
        conn = mc()  # default host/port
        # TODO: add remaining database selection and data insertion.
    except Exception:
        return False
def replace_docs(db_name, col_name, docs):
    client = mc(get_secret())
    db = client.get_database(db_name)
    col = db[col_name]
    for doc in docs:
        thread_filter = {"thread_num": doc['thread_num']}
        col.find_one_and_replace(thread_filter, doc, upsert=True)
def get_db():
    # build the Atlas connection string with the credentials interpolated into the URI
    client = mc("mongodb://%s:%s@clusterkamal-shard-00-00-henbx.mongodb.net:27017,"
                "clusterkamal-shard-00-01-henbx.mongodb.net:27017,"
                "clusterkamal-shard-00-02-henbx.mongodb.net:27017/test"
                "?ssl=true&replicaSet=clusterkamal-shard-0&authSource=admin"
                % ('myname', 'mypassword'))
    # client = mc()
    db = client.agra_base
    if db is not None:  # Database objects do not support truth testing
        print("Connected")
    return db
def __init__(self, host, port, db_name):
    try:
        self.client = mc(host, port)
        # self.client = mc('mongodb://192.168.3.123:27017')
        self.client.admin.command('ismaster')  # force a round trip to verify the connection
        self.db = self.client[db_name]
    except Exception as e:
        print('cannot connect database!', repr(e))
        sys.exit(0)
def connect(self):
    if self.tipo == 'mysql':
        import mysql.connector as my
        self.con = my.connect(**self.con_Data)
        self.error = my.Error
    elif self.tipo == 'mongo':
        from pymongo import MongoClient as mc
        client = mc(self.con_Data['server'], int(self.con_Data['port']))
        db = client[self.con_Data['database']]
        self.con = db[self.con_Data['collection']]
def login(self, server="localhost", port=27017, user="", pwd=""):
    if server == "":
        server = self.conf["server"]
        port = int(self.conf["port"])
        user = self.conf["user"]
        pwd = self.conf["pwd"]
    if user == "":
        uri = 'mongodb://{}:{}'.format(server, port)
    else:
        uri = 'mongodb://{}:{}@{}:{}'.format(user, pwd, server, port)
    self.client = mc(uri)
def insert_new_docs(db_name, col_name, docs):
    # UpdateOne comes from pymongo (from pymongo import UpdateOne)
    client = mc(get_secret())
    db = client.get_database(db_name)
    col = db[col_name]
    requests = []
    for doc in docs:
        thread_filter = {"thread_num": doc['thread_num']}
        requests.append(UpdateOne(thread_filter, {'$set': doc}, upsert=True))
    col.bulk_write(requests)
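A minimal usage sketch for insert_new_docs, assuming get_secret() returns a valid connection string; the database name, collection name, and documents below are hypothetical. Batching the upserts through a single bulk_write sends one round trip for the whole batch, in contrast to replace_docs above, which issues a find_one_and_replace per document.

# Hypothetical thread documents keyed on thread_num
docs = [
    {"thread_num": 101, "title": "first thread"},
    {"thread_num": 102, "title": "second thread"},
]
insert_new_docs("forum", "threads", docs)  # upserts both documents in one bulk_write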
def index():
    # connect to mongoDB
    connection = mc('localhost', 27017)
    # attach to test database
    db = connection.test
    # get handle for names collection
    name = db.names
    # find a single document
    item = name.find_one()
    return '<b>Hello %s!</b>' % item['name']
def write_to_mongo(docs, collection, dup=False):
    assert docs and collection
    client = mc(mongo_host[random.randint(0, 2)])
    database = client[db]
    # database.authenticate(username, password=username)
    collection = database[collection]
    count = 0
    for doc in docs:
        if dup:
            try:
                collection.insert_one(doc)
            except pymongo.errors.DuplicateKeyError as e:
                print(e)
        elif collection.find_one({'md5': doc['md5']}) is None:
            collection.insert_one(doc)
def main_action():
    client = mc()
    db = client.ITjuzi
    tobecaptured = db.ToBeCaptured
    pages = tobecaptured.find({'done': 0})
    number = tobecaptured.count_documents({'done': 0})  # Cursor.count() is deprecated
    events = db.InvestmentEvent
    for page in pages:
        delay = 3 * random.random()
        sleep(delay)
        pageid = page['pageid']
        print('now fetching page ' + str(pageid))
        data = investmentevent(pageid)
        for each in data:
            print(each)
            events.insert_one({'data': each})
        tobecaptured.update_one({'pageid': pageid}, {'$set': {'done': 1}})
        number -= 1
        print(str(number) + '...left')
def write_to_mongo(docs, collection, dup=False):
    assert docs and collection
    client = mc(host)
    database = client[db]
    database.authenticate(username, password=username)
    collection = database[collection]
    count = 0
    for doc in docs:
        if dup:
            collection.insert_one(doc)
        elif collection.find_one({'md5': doc['md5']}) is None:
            collection.insert_one(doc)
        print('Inserted #%s...' % count)
        count += 1
        time.sleep(30)
    print(collection.count_documents({}))
    assert collection.count_documents({}) == count
import urllib.request as request
import re
from pymongo import MongoClient as mc
import datetime
from time import sleep
from random import random as r
from eventlet.timeout import Timeout

client = mc()
db = client.GoRatings
players = db.TopRatings.find({'isCrawled': 0})
games = db.GameResults
ratings = db.TopRatings
num = ratings.count_documents({'isCrawled': 0})  # Cursor.count() is deprecated

for player in players:
    sleep(4 * r())
    player_name = player['name']
    print(player_name)
    url = "http://www.goratings.org/players/" + str(player['playerid']) + ".html"
    original = request.urlopen(url)
    response = original.read().decode('utf8')
    print('there are something')
    pattern = re.compile(r'''<tr><td>(.*?)</td><td>(.*?)</td> <td>(.*?)</td> <td>(.*?)</td> <td><a href="(.*?)\.html">(.*?)</a></td> <td>(.*?)</td> <td><a href="http://www.go4go.net/go/games/sgfview/(.*?)">View game</a></td> </tr>''')
def get_db_client():
    if 'dbcl' not in g:
        g.dbcl = mc('mongodb://10.131.65.27:27017/')
    return g.dbcl
##pithytimeout=0
from subprocess import getoutput as go  # the Python 2 `commands` module was removed in Python 3

print(go("pip install sseclient"))
from sseclient import SSEClient
import json
import time
from pymongo import MongoClient as mc

mip = ""  ## Your MongoDB address here
client = mc(mip, 27017)
# print(client)
cores = {}
ac = ""  # -> Get your token for this from Particle Build
get_event = "BOYA_fridge"  # this is the event we declared in "particle.publish"

while True:
    try:
        # connect to the streaming HTTPS interface from Particle; get the event for our token
        messages = SSEClient('https://api.particle.io/v1/events/%s?access_token=%s' % (get_event, ac))
        for msg in messages:  # for each message that comes in
            try:
                foo = msg.data.replace("nan", "NaN")  # format NaN correctly for python
                total = json.loads(foo)               # load the dataset as a JSON string
                data = json.loads(total['data'])      # load the data payload as a JSON string
                data['time'] = time.time()            # add a timestamp
                data['coreid'] = total['coreid']      # add the particle name
                try:
                    # try to make an event index for the structure
            son[key] = to_binary(value)
        elif isinstance(value, dict):
            son[key] = self.transform_incoming(value, collection)
    return son

def transform_outgoing(self, son, collection):
    for (key, value) in son.items():
        if isinstance(value, Binary) and value.subtype == 128:
            son[key] = from_binary(value)
        elif isinstance(value, dict):
            son[key] = self.transform_outgoing(value, collection)
    return son
"""

# client = mc()
client = mc('agm2.local', 27017)
mydb = client.tutorial
up = mydb.topics

"""
# following is for insertion of new records
topics = {"author": "Duke",
          "title": "PyMongo 101",
          "tags": ["MongoDB", "PyMongo", "Tutorial"],
          "date": datetime.utcnow()}
up_id = up.insert(topics)
new_posts = [{"author": "Mike",
              "text": "Another post!",
# -*- coding: utf-8 -*-
"""
Created on 2017/10/28 12:57 PM

@author: SimbaZhang
"""
import requests
import time
import re
from utils.log import logger
from pymongo import MongoClient as mc
from bs4 import BeautifulSoup as bs

client = mc('127.0.0.1', 27017)
db = client['review']['all_book_id']


class BookId(object):
    def __init__(self):
        self.start_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-hot'
        self.header1 = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        }
        self.header2 = {
            'Host': 'book.douban.com',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        }

    def get_book_type(self):
from pymongo import MongoClient as mc
import re
import urllib.request as fetch
from time import sleep
import random

client = mc()
db = client.IKEA
gallery_gadget = db.GalleryGadget
gallery_url = db.GalleryUrl
tobe = gallery_url.find({'iscrawled': 2})
# gallery_url.update_many({}, {'$set': {'iscrawled': 0}})
numbers = gallery_url.count_documents({'iscrawled': 2})  # Cursor.count() is deprecated
for each in tobe:
    print(numbers)
    sleep(random.random() * 2)
    gadgat = []
    url = each['url']
    response = fetch.urlopen(url)
    response = response.read().decode('utf8')
    print(response)
    # pattern_gadget = re.compile('<a title=".*?" href="/cn/zh/catalog/products/(.*?)/">', re.S)
    pattern_gadget = re.compile('href="/cn/zh/catalog/products/(.*?)/', re.S)
    result = re.findall(pattern_gadget, response)
    print(result)
    if result:
        # use a separate loop variable so the outer document `each` is not clobbered
        for item in result:
            if item not in gadgat:
                gadgat.append(item)
                print(item)
    print(url)
def __init__(self):
    self.client = mc()
    self.db = self.client.scantool
from flask import render_template, flash
from flask import request, redirect, url_for
from flask_security import login_required
from flask_login import logout_user
from extension import (render_with_opt, get_category, app, SearchForm)
from path_finder import playFinder
from station import get_station
from dbconfig import *
from pymongo import MongoClient as mc

bikeCol = get_station()
client = mc('mongodb://150.95.204.252:27017')
db = client.get_database('map_db')
collection = db.get_collection('place')


@app.route('/', methods=['GET', 'POST'])
def main():
    return redirect("/index/")


@app.route('/index/', methods=['GET', 'POST'])
def index():
    locsearch = SearchForm()
    if request.method == "POST":
        try:
            src = request.form["src"]
            dest = request.form["dest"]
            print(src, dest)
            bike = playFinder(src.replace('대한민국 ', ''),
Created on 31/10/2014

@author: aurelio
'''
import time as tm
import re, sys
import subprocess
from pymongo import MongoClient as mc
from operator import itemgetter

inicio = tm.time()

# port numbers for freeling in server
port1 = 50005
port2 = 50006

client = mc('localhost', 27017)
mydb = client.wiki
col = mydb.pages

topic = {"title": 'Instituto de Crédito Oficial'}
f = col.find_one(topic)
if f and len(f['redirect']) > 10:
    redireccion = re.sub('#REDIRECT', '', f['redirect']).strip()
    topic = {"title": redireccion}
    f = col.find_one(topic)
    if not f:
        print("no encontrado")  # not found
        sys.exit()
elif not f or len(f['texto']) < 80:
    print("no encontrado")  # not found
def __init__(self):
    self.client = mc()
    self.db = self.client.nodeDB
from django.shortcuts import render
from django.http import HttpResponse
from datetime import datetime as T
from pymongo import MongoClient as mc

client = mc("mongodb://localhost:27017/myProject")
db = client.nayan
entries = db.entries
hosts = db.hosts

import smtplib
gmail_user = '******'
gmail_password = '******'


def checkout(request):
    if request.method == 'POST':
        visitor = entries.find_one({'visitorName': request.POST.get('visitorName'), 'checkoutTime': None})
        if visitor is None:
            return render(request, 'home/msg.html', {'msg': "Not Found"})
        checkoutTime = T.now()
        entries.update_one(visitor, {"$set": {'checkoutTime': checkoutTime}})
        to = visitor["visitorEmail"]
        subject = 'OMG Super Important Message'
        body1 = f'hostName:{visitor["hostName"]}\nhostEmail:{visitor["hostEmail"]}\nhostPhoneNo:{visitor["hostPhoneNo"]}\nhostAdress:{visitor["hostAdress"]}\n'
        body2 = f'visitorName:{visitor["visitorName"]}\nvisitorEmail:{to}\nvisitorPhoneNo:{visitor["visitorPhoneNo"]}\nvisitorAdress:{visitor["visitorAdress"]}\n'
        body3 = f'checkinTime:{visitor["checkinTime"]}\ncheckoutTime:{checkoutTime}\n'
from pymongo import MongoClient as mc
from datetime import datetime as dt
import pandas as pd
from pithy import *
from datetime import datetime as dt

# Log into database
client = mc("radical-edward.princeton.edu", 47017)

# Find all distinct core IDs
dist = client.mae221.lab5_1.distinct('coreid')
print("These Photons have checked in with Big Brother:")
count = 1
for d in dist:
    print("%i) %s" % (count, d))
    count += 1
print()

# You can make, instead, a list of your coreids here
# dist = []
# dist.append('27001b000247343337373739')
# dist.append('2f002a000247343339373536')

# For each Photon
# hours to check
hr = 2
for core in dist:
from config import *
import jieba as jb

encode = 'utf-8'
# `ac`, `cc`, `mc` and `user_dict_path` are expected to come from config
# (presumably ahocorasick, codecs, and pymongo.MongoClient)
acam = ac.Automaton()
user_dict = cc.open(user_dict_path, 'r', encode)
special_words = [word.strip().encode(encode) for word in user_dict.readlines()]
for idx, word in enumerate(special_words):
    acam.add_word(word, idx)
acam.make_automaton()

mongo = mc()
db = mongo.Message
msg_overdue = db.msg_overdue

special_words_set = set(special_words)
all_words = special_words_set.copy()


def parse_group(documents, fout=None):
    print(fout)
    global all_words
    for doc in documents:
        str_text = doc['text']
        words = [
            special_words[item[1]]
            for item in acam.iter(str_text.encode(encode))
import time
from numpy import *
import json
from subprocess import getoutput as go  # the Python 2 `commands` module was removed in Python 3

# Flask Imports
from flask import Flask, request, Response, send_file
from functools import wraps

from pymongo import MongoClient as mc
import pandas as pd

sets = json.load(open("settings.json"))
mip = sets['mongo_ip']
port = sets['port']
cli = mc(mip)

# Instantiate Server
app = Flask(__name__)


# This pulls the node data
@app.route("/data/", methods=['POST'])
def out(var=None):
    form = json.loads(request.get_data())
    out = {}
    try:
        db = form['db']
        col = form['col']
        q = form['q']
        # data['time'] = time.time()
        df = pd.DataFrame(list(cli[db][col].find(q)))
def getListQuery(lis, dbase, coll):
    from pymongo import MongoClient as mc
    from collections import Counter

    check = 0
    client = mc()
    if dbase == "testIndex":
        client = mc("192.168.103.25")
    if dbase == "wiki":

        def wikiMain():
            global wikiAns, wikiAnsGot, check
            db = MySQLdb.connect("localhost", "root", "", "wiki")
            cursor = db.cursor()
            if len(lis) > 1:
                sql = """SELECT json_data from wordmatrix where word IN %s""" % (str(tuple(lis)))
            elif len(lis) == 1:
                sql = """SELECT json_data from wordmatrix where word ='%s'""" % (str(lis[0]))
            try:
                # Execute the SQL command
                cursor.execute(sql)
                # Commit your changes in the database
                db.commit()
                res = cursor.fetchall()
                listCounters = []
                listCounter = []
                ke = set([])  # stores common keys
                first = True
                # Storing common keys in ke
                for r in res:
                    d = json.loads(r[0].replace("#dot#", "."))
                    if first:
                        ke = set(d.keys())
                        first = False
                    else:
                        ke = set(d.keys()).intersection(set(ke))
                    listCounters.append(d)
                if len(list(ke)) == 0:
                    wikiAns = []  # no results found
                    wikiAnsGot = True
                for l in listCounters:
                    d = {}  # creating dict from common keys
                    for k in list(ke):
                        d[k] = l[k]
                    listCounter.append(d)
                check = 1  # no idea why Prashant used this
                result = SumDict1({})  # sum of the dicts in listCounters will be stored here
                if check == 1:
                    for li in listCounter:
                        result.merge(li)
                    global sor
                    sor = sorted(result, key=result.get, reverse=True)
                    # If we already have 1 for one keyword but the title has two keywords
                    # (e.g. "sachin tendulkar" gave Sachin:1), the right scores are
                    # Sachin:0.5 Tendulkar:0.5, so the loop below does an exact match.
                    for ind, term in enumerate(sor):
                        # remove punctuation, e.g. "ajay k. sood" vs "ajay k sood"
                        t = term.lower()  # remove_punctuations(term) does not remove hash tags
                        redirect = False
                        if "#r#" in t and t.replace("#r#", "") == "_".join(lis):
                            # the user query matches a redirect page
                            resolved = resolveRedirects(term.replace("#r#", ""))
                            sor[ind], sor[0] = sor[0], resolved
                            wikiAns = [sor[0]]
                            wikiAnsGot = True
                            break
                        elif t.lower() == "_".join(lis):
                            # not a redirect but an exact match
                            sor[ind], sor[0] = sor[0], sor[ind]
                            b = checkForDis(sor)
                            wikiAns = [b]
                            wikiAnsGot = True
                            break
                    # Exact match with the user query not found, but something adds up to 1
                    if result[sor[0]] == 1.0:
                        if "#r#" in sor[0]:
                            resolved = resolveRedirects(sor[0].replace("#r#", ""))
                            sor[0] = resolved
                            wikiAns = [sor[0]]
                            wikiAnsGot = True
                        else:
                            b = checkForDis(sor)
                            wikiAns = [b]
                            wikiAnsGot = True
                    else:
                        rnks = compareRanks(sor)
                        if not len(rnks) == 0:
                            if "#r#" in rnks[0]:
                                wikiAns = [resolveRedirects(rnks[0].replace("#r#", ""))]
                                wikiAnsGot = True
                            else:
                                wikiAns = [rnks[0]]
                                wikiAnsGot = True
                        else:
                            if "#r#" in sor[0]:
                                wikiAns = [resolveRedirects(sor[0].replace("#r#", ""))]
                                wikiAnsGot = True
                            else:
                                b = checkForDis(sor)
                                wikiAns = [b]  # worst case: not an exact match, not ranked
                                wikiAnsGot = True
                else:
                    wikiAns = []
                    wikiAnsGot = True
            except Exception as e:
                # Rollback in case there is any error
                db.rollback()
            # disconnect from server
            db.close()

        def waitForAns():
            global wikiAnsGot
            while not wikiAnsGot:
                pass

        q = "".join(lis)
        th1 = Process(target=checkInCache, args=(q,))
        th1.start()
        th2 = Process(target=wikiMain, args=())
        th2.start()
        tt = Process(target=waitForAns, args=())
        tt.start()
        tt.join()
        return wikiAns

    # the portion below is shared by both news and general
    db = client[dbase]
    results = db[coll].find({"keyword": {"$in": lis}}, {"keyword": 0, "_id": 0})
    listCounters = list(results)
    if dbase == "news":
        if len(lis) == len(listCounters):
            return list(getNewContent(listCounters))
        else:
            check = 1
    result = SumDict({})
    if check == 1:
        for listCounter in listCounters:
            result.merge(listCounter)
        sor = sorted(result, key=result.get, reverse=True)
        return sor
    else:
        return []
def xlsx():
    p = xl.load_workbook("Brand.xlsx")
    ob = p.active
    conn = mc('localhost', 27017)
    db = conn.Medicine.THCs
    db1 = conn.Medicine.SubTHCs
    db2 = conn.Medicine.manufacturer
    db3 = conn.Medicine.MedBrands
    db4 = conn.Medicine.medicine
    for i in range(2, 320798):
        on = ob.cell(row=i, column=1)
        off = ob.cell(row=i - 1, column=1)
        if on.value != off.value:
            path = {
                "Name": on.value,
                "Description": " ",
                "created": dt.datetime.utcnow(),
                "LastModified": dt.datetime.utcnow()
            }
            db.insert_one(path)
        on1 = ob.cell(row=i, column=2)
        off1 = ob.cell(row=i - 1, column=2)
        if on1.value != off1.value:
            path = {
                "Thcs": on.value,
                "SubThcs": on1.value,
                "Description": " ",
                "created": dt.datetime.utcnow(),
                "LastModified": dt.datetime.utcnow()
            }
            db1.insert_one(path)
        on2 = ob.cell(row=i, column=7)
        off2 = ob.cell(row=i - 1, column=7)
        if on2.value != off2.value:
            path = {
                "ManufacturerName": on2.value,
                "Address": " ",
                "created": dt.datetime.utcnow(),
                "LastModified": dt.datetime.utcnow()
            }
            db2.insert_one(path)
        on32 = ob.cell(row=i, column=3)
        off32 = ob.cell(row=i - 1, column=3)
        on31 = ob.cell(row=i, column=6)
        off31 = ob.cell(row=i - 1, column=6)
        if on31.value != off31.value and on32.value != off32.value:
            path = {
                "ManufactName": on2.value,
                "Composition": on31.value,
                "MedicineBrand": on32.value,
                "Description": " ",
                "created": dt.datetime.utcnow(),
                "LastModified": dt.datetime.utcnow()
            }
            db3.insert_one(path)
        on41 = ob.cell(row=i, column=4)
        off41 = ob.cell(row=i - 1, column=4)
        on42 = ob.cell(row=i, column=5)
        off42 = ob.cell(row=i - 1, column=5)
        on43 = ob.cell(row=i, column=9)
        off43 = ob.cell(row=i - 1, column=9)
        on44 = ob.cell(row=i, column=8)
        off44 = ob.cell(row=i - 1, column=8)
        if on41.value != off41.value and on42.value != off42.value and on43.value != off43.value and on44.value != off44.value:
            path = {
                "SubThc": on1.value,
                "medForm": on42.value,
                "MedicinBrand": on32.value,
                "HsnCode": on41.value,
                "MaxRP": on43.value,
                "PrimaryPack": on44.value,
                "SecondaryPack": " ",
                "TertiaryPack": " ",
                "created": dt.datetime.utcnow(),
                "LastModified": dt.datetime.utcnow()
            }
            db4.insert_one(path)
def getListQuery(lis1, dbase, coll):
    from pymongo import MongoClient as mc
    from collections import Counter

    check = 0
    client = mc()
    if dbase == "newIndex":
        client = mc("192.168.103.59")
    if dbase == "wiki":

        def wikiMain():
            global wikiAns, wikiAnsGot, check, lis
            lis = lis1
            db = MySQLdb.connect("localhost", "root", "#srmseONserver1", "wiki")
            cursor = db.cursor()
            # The length check keeps the string form of the tuple valid: a 1-tuple is
            # rendered as (key,) and the trailing comma breaks the MySQL IN query.
            if len(lis) > 1:
                sql = """SELECT `word`,`json_data` from wordmatrix where word IN %s""" % (str(tuple(lis)))
            elif len(lis) == 1:
                sql = """SELECT `word`,`json_data` from wordmatrix where word ='%s'""" % (str(lis[0]))
            try:
                cursor.execute(sql)
                db.commit()
                res = cursor.fetchall()
                listCounters = []  # initially holds all the data for the keys
                listCounter = []   # holds the dicts built from common keys
                ke = set([])       # stores common keys
                first = True
                loadedCache = {}   # stores data already decoded by json.loads
                for r in res:
                    d = json.loads(r[1].replace("#dot#", "."))
                    loadedCache[r[0]] = d
                    if first:
                        ke = set(d.keys())
                        first = False
                    else:
                        ke = set(d.keys()).intersection(set(ke))
                    listCounters.append(d)
                if len(list(ke)) == 0:
                    # got no common keys; try again after removing stop words
                    words = removeStopWords(" ".join(lis))
                    q = " ".join(words)
                    lis = words
                    th2 = Process(target=checkInCache, args=(q,))
                    th2.start()
                    listCounters = []
                    listCounter = []
                    ke = set([])  # stores common keys
                    first = True
                    # Storing common keys in ke
                    for r in res:
                        if r[0] in words:
                            d = loadedCache[r[0]]
                            if first:
                                ke = set(d.keys())
                                first = False
                            else:
                                ke = set(d.keys()).intersection(set(ke))
                            listCounters.append(d)
                    if len(list(ke)) == 0:
                        wikiAns = []  # no results found
                        wikiAnsGot = True
                for l in listCounters:
                    d = {}  # creating dict from common keys
                    for k in list(ke):
                        d[k] = l[k]
                    listCounter.append(d)
                check = 1  # no idea why Prashant used this
                result = SumDict1({})  # sum of the dicts in listCounters will be stored here
                if check == 1:
                    for li in listCounter:
                        result.merge(li)
                    global sor
                    sor = sorted(result, key=result.get, reverse=True)
                    # If we already have 1 for one keyword but the title has two keywords
                    # (e.g. "sachin tendulkar" gave Sachin:1), the right scores are
                    # Sachin:0.5 Tendulkar:0.5, so the loop below does an exact match.
                    for ind, term in enumerate(sor):
                        # remove punctuation, e.g. "ajay k. sood" vs "ajay k sood"
                        t = term.lower()  # remove_punctuations(term) does not remove hash tags
                        redirect = False
                        if "#r#" in t and t.replace("#r#", "") == "_".join(lis):
                            # the user query matches a redirect page
                            resolved = resolveRedirects(term.replace("#r#", ""))
                            sor[ind], sor[0] = sor[0], resolved
                            wikiAns = [sor[0]]
                            wikiAnsGot = True
                            return
                        elif t.lower() == "_".join(lis):
                            # not a redirect but an exact match
                            sor[ind], sor[0] = sor[0], sor[ind]
                            b = checkForDis(sor)
                            wikiAns = [b]
                            wikiAnsGot = True
                            return
                    # Exact match with the user query not found, but something adds up to 1
                    if result[sor[0]] == 1.0:
                        if "#r#" in sor[0]:
                            resolved = resolveRedirects(sor[0].replace("#r#", ""))
                            sor[0] = resolved
                            wikiAns = [sor[0]]
                            wikiAnsGot = True
                            return
                        else:
                            b = checkForDis(sor)
                            wikiAns = [b]
                            wikiAnsGot = True
                            return
                    else:
                        rnks = compareRanks(sor)
                        if not len(rnks) == 0:
                            if "#r#" in rnks[0]:
                                wikiAns = [resolveRedirects(rnks[0].replace("#r#", ""))]
                                wikiAnsGot = True
                                return
                            else:
                                wikiAns = [rnks[0]]
                                wikiAnsGot = True
                                return
                        else:
                            if "#r#" in sor[0]:
                                wikiAns = [resolveRedirects(sor[0].replace("#r#", ""))]
                                wikiAnsGot = True
                                return
                            else:
                                b = checkForDis(sor)
                                wikiAns = [b]  # worst case: not an exact match, not ranked
                                wikiAnsGot = True
                                return
                else:
                    wikiAns = []
                    wikiAnsGot = True
                    return
            except Exception as e:
                typee, value, traceback = sys.exc_info()
                # Rollback in case there is any error
                db.rollback()
                wikiAns = []
                wikiAnsGot = True
                return
            # disconnect from server
            db.close()

        def waitForAns():
            global wikiAnsGot, lis
            while not wikiAnsGot:
                pass
            if len(wikiAns) == 0:
                pass
            else:
                addToCache(lis, wikiAns[0])

        q = " ".join(lis1)  # passing the original query
        global th1, th2
        th1 = Process(target=checkInCache, args=(q,))
        th1.start()
        th2 = Process(target=wikiMain, args=())
        th2.start()
        tt = Process(target=waitForAns, args=())
        tt.start()
        tt.join()
        return wikiAns

    # the portion below is shared by both news and general
    db = client[dbase]
    results = db[coll].find({"keyword": {"$in": lis1}}, {"keyword": 0, "_id": 0})
    listCounters = list(results)
    if dbase == "news":
        if len(lis1) == len(listCounters):
            return list(getNewContent(listCounters))
        else:
            check = 1
    result = SumDict({})
    if check == 1:
        for listCounter in listCounters:
            result.merge(listCounter)
        sor = sorted(result, key=result.get, reverse=True)
        return sor
    else:
        return []
import urllib2 as u
import time
import parser
import threading
from pymongo import MongoClient as mc

client = mc()
db = client["news"]
col = db["bhaskar"]

START_DATE = time.strptime(open("./crawlers/bhaskar/start_date.txt", "r").read(), "%Y-%m-%d")
START_DATE = time.mktime(START_DATE)
BASE_URL = "http://www.bhaskar.com/archives/"


def main():
    dates = []
    # generate dates to be crawled
    days = (int(time.time()) - int(START_DATE)) / (3600 * 24)
    PREV_DATE = START_DATE
    for i in range(days):
        day = int(PREV_DATE) + (3600 * 24)
        PREV_DATE = day
        day = time.strftime("%Y-%m-%d", time.localtime(int(day)))
        dates.append(day)
    # fetch for each date and insert
    for day in dates:
        print BASE_URL + day + "/"
        fetch_url = BASE_URL + day + "/"
from pymongo import MongoClient as mc
import numpy as np
from bson.binary import Binary
import pickle

myclient = mc("mongodb://localhost:27017/")
db = myclient["faceszakh"]
col = db["faces"]
# insert a throwaway document so the database and collection are actually created, then remove it
db.faces.insert_one({"name": "Deleteme"})
col.delete_one({'name': {'$eq': 'Deleteme'}})
print('db created')
print(myclient.list_database_names())
def Connection():
    _client = mc('localhost', 27017)
    _dataBase = _client['Loterias']
    _collections = _dataBase['Contests']
    return _collections
def __init__(self):
    self.host = MONGO_HOST
    self.port = MONGO_PORT
    self.client = mc(self.host, self.port)
    self.db = self.client[MONGO_DB]
    self.collection = self.db[MONGO_COLLECTION]
def getMongo():
    global config
    from pymongo import MongoClient as mc
    db = mc(config["mongo_uri"])["iris"]
    return db