def crawler():
    # Establish connection with MongoDB. (The module-level `config` dict,
    # `logger`, and the usual imports -- pymongo, requests, bs4, urllib, os,
    # random, string, math, datetime, traceback -- are assumed to exist.)
    client = pymongo.MongoClient(config["localhost"], config["port_num"])
    db = client[config['database_name']]
    col = db[config['collection_name']]

    def fetch(url):
        # HTTP GET with the configured User-Agent; returns the response or None.
        headers = {'User-Agent': config['user_agent']}
        try:
            logger.debug("Making HTTP GET request: " + url)
            response = requests.get(url, headers=headers)
            logger.debug("Got HTML source, content length = " + str(len(response.text)))
            return response
        except Exception:
            logger.exception("Failed to get HTML source from " + url)
            traceback.print_exc()
            return None

    def save_html(soup):
        # Save the HTML content to a randomly named .txt file; return its path.
        try:
            file_name = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16)) + '.txt'
            file_path = os.path.join(os.getcwd(), config["file_dir"], file_name)
            with open(file_path, "w") as text_file:
                text_file.write(str(soup))
            return file_path
        except Exception:
            logger.exception("Cannot write link in a file.")
            return None

    def crawled_fields(response, file_path):
        # Fields recorded for a link that has just been crawled.
        if 'Content-Length' in response.headers:
            con_length = response.headers['Content-Length']
        else:
            con_length = len(response.content)
        return {
            "is_crawled": True,
            "last_crawl_date": datetime.datetime.utcnow(),
            "response_status": response.status_code,
            "content_type": response.headers['Content-Type'],
            "con_length": con_length,
            "file_path": file_path,
        }

    def store_new_links(soup, source_link):
        # Extract all anchor tags, validate each href and insert links that are
        # not yet in the collection as uncrawled documents.
        links = soup.find_all("a")
        seen = []
        for link in links:
            href = link.get('href')
            if not href:
                continue
            if href in seen:  # link already scraped in the same cycle
                print(str(href) + " link already scraped in this cycle")
                continue
            seen.append(href)
            parsed = urllib.parse.urlparse(href)
            if parsed.netloc and parsed.scheme:
                # Valid absolute URL.
                actual_link = href
            elif parsed.netloc:
                print(str(href) + " link not valid")
                continue
            else:
                # Relative URL: resolve it against the page it was found on.
                actual_link = urllib.parse.urljoin(source_link, href)
                joined = urllib.parse.urlparse(actual_link)
                if not (joined.netloc and joined.scheme):
                    print(str(actual_link) + " not valid")
                    continue
            if col.count_documents({"link": actual_link}) == 0:
                col.insert_one({
                    "link": actual_link,
                    "source_link": source_link,
                    "is_crawled": False,
                    "last_crawl_date": None,
                    "response_status": None,
                    "content_type": None,
                    "con_length": None,
                    "file_path": None,
                    "created_at": datetime.datetime.utcnow(),
                })
            else:
                print(str(actual_link) + " already exists in database.")
        return links

    if col.count_documents({}) == 0:
        # Collection is empty: seed it by scraping flinkhub.com itself.
        response = fetch(config['host_name'])
        if response is None:
            return []
        logger.debug("Extracting links from the HTML")
        soup = BeautifulSoup(response.text, 'html.parser')
        file_path = save_html(soup)
        new_doc = {
            "link": config["host_name"],
            "source_link": None,
            "created_at": datetime.datetime.utcnow(),
        }
        new_doc.update(crawled_fields(response, file_path))
        col.insert_one(new_doc)  # insert the seed link's own document
        return store_new_links(soup, config['host_name'])

    if col.count_documents({"is_crawled": False}) > 0:
        # Pick a random document that has never been crawled.
        num = col.count_documents({"is_crawled": False})
        skip = math.floor(random.random() * num)
        doc = next(col.find({"is_crawled": False}).skip(skip).limit(1))
    else:
        # All links have been crawled at least once: pick a random one whose
        # last crawl was more than 24 hours ago.
        time_dif = datetime.timedelta(days=1)
        stale_docs = []
        for single_doc in col.find({}):
            if datetime.datetime.utcnow() - single_doc["last_crawl_date"] > time_dif:
                stale_docs.append(single_doc)
        if not stale_docs:
            return []
        doc = stale_docs[random.randint(0, len(stale_docs) - 1)]

    og_link = doc['link']
    response = fetch(og_link)
    if response is None:
        return []
    logger.debug("Extracting links from the HTML")
    soup = BeautifulSoup(response.text, 'html.parser')
    file_path = save_html(soup)
    # Update the document of the link that was just crawled.
    col.update_one({"_id": doc["_id"]}, {"$set": crawled_fields(response, file_path)})
    return store_new_links(soup, og_link)
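The crawler above reads every setting from a module-level `config` dict and a `logger`, neither of which is defined in this excerpt. A minimal sketch of what they might look like is given below; only the key names are taken from the code, all values are assumptions:

```python
import logging

# Hypothetical values -- only the key names come from the crawler code above.
config = {
    "localhost": "127.0.0.1",          # MongoDB host
    "port_num": 27017,                 # MongoDB port
    "database_name": "crawler_db",
    "collection_name": "links",
    "host_name": "https://flinkhub.com",
    "user_agent": "Mozilla/5.0 (compatible; link-crawler)",
    "file_dir": "html_dumps",          # directory for saved HTML files
    "max_limit": 5000,                 # stop after this many stored links
}

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("crawler")
```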
'''
1. Install MongoDB Enterprise Server 4.2.5
   https://www.mongodb.com/download-center/enterprise
   (MongoDB Compass installed as well)
2. pip3 install pymongo
3. Add the MongoDB server to PATH:
   Control Panel\System and Security\System > Advanced system settings >
   Environment Variables > User variables: Path
   Add C:\Program Files\MongoDB\Server\4.2\bin
4. The MongoDB server should be running
5. Enter MongoDB with the mongo command
'''
import pymongo
from pprint import pprint

client = pymongo.MongoClient('mongodb://127.0.0.1:27017')

with client:
    pprint(dir(client), indent=4)
    print(client.server_info())
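After following those steps, a quick way to confirm the server is actually reachable from Python is to ask it for its database names. A minimal sketch, assuming the default local server on port 27017:

```python
import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017',
                             serverSelectionTimeoutMS=2000)
try:
    # e.g. ['admin', 'config', 'local'] on a fresh installation
    print(client.list_database_names())
except pymongo.errors.ServerSelectionTimeoutError as err:
    print("MongoDB server not reachable:", err)
```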
def config(app):
    """Configure the application"""

    # Sentry (logging)
    if app.config.get('SENTRY_DSN'):
        sentry_logging = LoggingIntegration(
            level=logging.INFO,
            event_level=logging.WARNING
        )
        app.sentry = sentry_sdk.init(
            app.config.get('SENTRY_DSN'),
            integrations=[
                sentry_logging,
                FlaskIntegration(),
                RedisIntegration()
            ]
        )

    # Database (mongo and mongoframes)
    app.mongo = pymongo.MongoClient(app.config['MONGO_URI'])
    app.db = app.mongo.get_default_database()
    mongoframes.Frame._client = app.mongo

    # Database authentication
    if app.config.get('MONGO_PASSWORD'):
        app.db.authenticate(
            app.config.get('MONGO_USERNAME'),
            app.config.get('MONGO_PASSWORD')
        )

    # Database (redis)
    if app.config['REDIS_USE_SENTINEL']:
        sentinel = Sentinel(
            app.config['REDIS_ADDRESS'],
            db=app.config['REDIS_DB'],
            password=app.config['REDIS_PASSWORD'],
            decode_responses=True
        )
        app.redis = sentinel.master_for(app.config['REDIS_SENTINEL_MASTER'])
    else:
        app.redis = StrictRedis(
            host=app.config['REDIS_ADDRESS'][0],
            port=app.config['REDIS_ADDRESS'][1],
            db=app.config['REDIS_DB'],
            password=app.config['REDIS_PASSWORD'],
            decode_responses=True
        )

    # CSRF protection
    forms.CSRF.init_app(app)

    # Manage
    app.manage = manage.Manage(app)

    # Email
    if 'EMAIL_BACKEND' in app.config:
        app.mailer = app.config['EMAIL_BACKEND'].Mailer(
            **app.config.get('EMAIL_BACKEND_SETTINGS')
        )

    # Set the application's default date format for form fields
    forms.fields.DateField.default_format = app.config.get('DATE_FORMAT')

    # Fixes

    # Increase the default cache size for jinja templates
    app.jinja_env.cache = create_cache(1000)

    # REMOTE_ADDR when running behind a proxy server
    app.wsgi_app = ProxyFix(app.wsgi_app)
""" import pymongo from MaterialPlanning import MaterialPlanning import time from dateutil import parser from utils import required_dctCN, owned_dct, aggregation, collectionCN CCSeason = 3 aggregation(collectionCN, required_dctCN, "阿米娅") update_time = parser.parse(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())) print(update_time) print('正在从企鹅物流获取数据...') server = open('data/server.txt', 'r').readline().strip() dbclient = pymongo.MongoClient(server) db = dbclient['Arknights_OneGraph'] Filter_special_items = ['荒芜行动物资补给', '罗德岛物资补给', '岁过华灯', '32h战略配给', '感谢庆典物资补给'] Filter_special_stages = ['S4-4', 'S6-4', 'S4-9'] # Calculation for CN server collection = db['Material_Event'] Event_Stages = ['FA-%d' % x for x in range(1, 9)] mp_event = MaterialPlanning(filter_stages=Filter_special_stages + Filter_special_items, filter_freq=100, update=False, printSetting='000011101111', CCSeason=CCSeason) mp_event.get_plan(required_dctCN,
def __init__(self):
    self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'],
                                      connect=False)
def get_proddata(self, url):
    config = self.config
    pgid = 1
    client = pymongo.MongoClient(config["mongolink"])
    db = client['pharmascrape']
    nins = 0
    self.logger.info("Mega-category:" + config['Mega-category'])
    self.logger.info("Category:" + config['Category'])
    self.logger.info("segment:" + config['segment'])
    self.logger.info("Sub-segment:" + config['Sub-segment'])
    run = True
    while run:
        soup = self.get_soup(url + "/" + str(pgid))
        prods = soup.find_all('div', {"class": "product mb-5"})
        self.logger.info("#Found products:" + str(len(prods)))
        for prod in prods:
            try:
                proddict = dict()
                proddict['Source'] = config['site']
                proddict['Mega-category'] = config['Mega-category']
                proddict['Category'] = config['Category']
                proddict['segment'] = config['segment']
                proddict['Sub-segment'] = config['Sub-segment']
                proddict['template'] = config['template']
                try:
                    proddict['urltoproduct'] = config['site'] + prod.find("h2", {"class": "product-title"}).find("a")['href']
                except Exception as e:
                    self.logger.error("Line 94:" + str(e))
                    proddict['urltoproduct'] = "None"
                try:
                    proddict['Product_name'] = prod.find("h2", {"class": "product-title"}).find("a")['title']
                    if db['scrapes'].find({"Source": config['site'], "Product_name": proddict['Product_name']}).count() > 0:
                        continue
                except Exception as e:
                    self.logger.error("Line 99:" + str(e))
                    proddict['Product_name'] = "None"
                try:
                    proddict['Price'] = float(prod.find("span", {"class": "price"}).text.replace("\n", "").replace("TTC", "").replace("\xa0€", "").replace(",", ".").strip())
                except Exception as e:
                    self.logger.error("Line 133:" + str(e))
                    proddict['Price'] = "None"
                try:
                    proddict["Brand"] = prod.find("h3", {"class": "product-subtitle"}).text.strip()
                except Exception as e:
                    self.logger.error("Line 106:" + str(e))
                    proddict["Brand"] = "None"
                try:
                    proddict['Crossed_out_Price'] = float(prod.find("s", {"class": "text-promo"}).text.replace("\n", "").replace("TTC", "").replace("\xa0€", "").replace(",", ".").strip())
                except Exception as e:
                    self.logger.error("Line 133:" + str(e))
                    proddict['Crossed_out_Price'] = "None"
                try:
                    proddict["Promotional_claim"] = prod.find("p", {"class": "bg-promo p-2 mb-2 text-center text-promo text-uppercase"}).text.strip()
                except Exception as e:
                    self.logger.error("Line 146:" + str(e))
                    proddict["Promotional_claim"] = "None"
                try:
                    proddict['Imagelink'] = prod.find("img")['src']
                    proddict['Imagefilename'] = proddict['Imagelink'].split("/")[len(proddict['Imagelink'].split("/")) - 1]
                except Exception as e:
                    self.logger.error("Line 138:" + str(e))
                    proddict["Imagelink"] = "None"
                    proddict["Imagefilename"] = "None"
                db['scrapes'].insert_one(proddict)
                nins = nins + 1
                self.logger.info("#insertions:" + str(nins))
            except Exception as e:
                self.logger.info("soup:" + str(prod))
                self.logger.error("Line 87:" + str(e))
                continue
        run = self.is_product(url + "/" + str(pgid))
        pgid = pgid + 1
    client.close()
def __init__(self):
    self.myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    self.mydb = self.myclient["proxies"]
    self.mycol = self.mydb["proxy"]
import csv
import time
import pandas as pd
import pymongo
import configparser

config = configparser.ConfigParser()
config.read('config.ini')

try:
    client = pymongo.MongoClient(config["DEFAULT"]["MONGO_URL"],
                                 ssl_ca_certs='./cert.pem',
                                 connectTimeoutMS=30000,
                                 socketTimeoutMS=None)
    db = client.main
    collection = db.airdrop
    client.server_info()
except pymongo.errors.ServerSelectionTimeoutError as err:
    print(err)

allocated = 0
supply = 3

# Change allocation to zero for all items so they can be reset
post_id = collection.update({"supply": supply}, {"$set": {"allocation": 0}},
                            upsert=True, multi=True)
print('Updated ' + str(post_id))

with open('../data/advisors.csv', 'rb') as csvfile:
    addresses = pd.read_csv(csvfile)
    for index, row in addresses.iterrows():
        # Check that address is valid
        try:
            allocation = int(row.allocation)
        except:
# Flask Setup
app = Flask(__name__)

# Database Setup
# The database URI
app.config['SQLALCHEMY_DATABASE_URI'] = "sqlite:///db/aqi.sqlite"
db = SQLAlchemy(app)

# Create connection variable
# conn = 'mongodb://*****:*****@ds133202.mlab.com:33202/trafficaq'
conn = 'mongodb://*****:*****@ds233452.mlab.com:33452/trafficaq'

# Pass connection to the pymongo instance.
client = pymongo.MongoClient(conn)

# Connect to a database. Will create one if not already available.
# db = client.traffic_db
mdb = client.trafficaq

# Drops collection if available to remove duplicates
mdb.trafficAQ.drop()


class AQI(db.Model):
    __tablename__ = 'aqi'
    id = db.Column(db.Integer, primary_key=True)
    Latitude = db.Column(db.String)
    Longitude = db.Column(db.String)
def open_spider(self, spider):
    self.client = pymongo.MongoClient(self.mongo_uri)
    self.db = self.client[self.mongo_db]
    self.db.authenticate(self.mongo_user, self.mongo_pass)
def initializing_main_db():
    client = pymongo.MongoClient(Database.uri)
    Database.db = client['blog']
def __init__(self, mongo_uri, mongo_db):
    self.client = pymongo.MongoClient(mongo_uri)
    self.db = self.client[mongo_db]
    self.db.user.remove()
import json
import time
from datetime import datetime
from urllib.parse import urlencode

import pymongo
import scrapy
from pandas import DataFrame

connection = pymongo.MongoClient('192.168.2.149', 27017)
db = connection["chaboshi"]
collection = db["chaboshi_car"]
model_data = collection.find({}, {"vehicle_id": 1, "maxRegYear": 1, "minRegYear": 1, "_id": 0})
car_msg_list = list(model_data)
car_msg_df = DataFrame(car_msg_list)
car_msg_df_new = car_msg_df.drop_duplicates('vehicle_id')


class ChaboshiGzSpider(scrapy.Spider):
    name = 'chaboshi_gz'
    allowed_domains = ['chaboshi.cn']
    # start_urls = ['http://chaboshi.cn/']

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(
            getattr(cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False) else 'custom_settings', None) or {},
            priority='spider')
print("badItems ",self.badItems) def fetch(apps): for app in apps: queue = threadQueue(numOfThreads,app) queue.waitForStop() print(app["id"],"finished") badItems = queue.badItems #错页重爬 for app in badItems: queue = threadQueue(numOfThreads,app) queue.waitForStop() return queue.badItems #mongodb连接 client = pymongo.MongoClient('mongodb://*****:*****@***.***.***.***:27017/steam_db') db = client.steam_db regions = db.China collection = regions.reviews requests.packages.urllib3.disable_warnings() session = requests.session() appInfos = getAllApps() # print(appInfos) numOfThreads = 1 badPages = fetch(appInfos) print("all finished") # http://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10
def run(self): app = Flask(__name__) csrf = CSRFProtect() SECRET_KEY = os.urandom(32) app.config['SECRET_KEY'] = SECRET_KEY csrf.init_app(app) c = pymongo.MongoClient() mc = pymongo.MongoClient(os.environ['FEDN_MONGO_HOST'], int(os.environ['FEDN_MONGO_PORT']), username=os.environ['FEDN_MONGO_USER'], password=os.environ['FEDN_MONGO_PASSWORD']) mdb = mc[os.environ['ALLIANCE_UID']] alliance = mdb["status"] @app.route('/') def index(): # logs_fancy = str() # for log in self.logs: # logs_fancy += "<p>" + log + "</p>\n" client = self.name state = ReducerStateToString(self.control.state()) logs = None refresh = True return render_template('index.html', client=client, state=state, logs=logs, refresh=refresh, dashboardhost=os.environ["FEDN_DASHBOARD_HOST"], dashboardport=os.environ["FEDN_DASHBOARD_PORT"]) # http://localhost:8090/add?name=combiner&address=combiner&port=12080&token=e9a3cb4c5eaff546eec33ff68a7fbe232b68a192 @app.route('/add') def add(): # TODO check for get variables name = request.args.get('name', None) address = request.args.get('address', None) port = request.args.get('port', None) # token = request.args.get('token') # TODO do validation if port is None or address is None or name is None: return "Please specify correct parameters." certificate, key = self.certificate_manager.get_or_create(address).get_keypair_raw() import base64 cert_b64 = base64.b64encode(certificate) key_b64 = base64.b64encode(key) # TODO append and redirect to index. import copy combiner = CombinerInterface(self, name, address, port, copy.deepcopy(certificate), copy.deepcopy(key)) self.control.add(combiner) ret = {'status': 'added', 'certificate': str(cert_b64).split('\'')[1], 'key': str(key_b64).split('\'')[1]} # TODO remove ugly string hack return jsonify(ret) @app.route('/seed', methods=['GET', 'POST']) def seed(): if request.method == 'POST': # upload seed file uploaded_seed = request.files['seed'] if uploaded_seed: self.control.commit(uploaded_seed.filename, uploaded_seed) else: h_latest_model_id = self.control.get_latest_model() model_info = self.control.get_model_info() return render_template('index.html', h_latest_model_id=h_latest_model_id, seed=True, model_info=model_info) seed = True return redirect(url_for('seed', seed=seed)) # http://localhost:8090/start?rounds=4&model_id=879fa112-c861-4cb1-a25d-775153e5b548 @app.route('/start', methods=['GET', 'POST']) def start(): if request.method == 'POST': timeout = request.form.get('timeout', 180) rounds = int(request.form.get('rounds', 1)) task = (request.form.get('task', '')) active_clients = request.form.get('active_clients', 2) clients_required = request.form.get('clients_required', 2) clients_requested = request.form.get('clients_requested', 8) latest_model_id = self.control.get_latest_model() config = {'round_timeout': timeout, 'model_id': latest_model_id, 'rounds': rounds, 'active_clients': active_clients, 'clients_required': clients_required, 'clients_requested': clients_requested, 'task': task} self.control.instruct(config) return redirect(url_for('index', message="Sent execution plan.")) else: # Select rounds UI rounds = range(1, 100) latest_model_id = self.control.get_latest_model() return render_template('index.html', round_options=rounds, latest_model_id=latest_model_id) client = self.name state = ReducerStateToString(self.control.state()) logs = None refresh = False return render_template('index.html', client=client, state=state, logs=logs, refresh=refresh) @app.route('/assign') def assign(): name = request.args.get('name', None) 
combiner_preferred = request.args.get('combiner', None) import uuid id = str(uuid.uuid4()) if combiner_preferred: combiner = self.control.find(combiner_preferred) else: combiner = self.control.find_available_combiner() if combiner: # certificate, _ = self.certificate_manager.get_or_create(combiner.name).get_keypair_raw() import base64 cert_b64 = base64.b64encode(combiner.certificate) response = {'host': combiner.address, 'port': combiner.port, 'certificate': str(cert_b64).split('\'')[1]} return jsonify(response) elif combiner is None: abort(404, description="Resource not found") # 1.receive client parameters # 2. check with available combiners if any clients are needed # 3. let client know where to connect. return @app.route('/infer') def infer(): result = "" try: self.control.set_model_id() except fedn.exceptions.ModelError: print("Failed to seed control.") return result # plot metrics from DB def _scalar_metrics(metrics): """ Extract all scalar valued metrics from a MODEL_VALIDATON. """ data = json.loads(metrics['data']) data = json.loads(data['data']) valid_metrics = [] for metric, val in data.items(): # If it can be converted to a float it is a valid, scalar metric try: val = float(val) valid_metrics.append(metric) except: pass return valid_metrics @app.route('/plot') def plot(): box = 'box' plot = create_plot(box) show_plot = True return render_template('index.html', show_plot=show_plot, plot=plot) def create_plot(feature): if feature == 'table': return create_table_plot() elif feature == 'timeline': return create_timeline_plot() elif feature == 'ml': return create_ml_plot() elif feature == 'box': return create_box_plot() else: return 'No plot!' @app.route('/plot_type', methods=['GET', 'POST']) def change_features(): feature = request.args['selected'] graphJSON = create_plot(feature) return graphJSON def create_table_plot(): metrics = alliance.find_one({'type': 'MODEL_VALIDATION'}) if metrics == None: fig = go.Figure(data=[]) fig.update_layout(title_text='No data currently available for mean metrics') table = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return table valid_metrics = _scalar_metrics(metrics) if valid_metrics == []: fig = go.Figure(data=[]) fig.update_layout(title_text='No scalar metrics found') table = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return table all_vals = [] models = [] for metric in valid_metrics: validations = {} for post in alliance.find({'type': 'MODEL_VALIDATION'}): e = json.loads(post['data']) try: validations[e['modelId']].append(float(json.loads(e['data'])[metric])) except KeyError: validations[e['modelId']] = [float(json.loads(e['data'])[metric])] vals = [] models = [] for model, data in validations.items(): vals.append(numpy.mean(data)) models.append(model) all_vals.append(vals) header_vals = valid_metrics models.reverse() values = [models] print(all_vals, flush=True) for vals in all_vals: vals.reverse() values.append(vals) fig = go.Figure(data=[go.Table( header=dict(values=['Model ID'] + header_vals, line_color='darkslategray', fill_color='lightskyblue', align='left'), cells=dict(values=values, # 2nd column line_color='darkslategray', fill_color='lightcyan', align='left')) ]) fig.update_layout(title_text='Summary: mean metrics') table = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return table def create_timeline_plot(): trace_data = [] x = [] y = [] base = [] for p in alliance.find({'type': 'MODEL_UPDATE_REQUEST'}): e = json.loads(p['data']) cid = e['correlationId'] for cc in alliance.find({'sender': p['sender'], 'type': 
'MODEL_UPDATE'}): da = json.loads(cc['data']) if da['correlationId'] == cid: cp = cc cd = json.loads(cp['data']) tr = datetime.strptime(e['timestamp'], '%Y-%m-%d %H:%M:%S.%f') tu = datetime.strptime(cd['timestamp'], '%Y-%m-%d %H:%M:%S.%f') ts = tu - tr base.append(tr.timestamp()) x.append(ts.total_seconds()) y.append(p['sender']['name']) trace_data.append(go.Bar( x=x, y=y, orientation='h', base=base, marker=dict(color='royalblue'), name="Training", )) x = [] y = [] base = [] for p in alliance.find({'type': 'MODEL_VALIDATION_REQUEST'}): e = json.loads(p['data']) cid = e['correlationId'] for cc in alliance.find({'sender': p['sender'], 'type': 'MODEL_VALIDATION'}): da = json.loads(cc['data']) if da['correlationId'] == cid: cp = cc cd = json.loads(cp['data']) tr = datetime.strptime(e['timestamp'], '%Y-%m-%d %H:%M:%S.%f') tu = datetime.strptime(cd['timestamp'], '%Y-%m-%d %H:%M:%S.%f') ts = tu - tr base.append(tr.timestamp()) x.append(ts.total_seconds()) y.append(p['sender']['name']) trace_data.append(go.Bar( x=x, y=y, orientation='h', base=base, marker=dict(color='lightskyblue'), name="Validation", )) layout = go.Layout( barmode='stack', showlegend=True, ) fig = go.Figure(data=trace_data, layout=layout) fig.update_xaxes(title_text='Timestamp') fig.update_layout(title_text='Alliance timeline') # tab = go.Figure(data=[go.Table( # header=dict(values=['Model updates', 'Model Validations'], # line_color='darkslategray', # fill_color='lightskyblue', # align='left'), # cells=dict(values=[[100, 90, 80, 90], # 1st column # [95, 85, 75, 95]], # 2nd column # line_color='darkslategray', # fill_color='lightcyan', # align='left')) # ]) # # tab.update_layout(width=500, height=300) # tab.update_layout(title_text='Summary') timeline = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return timeline def create_ml_plot(): metrics = alliance.find_one({'type': 'MODEL_VALIDATION'}) if metrics == None: fig = go.Figure(data=[]) fig.update_layout(title_text='No data currently available for Mean Absolute Error') ml = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return ml data = json.loads(metrics['data']) data = json.loads(data['data']) valid_metrics = [] for metric, val in data.items(): # Check if scalar - is this robust ? 
if isinstance(val, float): valid_metrics.append(metric) # Assemble a dict with all validations validations = {} clients = {} for post in alliance.find({'type': 'MODEL_VALIDATION'}): try: e = json.loads(post['data']) clients[post['sender']['name']].append(json.loads(e['data'])[metric]) except KeyError: clients[post['sender']['name']] = [] rounds = [] traces_data = [] for c in clients: print(clients[c], flush=True) traces_data.append(go.Scatter( x=rounds, y=clients[c], name=c )) fig = go.Figure(traces_data) fig.update_xaxes(title_text='Rounds') fig.update_yaxes(title_text='MAE', tickvals=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) fig.update_layout(title_text='Mean Absolute Error Plot') ml = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return ml def create_box_plot(): metrics = alliance.find_one({'type': 'MODEL_VALIDATION'}) if metrics == None: fig = go.Figure(data=[]) fig.update_layout(title_text='No data currently available for metric distribution over alliance ' 'participants') box = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return box valid_metrics = _scalar_metrics(metrics) if valid_metrics == []: fig = go.Figure(data=[]) fig.update_layout(title_text='No scalar metrics found') box = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) return box # Just grab the first metric in the list. # TODO: Let the user choose, or plot all of them. if "accuracy" in valid_metrics: metric = "accuracy" else: metric = valid_metrics[0] validations = {} for post in alliance.find({'type': 'MODEL_VALIDATION'}): e = json.loads(post['data']) try: validations[e['modelId']].append(float(json.loads(e['data'])[metric])) except KeyError: validations[e['modelId']] = [float(json.loads(e['data'])[metric])] box = go.Figure() x = [] y = [] box_trace = [] for model_id, acc in validations.items(): x.append(model_id) y.append(numpy.mean([float(i) for i in acc])) if len(acc) >= 2: box.add_trace(go.Box(y=acc, name=str(model_id), marker_color="royalblue", showlegend=False)) rounds = list(range(len(y))) box.add_trace(go.Scatter( x=x, y=y, name='Mean' )) box.update_xaxes(title_text='Model ID') box.update_yaxes(tickvals=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) box.update_layout(title_text='Metric distribution over alliance participants: {}'.format(metric)) box = json.dumps(box, cls=plotly.utils.PlotlyJSONEncoder) return box # @app.route('/seed') # def seed(): # try: # result = self.inference.infer(request.args) # except fedn.exceptions.ModelError: # print("no model") # # return result # import os, sys # self._original_stdout = sys.stdout # sys.stdout = open(os.devnull, 'w') if self.certificate: print("trying to connect with certs {} and key {}".format(str(self.certificate.cert_path), str(self.certificate.key_path)), flush=True) app.run(host="0.0.0.0", port="8090", ssl_context=(str(self.certificate.cert_path), str(self.certificate.key_path)))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@Time : 2019/10/28 14:28
@Auth : 明明
@IDE  : PyCharm
"""
import json
import xlwt
import os
import sys
import pymongo
from setting import MONGO_HOST, MONGO_PORT, REDIS_HOST, REDIS_PORT

client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)


# def readjson():
#     db = client["OTHERS"]
#     collection = db["zhihu_answer"]
#     info_li = []
#     info = collection.find({})
#     j = 0
#     for i in info:
#         j += 1
#         if j > 2000:
#             break
#         info_li.append(i)
#     return info_li
def conn(self):
    client = pymongo.MongoClient(host=Config().MONGODB['host'],
                                 port=Config().MONGODB['port'])
    self.tdb = client[Config().MONGODB['dbName']]
def get_coll():
    client = pymongo.MongoClient('127.0.0.1', 27017)
    db = client.nnn
    user = db.user_colletion
    return user
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


def search():
    try:
        browser.get('https://www.taobao.com/')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input.send_keys('电脑')
        submit.click()
        total = wait.until(
def open_spider(self, spider):
    self.client = pymongo.MongoClient(self.mongo_uri)
    self.db = self.client[self.mong_db]
import logging
import random
import pickle
import os

import pymongo
from telegram.ext import ConversationHandler
from telegram import ReplyKeyboardMarkup, ReplyKeyboardRemove, ChatAction

from quizbot.quiz.question_factory import QuestionBool, QuestionChoice, QuestionChoiceSingle, \
    QuestionNumber, QuestionString
from quizbot.quiz.attempt import Attempt

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

db = pymongo.MongoClient(os.environ.get('MONGODB')).quizzes

# Dict to store user data like an attempt instance
userDict = dict()


def start(update, _):
    """
    Starts a conversation about an attempt at a quiz.
    Welcomes the user and asks for a quiz.
    """
    logger.info('[%s] Attempt initialized', update.message.from_user.username)
    if update.message.from_user.id in userDict:
        # user is in the middle of a quiz and can't attempt a second one
        logger.info('[%s] Attempt canceled because the user is in the middle of a quiz.',
                    update.message.from_user.username)
def __init__(self):
    self.client = pymongo.MongoClient(host=mongo_host, connect=False)
    self.db = self.client["zimuzu"]
import pymongo
import os
from flask_httpauth import HTTPBasicAuth


def encode_mongo_obj(obj):
    """Convert a Mongo object into a dict for Flask-RESTful"""
    obj['_id'] = str(obj['_id'])
    return obj


# Database stuff is here because we don't wanna connect multiple times
uri = os.environ.get('MONGOLAB_URI')
print("connecting to URI: {}".format(uri))
client = pymongo.MongoClient(uri)
db = client.get_default_database()

# Auth stuff is global
auth = HTTPBasicAuth()
#T Fabiha
#SoftDev2 pd7
#K06 -- Yummy Mongo Py
#2019-03-01

import pymongo

SERVER_ADDR = "159.65.231.92"
connection = pymongo.MongoClient(SERVER_ADDR)
db = connection.test
collection = db.restaurants


def in_borough(borough):
    obj = collection.find({"borough": borough})
    for i in obj:
        print(i)


def in_zip(zipcode):
    obj = collection.find({"address.zipcode": zipcode})
    for i in obj:
        print(i)


def in_zip_w_grade(zipcode, grade):
    obj = collection.find({"$and": [{"address.zipcode": zipcode}, {"grades.grade": grade}]})
    for i in obj:
        print(i)


def in_zip_below(zipcode, score):
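The helpers above simply print the matching restaurant documents. A short, hypothetical usage sketch follows; the borough, zip code and grade values are illustrative only and assume the standard NYC restaurants sample data in `db.restaurants`:

```python
# Hypothetical calls -- argument values are made up for illustration.
if __name__ == "__main__":
    in_borough("Brooklyn")           # all restaurants in Brooklyn
    in_zip("11215")                  # all restaurants in zip code 11215
    in_zip_w_grade("11215", "A")     # grade-A restaurants in zip code 11215
```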
def __init__(self):
    client = pymongo.MongoClient("mongo", 27017)
    self.account_collection = client["accounts"]['account']
def check_mongodb(host, port, user, passwd, server_id, tags):
    try:
        func.mysql_exec(
            "insert into mongodb_status_history SELECT *,LEFT(REPLACE(REPLACE(REPLACE(create_time,'-',''),' ',''),':',''),12) from mongodb_status where server_id='%s';"
            % (server_id), '')
        func.mysql_exec(
            "delete from mongodb_status where server_id='%s';" % (server_id), '')

        #connect = pymongo.Connection(host,int(port))
        client = pymongo.MongoClient(host, int(port))
        db = client['admin']
        db.authenticate(user, passwd)
        serverStatus = client.admin.command(
            bson.son.SON([('serverStatus', 1), ('repl', 2)]))
        time.sleep(1)
        serverStatus_2 = client.admin.command(
            bson.son.SON([('serverStatus', 1), ('repl', 2)]))

        connect = 1
        ok = int(serverStatus['ok'])
        version = serverStatus['version']
        uptime = serverStatus['uptime']
        connections_current = serverStatus['connections']['current']
        connections_available = serverStatus['connections']['available']
        globalLock_activeClients = serverStatus['globalLock']['activeClients']['total']
        globalLock_currentQueue = serverStatus['globalLock']['currentQueue']['total']
        mem_bits = serverStatus['mem']['bits']
        mem_resident = serverStatus['mem']['resident']
        mem_virtual = serverStatus['mem']['virtual']
        mem_supported = serverStatus['mem']['supported']
        mem_mapped = serverStatus['mem']['mapped']
        mem_mappedWithJournal = serverStatus['mem']['mappedWithJournal']
        network_bytesIn_persecond = int(serverStatus_2['network']['bytesIn']) - int(serverStatus['network']['bytesIn'])
        network_bytesOut_persecond = int(serverStatus_2['network']['bytesOut']) - int(serverStatus['network']['bytesOut'])
        network_numRequests_persecond = int(serverStatus_2['network']['numRequests']) - int(serverStatus['network']['numRequests'])
        opcounters_insert_persecond = int(serverStatus_2['opcounters']['insert']) - int(serverStatus['opcounters']['insert'])
        opcounters_query_persecond = int(serverStatus_2['opcounters']['query']) - int(serverStatus['opcounters']['query'])
        opcounters_update_persecond = int(serverStatus_2['opcounters']['update']) - int(serverStatus['opcounters']['update'])
        opcounters_delete_persecond = int(serverStatus_2['opcounters']['delete']) - int(serverStatus['opcounters']['delete'])
        opcounters_command_persecond = int(serverStatus_2['opcounters']['command']) - int(serverStatus['opcounters']['command'])

        # replset
        try:
            repl = serverStatus['repl']
            setName = repl['setName']
            replset = 1
            if repl['secondary'] == True:
                repl_role = 'secondary'
                repl_role_new = 's'
            else:
                repl_role = 'master'
                repl_role_new = 'm'
        except:
            replset = 0
            repl_role = 'master'
            repl_role_new = 'm'

        ##################### insert data to mysql server #############################
        sql = "insert into mongodb_status(server_id,host,port,tags,connect,replset,repl_role,ok,uptime,version,connections_current,connections_available,globalLock_currentQueue,globalLock_activeClients,mem_bits,mem_resident,mem_virtual,mem_supported,mem_mapped,mem_mappedWithJournal,network_bytesIn_persecond,network_bytesOut_persecond,network_numRequests_persecond,opcounters_insert_persecond,opcounters_query_persecond,opcounters_update_persecond,opcounters_delete_persecond,opcounters_command_persecond) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
        param = (server_id, host, port, tags, connect, replset, repl_role, ok,
                 uptime, version, connections_current, connections_available,
                 globalLock_currentQueue, globalLock_activeClients, mem_bits,
                 mem_resident, mem_virtual, mem_supported, mem_mapped,
                 mem_mappedWithJournal, network_bytesIn_persecond,
                 network_bytesOut_persecond, network_numRequests_persecond,
                 opcounters_insert_persecond, opcounters_query_persecond,
                 opcounters_update_persecond, opcounters_delete_persecond,
                 opcounters_command_persecond)
        func.mysql_exec(sql, param)
        role = 'm'
        func.update_db_status_init(repl_role_new, version, host, port, tags)
    except Exception as e:
        logger_msg = "check mongodb %s:%s : %s" % (host, port, e)
        logger.warning(logger_msg)
        try:
            connect = 0
            sql = "insert into mongodb_status(server_id,host,port,tags,connect) values(%s,%s,%s,%s,%s)"
            param = (server_id, host, port, tags, connect)
            func.mysql_exec(sql, param)
        except Exception as e:
            logger.error(e)
            sys.exit(1)
def getAnswers():
    myClient = pymongo.MongoClient(mongo_connectStr)
    mydb = myClient['helpit']
    user_collection = mydb['user']
    topic_collection = mydb['topics']
    analysis_collection = mydb['analysis']

    user_accounts = user_collection.find({'source': 'stackoverflow'})
    today = datetime.date.today()
    lastweek = today - datetime.timedelta(days=7)
    start = str(int(datetime.datetime(lastweek.year, lastweek.month, lastweek.day, 0, 0, 0).timestamp()))

    for account in user_accounts:
        if 'account' in account.keys():
            accountid = int(account['account'])
            upn = account['upn']
            url_str = ("https://api.stackexchange.com/2.2/users/" + str(accountid)
                       + "/answers?order=desc&sort=activity&site=stackoverflow"
                       + "&filter=!b1MMEr*sm*wys1&pagesize=100&fromdate=" + start)
            pageIndex = 1
            while True:
                data = requests.get(url_str + "&page=" + str(pageIndex),
                                    headers={"Content-type": "text/json"}).json()
                messages = data['items']
                if messages is None or len(messages) == 0:
                    break
                pageIndex += 1
                for j in range(len(messages)):
                    post_time = datetime.datetime.fromtimestamp(messages[j]['last_activity_date'])
                    is_accepted = messages[j]['is_accepted']
                    comments = []
                    comment_count = messages[j]['comment_count']
                    topics = getTopics(topic_collection, messages[j]['body'])
                    score = 0.0
                    if comment_count > 0:
                        comments = messages[j]['comments']
                        for t in range(comment_count):
                            score += getSentiment(comments[t]['body'])
                    if is_accepted:
                        score = score * 1.5
                    score += 0.5
                    analysis_collection.insert_one({
                        'userId': accountid,
                        'upn': upn,
                        'source': 'stackoverflow',
                        'post_time': post_time,
                        'is_accepted': is_accepted,
                        'comment_count': comment_count,
                        'score': score,
                        'topic': topics
                    })

    report_col = mydb['report']
    stime = datetime.datetime(lastweek.year, lastweek.month, lastweek.day, 0, 0, 0)
    today_time = datetime.datetime(today.year, today.month, today.day, 0, 0, 0)
    while stime < today_time:
        stime += datetime.timedelta(days=1)
        dtmp = stime + datetime.timedelta(days=1)
        adocs = analysis_collection.aggregate([
            {"$match": {'post_time': {'$lt': dtmp, '$gte': stime}, 'source': 'stackoverflow'}},
            {"$group": {'_id': '$upn', 'score': {'$sum': '$score'}, 'count': {'$sum': 1}}}
        ])
        for a in adocs:
            report_col.insert_one({
                'upn': a['_id'],
                'source': 'stackoverflow',
                'count': a['count'],
                'score': a['score'],
                'date': time.mktime(stime.timetuple()) * 1000
            })
# -*- coding: utf-8 -*-
# 2018/4/10 9:03
# Scrape the Douban Music Top 250 and write the data into MongoDB:
# title, performer, genre, release date, publisher and rating.
'''
Regular expressions are used mainly because the performer information is not
formatted consistently, e.g.
https://music.douban.com/subject/6064884/
https://music.douban.com/subject/4060882/
Reference running time: 412s
'''
import requests
import pymongo
import time
import re

client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
musictop = mydb['musictop']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3371.0 Safari/537.36',
}


def get_url(url):
    t = requests.get(url, headers=headers).text
    hrefs = re.findall('<a class="nbg" href="(.*?)"', t, re.S)
    for href in hrefs:
        res = requests.get(href, headers=headers)
        name = re.findall('<div id="wrapper">.*?<h1>.*?<span>(.*?)</span>', res.text, re.S)[0]
        author = re.findall('表演者:.*?>(.*?)</a>', res.text, re.S)[0]
import pymongo

myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["testdatabase"]
mycol = mydb["employee"]

mydict = {"name": "John", "address": "Highway 37"}

x = mycol.insert_one(mydict)
print("last inserted ID : ", x.inserted_id)
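`insert_one` returns an `InsertOneResult`, and its `inserted_id` can be used to read the document straight back. A minimal sketch, assuming the same `mycol` collection as above:

```python
# Read the freshly inserted document back by its _id (same collection as above).
found = mycol.find_one({"_id": x.inserted_id})
print(found)  # e.g. {'_id': ObjectId(...), 'name': 'John', 'address': 'Highway 37'}
```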
def thread_crawler():
    try:
        # Run five crawler() workers in parallel and gather their results.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            threads = [executor.submit(crawler) for i in range(5)]
            results = [thread.result() for thread in threads]
            return results  # list of lists returned by 5 threads
    except:
        logger.debug("Threads couldn't be made.")


if __name__ == "__main__":
    while True:
        logger.debug('Starting process')
        scraped_links = thread_crawler()  # starting process
        for result in scraped_links:
            try:
                logger.debug("Extracted " + str(len(result)) + " links from HTML")
            except:
                logger.debug("Nothing returned in this cycle.")
        main_client = pymongo.MongoClient(config['localhost'], config['port_num'])
        database = main_client[config['database_name']]
        collection = database[config['collection_name']]
        if collection.count_documents({}) >= config['max_limit']:
            # check whether we have scraped the maximum number of links yet
            logger.debug(str(config['max_limit']) + " links scraped. Ending process!!!")
            break
        time.sleep(5.0)