def get_data(movie_id):
    url = "https://movie.douban.com/subject/{}".format(movie_id)
    r = requests.get(url, headers=HEADERS)
    if r.status_code != 200:
        raise Exception(r.status_code)
    content = r.content

    # Pull the release year out of the <span class="year"> element.
    s = content.find("<span class=\"year\">")
    s = content.find("(", s + 1)
    e = content.find("</span>", s + 1)
    year = int(content[s + 1:e - 1])

    # Pull name, rating and vote count from the embedded ld+json block.
    s = content.find("<script type=\"application/ld+json\">")
    s = content.find("{", s + 1)
    e = content.find("</script>", s + 1)
    json_data = json.loads(content[s:e - 1], strict=False)
    name = json_data.get(u'name')
    score = json_data.get(u'aggregateRating').get(u'ratingValue')
    if score:
        score = float(score)
    else:
        score = "NULL"
    votes = json_data.get(u'aggregateRating').get(u'ratingCount')
    if votes:
        votes = int(votes)
    else:
        votes = "NULL"

    if db.check_exists(movie_id):
        db.update_data(movie_id, name, year, score, votes)
    else:
        db.insert_data(movie_id, name, year, score, votes)
    logging.info("finish %s %s %s %s %s", movie_id, name, year, score, votes)

    global backup_list
    if len(backup_list) < 100:
        find_next(content)
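# get_data above relies on module-level state that is not shown (requests, json,
# logging, HEADERS, db, backup_list, find_next). The sketch below is only a guess
# at that setup; the HEADERS value and the backup_list seed are assumptions, and
# the db helpers and find_next remain project-specific.
import json
import logging

import requests

HEADERS = {"User-Agent": "Mozilla/5.0"}   # assumed: Douban rejects the default requests UA
backup_list = []                          # assumed: ids of pages still waiting to be crawled
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")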
def crawl_community_list():
    global street_code_list
    street_code_list = query_data(4)
    try:
        page_urls = generate_page_url()
        for k, page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                for item in special_url:
                    response = request_util(item['page_url'], item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="villagetable").find_all("tr", class_="villagetr")
            for k, item in enumerate(info_list):
                #street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                code = item.contents[0].get_text().encode('utf8')
                name = item.contents[2].get_text().encode('utf8')
                parent_code, parent_name = get_street_code(code)
                level = 5
                print code, name, parent_code, parent_name
                insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
def crawl_district_list():
    global city_code_list
    city_code_list = query_data(2)
    try:
        page_urls = generate_page_url()
        for k, page_url in enumerate(page_urls):
            print page_url
            response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="countytable").find_all("tr", class_="countytr")
            for k, item in enumerate(info_list):
                if item.contents[0].find('a', {'href': True}):
                    #street_url = common_url_prefix + url_code + item.contents[0].a.attrs['href'].encode('utf8')
                    code = item.contents[0].a.get_text().encode('utf8')
                    name = item.contents[1].a.get_text().encode('utf8')
                    parent_code, parent_name = get_city_code(code)
                    level = 3
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)
                    #crawl_street_detail(street_url)
    except Exception:
        print traceback.format_exc()
def isLogin():
    # Check whether we are already logged in by requesting user-specific data.
    timestamp = time.time() * 1000
    url = "https://dig.chouti.com/getTopTenLinksOrComments.json?_=%s" % timestamp
    # Disable redirects: a failed login redirects to the home page, which also answers 200.
    response = session.get(url, headers=headers, allow_redirects=False)
    resp_json = response.json()
    data = resp_json.get("result")
    data = data.get("data")
    # print(data[1])
    chouti_ids = []
    for exits in data:
        chouti_id = exits.get("id")
        chouti_ids.append(chouti_id)
    exits_ids = get_data_exits(*chouti_ids)
    end_ids = list(set(chouti_ids).difference(set(exits_ids)))
    for row in data:
        chouti_id = row.get("id")      # chouti entry id
        chouti_content = row           # chouti entry content
        if chouti_id not in end_ids:
            pprint.pprint(chouti_content)
            continue
        chouti_comments = comments(chouti_id)   # chouti entry comments
        save_data = dict(chouti_id=chouti_id,
                         chouti_content=chouti_content,
                         chouti_comments=chouti_comments)
        insert_data(**save_data)
    login_code = response.status_code
    if login_code == 200:
        return True
    else:
        return False
def crawl_province_list():
    response = request_util(province_url)
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('div', class_="TRS_PreAppend").find_all("p", class_="MsoNormal")
        for k, item in enumerate(info_list):
            code_item = {}
            code = item.find("span", attrs={'lang': 'EN-US'})
            code_item['code'] = code.get_text().strip()
            content_list = item.find_all("span")
            code_item['name'] = content_list[len(content_list) - 1].get_text().strip()
            code_list.append(code_item)
        for k, item in enumerate(code_list):
            if item['code'].find("0000") > 0:
                code = item['code'].encode('utf8') + "000000"
                name = item['name'].encode('utf8')
                parent_code = 0
                parent_name = ""
                level = 1
                # p_code_item = {}
                # p_code_item['code'] = code
                # p_code_item['name'] = name
                # province_code_list.append(p_code_item)
                insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
def output(self):
    '''Store data to MongoDB.'''
    data = {
        'user_url': self.url,
        'user_name': self.user_name,
        'user_gender': self.user_gender,
        'user_location': self.user_location,
        'user_followees': self.user_followees,
        'user_followers': self.user_followers,
        'user_be_agreed': self.user_be_agreed,
        'user_be_thanked': self.user_be_thanked,
        'user_education_school': self.user_education_school,
        'user_education_subject': self.user_education_subject,
        'user_employment': self.user_employment,
        'user_employment_extra': self.user_employment_extra,
        'user_bio': self.user_bio,
        'user_content': self.user_content,
        'user_topics': self.user_topics,
        'user_answers': self.user_answers,
        'user_topics_num': self.user_topics_num,
        'user_questions_num': self.user_questions_num,
        'user_answers_num': self.user_answers_num,
        'user_articles_num': self.user_articles_num,
        'user_favorites_num': self.user_favorites_num
    }
    insert_data(data)
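# output() above delegates persistence to insert_data, and its docstring says the
# target is MongoDB. A minimal sketch of such a helper with pymongo might look like
# this; the host, database and collection names are assumptions, not the project's
# real configuration.
from pymongo import MongoClient

_client = MongoClient("mongodb://localhost:27017/")
_users = _client["zhihu"]["users"]          # assumed database/collection names

def insert_data(data):
    """Insert one user document into MongoDB."""
    _users.insert_one(data)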
def crawl_street_list():
    # page_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/52/03/520324.html"
    # response = request_util(page_url,'gb18030')
    # print response
    # return
    global district_code_list
    district_code_list = query_data(3)
    try:
        page_urls = generate_page_url()
        for k, page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                for item in special_url:
                    response = request_util(item['page_url'], item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="towntable").find_all("tr", class_="towntr")
            for k, item in enumerate(info_list):
                if item.contents[0].find('a', {'href': True}):
                    #street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                    code = item.contents[0].a.get_text().encode('utf8')
                    name = item.contents[1].a.get_text().encode('utf8')
                    parent_code, parent_name = get_district_code(code)
                    level = 4
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
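# The crawl_* functions in this section all rely on a request_util(url, encoding)
# helper that is not shown. A minimal sketch, assuming it simply fetches the
# stats.gov.cn page with requests and decodes it using the given encoding, could be:
import requests

def request_util(url, encoding='utf8'):
    """Fetch a page and decode it with the caller-supplied encoding."""
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    resp.encoding = encoding
    return resp.text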
def add_user():
    user_data = request.get_json()
    new_user = User(first_name=user_data['first_name'],
                    last_name=user_data['last_name'],
                    email=user_data['email'],
                    password=user_data['password'])
    db.insert_data(new_user)
    return "Connected to the database and added the new user!"
def add_new_stock():
    global conn
    data = {"stock_code": input("请输入股票代码:\n"),
            "market": input("请输入股票市场(沪A为1,深A为2):\n"),
            "cost_price": str(input("请输入成本价:\n")),
            "shares_held": str(input("请输入持有份额:\n"))}
    db.insert_data(conn, data)
    print(format.format_light_content("添加成功."))
def queue(id, title):
    if not db.is_exist(id):
        player = kodi()
        db.insert_data({"youtube_id": id, "title": title, "status": "idle"})
        player.GUI.ShowNotification(title=title, message="Successfully Queued", displaytime=20000)
        return "Song Successfully Queued"
    else:
        return "Song is already queued"
def index():
    temp = pickle.dumps(np.array([123, 1234, 456]))
    temp_dict = {'data': temp}
    db.insert_data('faces', temp_dict)
    folders = os.listdir('/home/praneet/Downloads/Compressed/images_data/Andrews College/')
    return render_template('index.html', folders=folders)
def register():
    if request.method == 'GET':
        return render_template("register.html")
    else:
        email = request.form['email']
        pwd = request.form['pwd']
        print("전달된값:", email, pwd)
        db.insert_data(email, pwd)
        return '회원가입 데이터(POST)'
def count_worlds_at_url(url):
    resp = requests.get(url, timeout=2)
    create_log(log_name="url")
    data = {"url": url, "status_code": resp.status_code}
    insert_data(data)
    logging.info(resp.status_code)
    if resp.status_code != 200:
        return url
def anadir(self):
    print("---------------------")
    print("Añadir nuevo contacto")
    print("---------------------")
    nombre = input("Introduzca el nombre: ")
    apellidos = input("Introduzca el apellido: ")
    telefono = input("Introduzca el teléfono: ")
    email = input("Introduzca el email: ")
    #self.contactos.append({'nombre': nom, 'telf': telf, 'email': email})
    insert_data(nombre, apellidos, telefono, email)
def saveData():
    # get values as json
    values = request.get_json()
    image_type = values.get('type')
    data = values.get('fields')
    db.insert_data(image_type, args_dict=data)
    return jsonify({'status': True})
def write_data():
    table_name = 'data'
    db.create_tables(table_name, 0)
    data = gen_data()
    str_data = ''
    for i in data:
        str_data += '(' + str(i[0]) + ',\'' + i[1] + '\',' + str(i[2]) + ',' + str(i[3]) + '),'
    db.insert_data(table_name, str_data[:-1])
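# write_data above builds the VALUES clause by string concatenation, which breaks
# as soon as a value contains a quote. A parameterized sketch, assuming sqlite3 as
# the backend and the same (int, str, num, num) rows from gen_data(), would be:
import sqlite3

def write_data_parameterized(db_path, rows):
    """Insert rows into the data table using placeholders instead of string building."""
    conn = sqlite3.connect(db_path)
    with conn:
        conn.executemany("INSERT INTO data VALUES (?, ?, ?, ?)", rows)
    conn.close()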
def message_processing(update, context):
    """
    Saves voice messages as wav files with a 16 kHz sample rate, and photos
    if a face is detected in them. All paths are stored in the `bot` database.
    :return: None
    """
    logger.info(f'Waiting for message_processing function for '
                f'{update.effective_user.name} at '
                f'{update.effective_message.date}')
    user_id = update.effective_user.id
    conn, cursor = create_conn()

    if update.message.voice:
        filename, new_filename = download_file(update, context, 'voice')
        new_filename = f'{new_filename}.wav'
        convert(os.path.join(CONFIG['VOICE_FOLDER_PATH'], filename), new_filename)
        insert_data(conn, cursor, 'voice', user_id, 'audio_path', new_filename)
        answer_text = 'Thanks, I\'ve saved this voice message to my database.'
    elif update.message.photo:
        filename, new_filename = download_file(update, context, 'photo')
        new_filename = f'{new_filename}.jpg'
        PHOTO_FOLDER_PATH = CONFIG['PHOTO_FOLDER_PATH']
        if check_face(f'{PHOTO_FOLDER_PATH}/{user_id}_photo.jpg', new_filename):
            insert_data(conn, cursor, 'photo', user_id, 'photo_path', new_filename)
            answer_text = ('I saved this photo in the database because I\'ve '
                           'detected a face in it.')
        else:
            answer_text = ('I didn\'t save this photo in my database because '
                           'I haven\'t found a face in it.')
    else:
        context.bot.send_sticker(chat_id=update.effective_chat.id,
                                 sticker=CONFIG['STICKER_PATH'])
        answer_text = 'Send me a voice message or a photo, please.'

    context.bot.send_message(parse_mode=ParseMode.MARKDOWN,
                             chat_id=update.effective_chat.id,
                             text=answer_text)
    close_conn(conn)
    logger.info(f'Answer ready for {update.effective_user.name} '
                f'at {update.effective_message.date}')
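# message_processing above assumes a small database layer (create_conn, insert_data,
# close_conn) that is not shown. A minimal sqlite3 sketch matching the call sites,
# with the file name and table layout as assumptions, might look like this:
import sqlite3

def create_conn(db_path='bot.db'):
    conn = sqlite3.connect(db_path)
    return conn, conn.cursor()

def insert_data(conn, cursor, table, user_id, column, value):
    # Table and column names come only from the trusted call sites above;
    # the values themselves are parameterized.
    cursor.execute(
        "INSERT INTO {} (user_id, {}) VALUES (?, ?)".format(table, column),
        (user_id, value),
    )
    conn.commit()

def close_conn(conn):
    conn.close()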
def submit_data(self):
    cl = self.cl.get()
    pl = self.pl.get()
    print("check 1111========================")
    sql = "INSERT INTO period_details(user_id,cycle_length,period_length) VALUES (%s,%s,%s)"
    val = (reg_user, cl, pl)
    insert_data(sql, val)
    self.master.destroy()
    os.system('python3 HomePage.py')
    messagebox.showinfo("Details", "Details Added Successfully")
def check_price(URL, desired_price):
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    product_name = soup.find(class_="productName").get_text()
    price = soup.select_one('div[class="Z19v"] strong').get_text(strip=True)
    # Replace an en dash used in place of the decimals (e.g. "149.–" -> "149.00")
    # so float() can parse the price.
    if "–" in price:
        price_clean = float(price.replace("–", "00"))
    else:
        price_clean = float(price)
    db.insert_data(product_name, price_clean, desired_price)
def run(self):
    #certs = self.get_certs_by_openssl(args.url, args.port)
    #certs = self.get_cert_by_scoket(args.url, args.port)
    certs = self.get_cert_by_ssl(args.url, args.port)
    result = self.parse_certs(certs)
    extresult = self.parse_cert_extension(certs)
    #print extresult
    cert_dict = {'_id': args.url, 'certs': certs}
    cert_dict.update(result)
    cert_dict.update(extresult)
    # insert into MongoDB
    insert_data(cert_dict)
def insert_db(href, html):
    p = init_folder(href)
    c = CopyFactory(html, p, href).main()
    data = {
        'id': c.id,
        'path': c.path,
        "title": c.title,
        "cover": c.cover,
        "url": href,
        "description": c.description,
        'text': extract_html_text(c.soup),
    }
    insert_data('data', data)  # write the record to the database
def get_weather(url):
    # Crawl the weather history page and write each complete row to the database.
    r = requests.get(url, headers={"user-agent": "mozilla/5.0"})
    print(r.url)
    t = etree.HTML(r.text)
    ul = t.xpath("//ul[@class='lishitable_content clearfix']")[0]
    for i in ul.xpath("./li")[:-1]:
        temp = i.xpath("./div/text()")
        if len(temp) == 4:
            date = i.xpath("./div/a/text()")[0]
            if date:
                temp = [date] + temp
                db.insert_data(temp)
def download():
    link1 = "http://www.aozora.gr.jp/cards/000035/files/236_19996.html"
    link2 = "http://www.aozora.gr.jp/cards/000035/files/1572_19910.html"
    link3 = "http://www.aozora.gr.jp/cards/000035/files/1578_44923.html"
    srcs = [textdownload(x) + [x] for x in [link1, link2, link3]]
    db.make_table()
    for x in srcs:
        db.insert_data(*x)
def load_ads(scraper, db_conn):
    ads = list()
    GROUP_SIZE = 10
    for adid in db.get_data(db_conn):
        a = scraper.get_ad(adid[0])
        if not a:
            continue
        ads.append(a.to_tuple())
        if len(ads) == GROUP_SIZE:
            db.insert_data(conn=db_conn, data=ads)
            db.delete_data(conn=db_conn, ids=[(ad[0], ) for ad in ads])
            ads = list()
            time.sleep(1)
    db.insert_data(conn=db_conn, data=ads)
    db.delete_data(conn=db_conn, ids=[(ad[0], ) for ad in ads])
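# load_ads above flushes scraped ads in batches of GROUP_SIZE and then flushes the
# remainder. Note that the final insert/delete pair also runs when ads is empty; if
# db.insert_data does not tolerate an empty batch, a guarded flush like the hedged
# sketch below avoids that (the db.insert_data/db.delete_data signatures are taken
# from the call sites above):
#
#     if ads:
#         db.insert_data(conn=db_conn, data=ads)
#         db.delete_data(conn=db_conn, ids=[(ad[0],) for ad in ads])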
def popup_neuer_eintrag(self):
    """Opens the 'Neuer Eintrag' (new entry) popup and refreshes the column tree afterwards."""
    popup = Toplevel(self)
    d = popup_neu.NeuerEintrag(popup)
    popup.update_idletasks()
    popup.update()
    popup.focus_set()
    popup.grab_set()
    popup.wait_window(popup)
    if d.result:
        db.insert_data(d.result)
        self.update_tabelle()
def populate_db():
    for pick in tqdm(glob('dl/pickles/*.pickle')):
        for face in pickle.loads(open(pick, 'rb').read()):
            data = {}
            data['folder_name'] = face['imagePath'].split('/')[-2]
            data['image_name'] = face['imagePath'].split('/')[-1]
            data['location'] = {
                'y1': face['loc'][0],
                'x2': face['loc'][1],
                'y2': face['loc'][2],
                'x1': face['loc'][3]
            }
            data['encoding'] = pickle.dumps(face['encoding'])
            data['tagged'] = False
            db.insert_data('faces', data)
def run():
    '''Entry point for the application'''
    place_table = 'Places'
    mapzen_search = {
        'url': "https://search.mapzen.com/v1/nearby",
        'params': {
            'layers': 'venue',
            'key': os.environ.get("MAPZEN_API_KEY"),
            'size': '30',
            'point.lon': '-122.44674682617188',
            'point.lat': '37.75280111220671',
            'boundary.circle.radius': '12',
            'sources': 'gn'
        }
    }
    mapzen_json = fetch_data(mapzen_search)
    mapzen_data_to_load = transform_data(mapzen_json['features'])
    insert_data(place_table, mapzen_data_to_load)
def run(self):
    """Main routine executed by each worker thread."""
    if self.field is None:
        self.field = []
    # Holds a single record; reused inside the loop for database inserts.
    single_data = {}
    while True:
        # Take one request from the queue.
        req = self.que.get_nowait()
        # A None value means the queue is exhausted, so exit.
        if req is None:
            break
        # Wait 0.5 s between requests, then send it and parse the JSON payload.
        time.sleep(0.5)
        init_content = self.s.send(req)
        content = json.loads(init_content.content)
        logger.info(req.url)
        # Keep only the requested fields.
        index = parse(self.field[0], content)[0]
        for i in content[index]:
            try:
                for f in self.field:
                    single_data[f] = i[f]
            except TypeError:
                logger.error("请传入列表格式的field")
                break
            else:
                insert_data(single_data, info=self.info)
                # logger.info(single_data)
                single_data = {}
    # Put None back on the queue so other workers can exit as well.
    self.que.put(None)
    return None
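# The run() method above drains self.que with get_nowait() and stops on a None
# sentinel. A hedged sketch of how such a worker might be wired up; the queue module,
# requests.Session, prepared requests and the Crawler constructor shown here are
# assumptions inferred from the calls above, not the project's real API:
#
#     import queue
#     import requests
#
#     que = queue.Queue()
#     session = requests.Session()
#     for url in page_urls:                       # hypothetical list of API pages
#         que.put(requests.Request('GET', url).prepare())
#     que.put(None)                               # sentinel so workers can exit
#     worker = Crawler(que=que, s=session, field=['list', 'title'])  # hypothetical ctor
#     worker.start()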
def crawl_city_detail(url):
    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table', class_="citytable").find_all("tr", class_="citytr")
        for k, item in enumerate(info_list):
            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            parent_code, parent_name = get_province_code(code)
            level = 2
            insert_data(code, name, parent_code, parent_name, level)
    except Exception:
        print traceback.format_exc()
def jamaalfna(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    title = soup.find('h1', {"class": "entry-title"})
    date = soup.find('time', {"class": "entry-date"})
    content = soup.find('div', {"class": "entry-content"})
    content_p = content.findAll('p')
    media = soup.find('img', {"class": "wp-post-image"})
    article = "\n"
    for p in content_p:
        article += p.text
    data = {
        'Title': title.text,
        'Date': date.text,
        'Author': 'None',
        'Content': article,
        'Image': media['src']
    }
    db.insert_data(data)
def kech24(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    title = soup.find('h2', {"itemprop": "name headline"})
    date = soup.find('time', {"itemprop": "dateCreated datePublished"})
    content = soup.find('div', {"itemprop": "text"})
    content_p = content.findAll('p')
    media = soup.find('img', {"itemprop": "image"})
    autor = soup.find('span', {"itemprop": "author"})
    article = "\n"
    for p in content_p:
        article += p.text
    data = {
        'Title': title.text,
        'Date': date.text,
        'Author': autor.text,
        'Content': article,
        'Image': media['src']
    }
    db.insert_data(data)
def alayam24(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    title = soup.find('h1', {"class": "heading"})
    date = soup.find('span', {"class": "date"})
    content = soup.find('div', {"class": "articlecontent"})
    content_p = content.findAll('p')
    media = soup.find('img', {"class": "lazy"})
    autor = soup.find('span', {"class": "writer"})
    article = "\n"
    for p in content_p:
        article += p.text
    data = {
        'Title': title.text,
        'Date': date.text,
        'Author': autor.text,
        'Content': article,
        'Image': media['data-original']
    }
    db.insert_data(data)
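# jamaalfna, kech24 and alayam24 all build the same article dict and hand it to
# db.insert_data(data). The storage backend is not shown; a minimal sqlite3 sketch
# that matches the dict keys (the file and table names are assumptions) could be:
import sqlite3

_conn = sqlite3.connect("articles.db")                     # assumed database file
with _conn:
    _conn.execute("CREATE TABLE IF NOT EXISTS articles "
                  "(title TEXT, date TEXT, author TEXT, content TEXT, image TEXT)")

def insert_data(data):
    """Store one scraped article dict (Title, Date, Author, Content, Image)."""
    with _conn:
        _conn.execute(
            "INSERT INTO articles VALUES (?, ?, ?, ?, ?)",
            (data['Title'], data['Date'], data['Author'], data['Content'], data['Image']),
        )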