def get_page_links2():
    '''
    Description: get images' second layer links
    '''
    pageLink = page_link1_queue.get()
    print "page links2 process id:%s" %os.getpid()
    print "Starting to crawl : %s" %pageLink
    if pageLink:
        #picture_urls = []
        response = requests.get(pageLink, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        picture_divs = soup.find_all("div", {"class":"pic"})
        for picture_div in picture_divs:
            picture_url = picture_div.find("a").get("href")
            page_link2_queue.put(picture_url)
            #picture_urls.append(picture_url)
            query = session.query(SecondLevelLinks)
            query_result = query.filter(SecondLevelLinks.url==picture_url).first()
            if query_result:
                continue
            else:
                second_level_links = SecondLevelLinks(url=picture_url)
                session.add(second_level_links)
                session.flush()
                session.commit()
        return page_link2_queue
    else:
        return None
def get_page_links1(webUrl):
    '''
    Description: get images' first layer links
    '''
    init_db()
    htmlContent = requests.get(webUrl, headers=headers)
    soup = BeautifulSoup(htmlContent.text, "html.parser")
    wp_page_numbers_div = soup.find("div", {"id":"wp_page_numbers"})
    endPageTag = wp_page_numbers_div.find_all("a")[-1]
    endPageLink = endPageTag.get('href')
    if endPageLink:
        regex = r"(\D+\d+\D+)(\d+)(\D+)"
        m = re.match(regex, endPageLink)
        if m:
            pageNumber = int(m.groups()[1])    #get page number
            for index in xrange(1, pageNumber+1):
                pageLink = "%s"*4 %(webUrl, m.group(1), index, m.group(3))
                #pageLinks.append(pageLink)
                page_link1_queue.put(pageLink)
                query = session.query(FirstLevelLinks)
                query_result = query.filter(FirstLevelLinks.url==pageLink).first()
                if query_result:
                    continue
                else:
                    first_level_links = FirstLevelLinks(url=pageLink)
                    session.add(first_level_links)
                    session.flush()
                    session.commit()
        return page_link1_queue
    else:
        return None
def addBasePokemonToUser(self, addRequestDict):
    try:
        userId = addRequestDict.get("userId")
        pokedexId = addRequestDict.get("pokedexId")
        pokemon = self.pokemonService.getPokemonByPokedexId(pokedexId)
        if pokemon is None:
            raise Exception("Pokemon with pokedexId:" + str(pokedexId) + " was not found!")

        userPokemon = UserPokemon()
        userPokemon.userId = userId
        userPokemon.pokedexId = pokedexId
        userPokemon.name = pokemon.name
        userPokemon.nickname = addRequestDict.get("nickname", pokemon.name)
        userPokemon.height = pokemon.height
        userPokemon.sprites = pokemon.sprites
        userPokemon.weight = pokemon.weight
        userPokemon.hunger = sysConst.MAX_HUNGER
        userPokemon.maxHp = pokemon.hp
        userPokemon.currentHp = pokemon.hp
        userPokemon.attack = pokemon.attack
        userPokemon.defense = pokemon.defense
        userPokemon.specialAttack = pokemon.specialAttack
        userPokemon.specialDefense = pokemon.specialDefense
        userPokemon.speed = pokemon.speed
        userPokemon.healthState = sysConst.ALIVE_POKEMON_STATE

        session.add(userPokemon)
        session.flush()
        session.commit()
        return userPokemon.id
    except Exception as e:
        session.rollback()
        raise e
def html_stats():
    sitemaps = Sitemap.get_by_status(SitemapTaskStatus.before_html_stats)
    for sitemap in sitemaps:
        headers = {'User-Agent': BaseConfig.UA}
        r = requests.get(urllib.parse.quote(sitemap.loc, safe=':/'), headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = ''.join(soup.findAll(text=True))

        html_stats = HtmlStats()
        html_stats.loc = sitemap.loc
        html_stats.domain = sitemap.domain
        html_stats.path = sitemap.path
        html_stats.word_count = len(text)
        html_stats.number_of_punctuation_marks = (text.count("、") + text.count("。"))
        html_stats.number_of_h1 = len(soup.find_all('h1'))
        html_stats.number_of_h2 = len(soup.find_all('h2'))
        html_stats.number_of_h3 = len(soup.find_all('h3'))
        html_stats.number_of_h4 = len(soup.find_all('h4'))
        html_stats.number_of_h5 = len(soup.find_all('h5'))
        html_stats.number_of_h6 = len(soup.find_all('h6'))
        html_stats.number_of_table = len(soup.find_all('table'))
        html_stats.number_of_li = len(soup.find_all('li'))
        html_stats.number_of_dl = len(soup.find_all('dl'))
        html_stats.number_of_image = len(soup.find_all('img'))
        html_stats.number_of_a = len(soup.find_all('a'))
        html_stats.number_of_iframe = len(soup.find_all('iframe'))
        html_stats.update_at = datetime.datetime.now()

        sitemap.status = SitemapTaskStatus.task_done

        session.merge(html_stats)
        session.flush()
        session.commit()
def create_or_update_account(updated_account):
    existing_account = session \
        .query(Account) \
        .filter_by(google_id=updated_account.google_id) \
        .first()

    if not existing_account:
        try:
            logging.info("Account does not exist in the system!")
            session.add(updated_account)
            session.flush()
            session.commit()
            existing_account = updated_account
        except IntegrityError:
            session.rollback()
            logging.info('User already exists in the system!')
            existing_account = session \
                .query(Account) \
                .filter_by(google_id=updated_account.google_id) \
                .first()

    return {
        'token': jwt_token_utils.create_jwt_token(existing_account)
    }, HTTPStatus.OK
def create_test_user():
    user = User(social_profile_id=int(time.time()),
                social_profile_type='facebook',
                first_name='Jian',
                last_name='Yang',
                profile_image='https://i.kym-cdn.com/photos/images/newsfeed/001/130/355/dca.jpg')
    session.add(user)
    session.flush()
    session.commit()
def add_csv():
    """Adds data from csv to db. If conflict, most recent entry is saved."""
    with open('inventory.csv') as csvfile:
        data = csv.reader(csvfile)
        next(data, None)
        for row in data:
            product_in_db = session.query(Product).filter(
                Product.product_name == row[0]).one_or_none()
            name = row[0]
            price = clean_price(row[1])
            quantity = int(row[2])
            date = clean_date(row[3])
            new_product = Product(product_name=name,
                                  product_quantity=quantity,
                                  product_price=price,
                                  date_updated=date)
            if product_in_db is not None:
                db_time = product_in_db.date_updated
                db_time = datetime.datetime(db_time.year, db_time.month, db_time.day)
                if date > db_time:
                    session.query(Product).filter(
                        Product.product_name == row[0]).delete()
                    session.add(new_product)
            else:
                session.add(new_product)
    session.flush()
    session.commit()
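# Not part of the original snippet: a minimal sketch of the clean_price and
# clean_date helpers that add_csv() relies on, assuming the CSV stores prices
# like "$3.19" and dates like "11/1/2018" and that prices are kept as integer
# cents; the project's real helpers may differ.
import datetime

def clean_price(price_str):
    # Assumption: "$3.19" -> 319 (cents).
    return int(round(float(price_str.replace("$", "")) * 100))

def clean_date(date_str):
    # Assumption: "11/1/2018" -> datetime(2018, 11, 1), matching the
    # datetime comparison done against date_updated in add_csv().
    month, day, year = (int(part) for part in date_str.split("/"))
    return datetime.datetime(year, month, day)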
def save_sitemap(url_object):
    url = url_object.get("value")
    headers = {'User-Agent': BaseConfig.UA}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    urls = soup.find_all("url")
    for url in urls:
        if not url.loc:
            raise ValueError("url.loc is None...")
        parse_result = urllib.parse.urlparse(url.loc.text)
        sitemap = session.query(Sitemap).filter(
            Sitemap.loc == urllib.parse.unquote(url.loc.text)).first()
        if sitemap is None:
            sitemap = Sitemap()
            sitemap.loc = urllib.parse.unquote(url.loc.text)
            sitemap.lastmod = strptime(url.lastmod.text) if url.lastmod else None
            sitemap.change_freq = url.changefreq.text if url.changefreq else None
            sitemap.priority = Decimal(url.priority.text) if url.priority else None
            sitemap.domain = parse_result.netloc
            sitemap.path = urllib.parse.unquote(parse_result.path)
            sitemap.status = SitemapTaskStatus.before_html_stats
            session.add(sitemap)
        elif url.lastmod and (sitemap.lastmod is None or sitemap.lastmod > strptime(url.lastmod.text)):
            sitemap.lastmod = strptime(url.lastmod.text)
            sitemap.status = SitemapTaskStatus.before_html_stats
            session.merge(sitemap)
        session.flush()
        session.commit()
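# Not part of the original snippet: a hedged sketch of the bare strptime()
# helper that save_sitemap() assumes, parsing the W3C Datetime forms that
# sitemap <lastmod> values commonly use; the real helper may accept more
# formats. The "%z" form with a colon offset ("+09:00") needs Python 3.7+.
from datetime import datetime

def strptime(value):
    # Assumption: values look like "2019-06-01" or "2019-06-01T10:30:00+09:00".
    value = value.strip()
    for fmt in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%d"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None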
def post_photo(user, request):
    yelp_id = request.form.get('yelp_id')
    if not yelp_id:
        return {'error': 'Missing yelp_id.'}, HTTP_STATUS_BAD_REQUEST

    restaurant = yelp_client.get_restaurant(yelp_id)
    if not restaurant:
        return {'error': 'Invalid yelp_id'}, HTTP_STATUS_INTERNAL_SERVER_ERROR

    if 'image' not in request.files:
        return {'error': 'No image attached!'}, HTTP_STATUS_BAD_REQUEST

    file = request.files.get('image')
    try:
        user_id = user.id
        photo_url = s3_client.upload_photo(user_id, file)
        photo = Photo(user_id=user_id,
                      photo_url=photo_url,
                      yelp_id=restaurant.id,
                      restaurant_name=restaurant.name)
        session.add(photo)
        session.flush()
        session.commit()
        return photo.to_dict(), HTTP_STATUS_OK
    except S3UploadFailedError as e:
        logging.error(str(e))
        return {'error': 'Failed to upload image!'}, HTTP_STATUS_BAD_REQUEST
def nominate(self):
    if ServerManager.has_active_server():
        return post_message(
            'Someone else is already making tea, I\'ll save your nomination for later :smile:',
            self.channel)

    try:
        slack_id = MENTION_RE.search(self.command_body).groups()[0]
    except AttributeError:
        return post_message('You must nominate another user to brew!', self.channel)

    nominated_user = UserManager.get_by_slack_id(slack_id)

    if self.request_user.nomination_points < NOMINATION_POINTS_REQUIRED:
        return post_message(
            'You can\'t nominate someone unless you brew tea %s times!' % NOMINATION_POINTS_REQUIRED,
            self.channel)

    # Subtract nomination points from request user.
    self.request_user.nomination_points -= NOMINATION_POINTS_REQUIRED

    server = Server(user_id=nominated_user.id)
    session.add(server)
    session.flush()

    session.add(Customer(user_id=self.request_user.id, server_id=server.id))
    session.commit()

    brew_countdown.apply_async(countdown=BREW_COUNTDOWN, args=(self.channel, ))

    return post_message(
        '%s has nominated %s to make tea! Who wants in?' % (
            self.request_user.first_name, nominated_user.first_name),
        self.channel)
def create_or_update_existing_account(facebook_account):
    user = session \
        .query(User) \
        .filter_by(social_profile_id=facebook_account.id,
                   social_profile_type='facebook') \
        .first()

    if not user:
        try:
            user = User(social_profile_id=facebook_account.id,
                        social_profile_type='facebook',
                        first_name=facebook_account.first_name,
                        last_name=facebook_account.last_name,
                        profile_image=facebook_account.profile_image)
            session.add(user)
            session.flush()
            session.commit()
        except IntegrityError:
            session.rollback()
            logging.warning('User already exists in the system!')
            user = session \
                .query(User) \
                .filter_by(social_profile_id=facebook_account.id,
                           social_profile_type='facebook') \
                .first()

    return {
        'token': savory_token_client.create_savory_token(user)
    }, HTTP_STATUS_OK
def _end_task(self, task, project):
    ut = session.query(UserTask).get(self.current_ut)
    ut.end_date = datetime.now()
    session.flush()

    self.current_ut = 0

    # Update the text on the button
    self.task_button_text.set('Start Task')
def create_user_book_mapping_record(user_id=None, book_id=None):
    user_book_map_obj = UserBookMapping(
        user_id=user_id,
        book_id=book_id
    )
    session.add(user_book_map_obj)
    session.flush()
    return user_book_map_obj
def create_user_entry(**kwargs):
    hash_password = pbkdf2_sha256.hash(kwargs.get("password"))
    user_obj = User(username=kwargs.get("user_name"),
                    password=hash_password,
                    email=kwargs.get("email"))
    session.add(user_obj)
    session.flush()
    return user_obj
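# Not part of the original snippet: a hedged sketch of the login-side check
# that would pair with create_user_entry(), verifying a candidate password
# against the stored pbkdf2_sha256 hash via passlib's verify(). The
# check_password name and user_obj shape are assumptions.
from passlib.hash import pbkdf2_sha256

def check_password(user_obj, candidate_password):
    # Returns True when the candidate matches the stored hash.
    return pbkdf2_sha256.verify(candidate_password, user_obj.password)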
def post_bank_info(**kwargs):
    """
    This method is used to insert an entry into the database
    :param kwargs:
    :return: return with id and status
    """
    bank_info = BankTable(**kwargs)
    session.add(bank_info)
    session.flush()
    return dict(id=bank_info.id, status=True)
def delete_bank_info(bank_id):
    """
    This method is used to delete the entry from bank info
    :param bank_id:
    :return:
    """
    session.query(BankTable).filter(
        BankTable.id == bank_id, BankTable.is_deleted == 0).update(
        {"is_deleted": 1})
    session.flush()
    return dict(status=True)
def get_user_agent_id(user_agent):
    user_agent = str(user_agent)
    if len(user_agent) > 60:
        user_agent = user_agent[:60]
    if user_agent not in USER_AGENTS:
        u = session.query(UserAgent).filter(
            UserAgent.user_agent == user_agent).first()
        if u is None:
            u = UserAgent(user_agent=user_agent)
            session.add(u)
            session.flush()
            logging.info(f"New User Agent > {u.id} {u.user_agent}")
        USER_AGENTS[str(user_agent)] = int(u.id)
    return USER_AGENTS[user_agent]
def download_images():
    print "download process id: %s" %os.getpid()
    images_url = page_link2_queue.get()
    print "Starting to crawl : %s" %images_url
    if images_url:
        response = requests.get(images_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        image_div = soup.find("div", {"id":"picture"})
        image_links = image_div.find_all("img")
        images_names = []
        for image_link in image_links:
            image_source = image_link.get("src")
            regex = r"(\D+)(\d+)(\D+)(\d+)(\D+)(\d+)(\D+)(\d+)(\D+)"
            m = re.match(regex, image_source)
            if m:
                image_name = "%s_%s_%s_%s%s" %(m.group(2), m.group(4),
                                               m.group(6), m.group(8), m.group(9))
                r = requests.get(image_source, headers=headers, stream=True)
                fname = os.path.join(DOWNLOAD_DIR, image_name)
                #download images
                if not os.path.exists(fname):
                    with open(fname, "wb") as fd:
                        for chunk in r.iter_content():
                            fd.write(chunk)
                    print "%s has been downloaded." %image_name
                else:
                    print "%s already exists." %image_name
                    continue
                md5 = create_md5(fname)    #create md5 for picture
                query_result = session.query(ImageInfo).filter_by(md5=md5).all()
                if not query_result:
                    #link1_id = session.query(FirstLevelLinks).filter(FirstLevelLinks.url==page_link).first().id
                    link2_id = session.query(SecondLevelLinks).filter(SecondLevelLinks.url==images_url).first().id
                    image_info = ImageInfo(name=image_name, md5=md5, url=image_source,
                                           second_level_link_id=link2_id)
                    session.add(image_info)
                    session.flush()
                    session.commit()
    else:
        return None
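# Not part of the original snippet: a minimal sketch of the create_md5 helper
# used by download_images(), assuming it hashes the downloaded file's bytes;
# the original project may compute the digest differently.
import hashlib

def create_md5(fname):
    # Read the file in chunks and return its hex MD5 digest (assumed behaviour).
    md5 = hashlib.md5()
    with open(fname, "rb") as fd:
        for chunk in iter(lambda: fd.read(8192), b""):
            md5.update(chunk)
    return md5.hexdigest()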
def follow_user(follower_user, followed_user):
    follow_relationship = FollowRelationship(follower_user_id=follower_user.id,
                                             followed_user_id=followed_user.id)
    try:
        session.add(follow_relationship)
        session.flush()
        session.commit()
    except IntegrityError:
        logging.info('Follow relationship already exists in the system!')
        session.rollback()
        session.query(FollowRelationship) \
            .filter_by(follower_user_id=follower_user.id,
                       followed_user_id=followed_user.id) \
            .update({'is_deleted': False})
        session.commit()

    return {'following': True}, HTTP_STATUS_OK
def put_bank_info(**kwargs):
    """
    This method is used to update the bank table row
    :param kwargs:
    :return: return id and status
    """
    bank_id = kwargs.pop('bank_id')
    update_dict = dict()
    if kwargs.get('name'):
        update_dict.update(name=kwargs.get("name"))
    if kwargs.get('address'):
        update_dict.update(address=kwargs.get("address"))
    if kwargs.get('mobile_number'):
        update_dict.update(mobile_number=kwargs.get("mobile_number"))
    if kwargs.get('bank_manager'):
        update_dict.update(bank_manager=kwargs.get("bank_manager"))
    session.query(BankTable).filter(
        BankTable.id == bank_id, BankTable.is_deleted == 0).update(update_dict)
    session.flush()
    return dict(status=True)
""" Demo #2 Add a new Actor to the database Verify that it's added """ rick = Actor() rick.first_name = "Rick" rick.last_name = "Harding" session.add(rick) # notice that we flush here, but we don't commit. The change won't actually # make it to the db so when we rerun this script, it'll still report 200/201 session.flush() ct = Actor.query.count() plog("Inserted Rick, New Count: {0} ".format(ct)) """ Homework Now that the new actor 'Rick' is added, remove him and verify that you get back 200 actors in the database """ session.delete(rick) ct = Actor.query.count() plog("Actor Count via .count(): {0} ".format(ct))
def update_user_book_mapping_record(mapping_obj):
    mapping_obj.is_deleted = DEFAULT_FALSE_FLAG
    session.flush()
    return mapping_obj