Code example #1
File: crawler.py Project: zhengze/imagecrawler
def get_page_links2():
    '''
        Description: get images' second layer links
    '''
    pageLink = page_link1_queue.get()
    print "page links2 process id:%s" %os.getpid()
    print "Starting to crawl : %s" %pageLink
    if pageLink:
        #picture_urls = []  
        response = requests.get(pageLink, headers=headers) 
        soup = BeautifulSoup(response.text, "html.parser")
        picture_divs = soup.find_all("div", {"class":"pic"})
        for picture_div in picture_divs:
            picture_url = picture_div.find("a").get("href")
            page_link2_queue.put(picture_url)
            #picture_urls.append(picture_url)

            query = session.query(SecondLevelLinks)
            query_result = query.filter(SecondLevelLinks.url==picture_url).first()
            if query_result:
                continue
            else:
                second_level_links = SecondLevelLinks(url=picture_url)
                session.add(second_level_links)
        session.flush()
        session.commit()

        return page_link2_queue
    else:
        return None
Code example #2
File: crawler.py Project: zhengze/imagecrawler
def get_page_links1(webUrl):
    '''
        Description: get images' first layer links
    '''
    init_db()
    htmlContent = requests.get(webUrl, headers=headers)
    soup = BeautifulSoup(htmlContent.text, "html.parser")
    wp_page_numbers_div = soup.find("div", {"id":"wp_page_numbers"})
    endPageTag = wp_page_numbers_div.find_all("a")[-1]
    endPageLink = endPageTag.get('href')
    if endPageLink: 
        regex = r"(\D+\d+\D+)(\d+)(\D+)"
        m = re.match(regex, endPageLink)
        if m:
            pageNumber = int(m.groups()[1])  #get page number
            for index in range(1, pageNumber + 1):
                pageLink = "%s%s%s%s" % (webUrl, m.group(1), index, m.group(3))
                #pageLinks.append(pageLink)
                page_link1_queue.put(pageLink)
                query = session.query(FirstLevelLinks)
                query_result = query.filter(FirstLevelLinks.url==pageLink).first()
                if query_result:
                    continue 
                else:
                    first_level_links = FirstLevelLinks(url=pageLink)
                    session.add(first_level_links)
            session.flush()
            session.commit()
            return page_link1_queue
        else:
            return None
Code example #3
    def addBasePokemonToUser(self, addRequestDict):
        try:
            userId = addRequestDict.get("userId")
            pokedexId = addRequestDict.get("pokedexId")

            pokemon = self.pokemonService.getPokemonByPokedexId(pokedexId)
            if pokemon is None:
                raise Exception("Pokemon with pokedexId:" + str(pokedexId) +
                                " was not found!")

            userPokemon = UserPokemon()
            userPokemon.userId = userId
            userPokemon.pokedexId = pokedexId
            userPokemon.name = pokemon.name
            userPokemon.nickname = addRequestDict.get("nickname", pokemon.name)
            userPokemon.height = pokemon.height
            userPokemon.sprites = pokemon.sprites
            userPokemon.weight = pokemon.weight
            userPokemon.hunger = sysConst.MAX_HUNGER
            userPokemon.maxHp = pokemon.hp
            userPokemon.currentHp = pokemon.hp
            userPokemon.attack = pokemon.attack
            userPokemon.defense = pokemon.defense
            userPokemon.specialAttack = pokemon.specialAttack
            userPokemon.specialDefense = pokemon.specialDefense
            userPokemon.speed = pokemon.speed
            userPokemon.healthState = sysConst.ALIVE_POKEMON_STATE

            session.add(userPokemon)
            session.flush()
            session.commit()
            return userPokemon.id
        except Exception as e:
            session.rollback()
            raise e
Code example #4
def html_stats():

    sitemaps = Sitemap.get_by_status(SitemapTaskStatus.before_html_stats)
    for sitemap in sitemaps:

        headers = {'User-Agent': BaseConfig.UA}
        r = requests.get(urllib.parse.quote(sitemap.loc, safe=':/'),
                         headers=headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = ''.join(soup.find_all(text=True))
        html_stats = HtmlStats()
        html_stats.loc = sitemap.loc
        html_stats.domain = sitemap.domain
        html_stats.path = sitemap.path
        html_stats.word_count = len(text)  # note: len(text) counts characters, not words
        html_stats.number_of_punctuation_marks = (text.count("、") +
                                                  text.count("。"))
        html_stats.number_of_h1 = len(soup.find_all('h1'))
        html_stats.number_of_h2 = len(soup.find_all('h2'))
        html_stats.number_of_h3 = len(soup.find_all('h3'))
        html_stats.number_of_h4 = len(soup.find_all('h4'))
        html_stats.number_of_h5 = len(soup.find_all('h5'))
        html_stats.number_of_h6 = len(soup.find_all('h6'))
        html_stats.number_of_table = len(soup.find_all('table'))
        html_stats.number_of_li = len(soup.find_all('li'))
        html_stats.number_of_dl = len(soup.find_all('dl'))
        html_stats.number_of_image = len(soup.find_all('img'))
        html_stats.number_of_a = len(soup.find_all('a'))
        html_stats.number_of_iframe = len(soup.find_all('iframe'))
        html_stats.update_at = datetime.datetime.now()
        sitemap.status = SitemapTaskStatus.task_done
        session.merge(html_stats)
        session.flush()

    session.commit()
Code example #5
def create_or_update_account(updated_account):
    existing_account = session \
        .query(Account) \
        .filter_by(google_id=updated_account.google_id) \
        .first()

    if not existing_account:
        try:
            logging.info("Account does not exist in the system!")
            session.add(updated_account)
            session.flush()
            session.commit()

            existing_account = updated_account
        except IntegrityError:
            session.rollback()
            logging.info('User already exists in the system!')

            existing_account = session \
                .query(Account) \
                .filter_by(google_id=updated_account.google_id) \
                .first()

    return {
        'token': jwt_token_utils.create_jwt_token(existing_account)
    }, HTTPStatus.OK
Code example #6
def create_test_user():
    user = User(social_profile_id=int(time.time()), social_profile_type='facebook',
                first_name='Jian', last_name='Yang',
                profile_image='https://i.kym-cdn.com/photos/images/newsfeed/001/130/355/dca.jpg')
    session.add(user)
    session.flush()
    session.commit()
Code example #7
File: app.py Project: bonanob/store_inventory
def add_csv():
    """adds data from csv to db.
    if conflict, most recent entry is saved.
    """
    with open('inventory.csv') as csvfile:
        data = csv.reader(csvfile)
        next(data, None)  # skip the CSV header row
        for row in data:
            product_in_db = session.query(Product).filter(
                Product.product_name == row[0]).one_or_none()
            name = row[0]
            price = clean_price(row[1])
            quantity = int(row[2])
            date = clean_date(row[3])
            new_product = Product(product_name=name,
                                  product_quantity=quantity,
                                  product_price=price,
                                  date_updated=date)

            if product_in_db is not None:
                db_time = product_in_db.date_updated
                db_time = datetime.datetime(db_time.year, db_time.month,
                                            db_time.day)
                if date > db_time:
                    session.query(Product).filter(
                        Product.product_name == row[0]).delete()
                    session.add(new_product)
            else:
                session.add(new_product)
            session.flush()
        session.commit()
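The snippet above relies on two helpers, clean_price and clean_date, that are not shown. A minimal sketch of what they might look like follows, assuming prices arrive as strings like "$3.19", dates as "M/D/YYYY", and that the module already imports csv and datetime; the real project may differ.

def clean_price(price_str):
    """Hypothetical helper: strip the leading "$" and return the price in cents."""
    return int(round(float(price_str.replace("$", "")) * 100))


def clean_date(date_str):
    """Hypothetical helper: parse an M/D/YYYY string into a datetime so it can be
    compared with Product.date_updated above."""
    return datetime.datetime.strptime(date_str, "%m/%d/%Y")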
Code example #8
def save_sitemap(url_object):
    url = url_object.get("value")
    headers = {'User-Agent': BaseConfig.UA}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    urls = soup.find_all("url")
    for url in urls:
        if not url.loc:
            raise ValueError("url.loc is None...")

        parse_result = urllib.parse.urlparse(url.loc.text)
        sitemap = session.query(Sitemap).filter(Sitemap.loc == urllib.parse.unquote(url.loc.text)).first()

        if sitemap is None:
            sitemap = Sitemap()
            sitemap.loc = urllib.parse.unquote(url.loc.text)
            sitemap.lastmod = strptime(url.lastmod.text) if url.lastmod else None
            sitemap.change_freq = url.changefreq.text if url.changefreq else None
            sitemap.priority = Decimal(url.priority.text) if url.priority else None
            sitemap.domain = parse_result.netloc
            sitemap.path = urllib.parse.unquote(parse_result.path)
            sitemap.status = SitemapTaskStatus.before_html_stats
            session.add(sitemap)
        elif url.lastmod and (sitemap.lastmod is None or
                              sitemap.lastmod > strptime(url.lastmod.text)):
            sitemap.lastmod = strptime(url.lastmod.text) if url.lastmod else None
            sitemap.status = SitemapTaskStatus.before_html_stats
            session.merge(sitemap)
        session.flush()

    session.commit()
Code example #9
def post_photo(user, request):
    yelp_id = request.form.get('yelp_id')
    if not yelp_id:
        return {'error': 'Missing yelp_id.'}, HTTP_STATUS_BAD_REQUEST

    restaurant = yelp_client.get_restaurant(yelp_id)
    if not restaurant:
        return {'error': 'Invalid yelp_id'}, HTTP_STATUS_INTERNAL_SERVER_ERROR

    if 'image' not in request.files:
        return {'error': 'No image attached!'}, HTTP_STATUS_BAD_REQUEST

    file = request.files.get('image')

    try:
        user_id = user.id

        photo_url = s3_client.upload_photo(user_id, file)

        photo = Photo(user_id=user_id,
                      photo_url=photo_url,
                      yelp_id=restaurant.id,
                      restaurant_name=restaurant.name)

        session.add(photo)
        session.flush()
        session.commit()

        return photo.to_dict(), HTTP_STATUS_OK
    except S3UploadFailedError as e:
        logging.error(str(e))
        return {'error': 'Failed to upload image!'}, HTTP_STATUS_BAD_REQUEST
Code example #10
    def nominate(self):
        if ServerManager.has_active_server():
            return post_message(
                'Someone else is already making tea, I\'ll save your nomination for later :smile:',
                self.channel)

        try:
            slack_id = MENTION_RE.search(self.command_body).groups()[0]
        except AttributeError:
            return post_message('You must nominate another user to brew!',
                                self.channel)

        nominated_user = UserManager.get_by_slack_id(slack_id)
        if self.request_user.nomination_points < NOMINATION_POINTS_REQUIRED:
            return post_message(
                'You can\'t nominate someone unless you brew tea %s times!' %
                NOMINATION_POINTS_REQUIRED, self.channel)

        # Subtract nomination points from request user.
        self.request_user.nomination_points -= NOMINATION_POINTS_REQUIRED

        server = Server(user_id=nominated_user.id)
        session.add(server)
        session.flush()
        session.add(Customer(user_id=self.request_user.id,
                             server_id=server.id))
        session.commit()
        brew_countdown.apply_async(countdown=BREW_COUNTDOWN,
                                   args=(self.channel, ))

        return post_message(
            '%s has nominated %s to make tea! Who wants in?' %
            (self.request_user.first_name, nominated_user.first_name),
            self.channel)
Code example #11
def create_or_update_existing_account(facebook_account):
    user = session \
        .query(User) \
        .filter_by(social_profile_id=facebook_account.id,
                   social_profile_type='facebook') \
        .first()

    if not user:
        try:
            user = User(social_profile_id=facebook_account.id,
                        social_profile_type='facebook',
                        first_name=facebook_account.first_name,
                        last_name=facebook_account.last_name,
                        profile_image=facebook_account.profile_image)
            session.add(user)
            session.flush()
            session.commit()
        except IntegrityError:
            session.rollback()
            logging.warning('User already exists in the system!')

            user = session \
                .query(User) \
                .filter_by(social_profile_id=facebook_account.id,
                           social_profile_type='facebook') \
                .first()

    return {
        'token': savory_token_client.create_savory_token(user)
    }, HTTP_STATUS_OK
Code example #12
    def _end_task(self, task, project):
        ut = session.query(UserTask).get(self.current_ut)
        ut.end_date = datetime.now()
        session.flush()
        self.current_ut = 0

        # Update the text on the button
        self.task_button_text.set('Start Task')
Code example #13
def create_user_book_mapping_record(user_id=None, book_id=None):
    user_book_map_obj = UserBookMapping(
        user_id=user_id,
        book_id=book_id
    )
    session.add(user_book_map_obj)
    session.flush()
    return user_book_map_obj
Code example #14
File: users.py Project: anandtripathi5/book_store
def create_user_entry(**kwargs):
    hash_password = pbkdf2_sha256.hash(kwargs.get("password"))
    user_obj = User(username=kwargs.get("user_name"),
                    password=hash_password,
                    email=kwargs.get("email"))
    session.add(user_obj)
    session.flush()
    return user_obj
Code example #15
def post_bank_info(**kwargs):
    """
    This method is used to insert entry to the database
    :param kwargs:
    :return: return with id and status
    """
    bank_info = BankTable(**kwargs)
    session.add(bank_info)
    session.flush()
    return dict(id=bank_info.id, status=True)
Code example #16
def delete_bank_info(bank_id):
    """
    This method is used to delete the entry from bank info
    :param bank_id:
    :return:
    """
    session.query(BankTable).filter(BankTable.id == bank_id,
                                    BankTable.is_deleted == 0).update(
                                        {"is_deleted": 1})
    session.flush()
    return dict(status=True)
Code example #17
def get_user_agent_id(user_agent):
    user_agent = str(user_agent)
    if len(user_agent) > 60:
        user_agent = user_agent[:60]
    if user_agent not in USER_AGENTS:
        u = session.query(UserAgent).filter(
            UserAgent.user_agent == user_agent).first()
        if u is None:
            u = UserAgent(user_agent=user_agent)
            session.add(u)
            session.flush()
            logging.info(f"New User Agent > {u.id} {u.user_agent}")
        USER_AGENTS[str(user_agent)] = int(u.id)
    return USER_AGENTS[user_agent]
Code example #18
File: crawler.py Project: zhengze/imagecrawler
def download_images():
    print "download process id: %s" %os.getpid()
    images_url = page_link2_queue.get()
    print "Starting to crawl : %s" %images_url
    if images_url:
        response = requests.get(images_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        image_div = soup.find("div", {"id":"picture"})
        image_links = image_div.find_all("img")
        images_names = []
        for image_link in image_links:
            image_source = image_link.get("src")
            regex = r"(\D+)(\d+)(\D+)(\d+)(\D+)(\d+)(\D+)(\d+)(\D+)"
            m = re.match(regex, image_source)
            if m:
                image_name = "%s_%s_%s_%s%s" %(m.group(2), m.group(4), \
                    m.group(6), m.group(8), m.group(9)) 
                r = requests.get(image_source, headers=headers, stream=True)

                fname = os.path.join(DOWNLOAD_DIR, image_name)

                #download images
                if not os.path.exists(fname):
                    with open(fname, "wb") as fd:
                        for chunk in r.iter_content():
                            fd.write(chunk)
                    print "%s has been downloaded." %image_name

                else:
                    print "%s already exists." %image_name
                    continue

                md5 = create_md5(fname) #create md5 for picture
                query_result = session.query(ImageInfo).filter_by(md5=md5).all()
                if not query_result:
                    #link1_id = session.query(FirstLevelLinks).filter(FirstLevelLinks.url==page_link).first().id
                    link2_id = session.query(SecondLevelLinks).filter(SecondLevelLinks.url==images_url).first().id

                    image_info = ImageInfo(name=image_name, md5=md5, url=image_source, second_level_link_id=link2_id)
                    session.add(image_info)
                    session.flush()
                    session.commit()
                    
            else:
                return None
Code example #19
def follow_user(follower_user, followed_user):
    follow_relationship = FollowRelationship(follower_user_id=follower_user.id,
                                             followed_user_id=followed_user.id)

    try:
        session.add(follow_relationship)
        session.flush()
        session.commit()
    except IntegrityError:
        logging.info('Follow relationship already exists in the system!')
        session.rollback()
        session.query(FollowRelationship) \
            .filter_by(follower_user_id=follower_user.id,
                       followed_user_id=followed_user.id)\
            .update({'is_deleted': False})

        session.commit()

    return {'following': True}, HTTP_STATUS_OK
Code example #20
def put_bank_info(**kwargs):
    """
    This method is used to update the bank table row
    :param kwargs:
    :return: return id and status
    """
    bank_id = kwargs.pop('bank_id')
    update_dict = dict()
    if kwargs.get('name'):
        update_dict.update(name=kwargs.get("name"))
    if kwargs.get('address'):
        update_dict.update(address=kwargs.get("address"))
    if kwargs.get('mobile_number'):
        update_dict.update(mobile_number=kwargs.get("mobile_number"))
    if kwargs.get('bank_manager'):
        update_dict.update(bank_manager=kwargs.get("bank_manager"))
    session.query(BankTable).filter(
        BankTable.id == bank_id, BankTable.is_deleted == 0).update(update_dict)
    session.flush()
    return dict(status=True)
Code example #21
"""
Demo #2

Add a new Actor to the database
Verify that it's added

"""
rick = Actor()
rick.first_name = "Rick"
rick.last_name = "Harding"

session.add(rick)
# notice that we flush here, but we don't commit. The change won't actually
# make it to the db so when we rerun this script, it'll still report 200/201
session.flush()

ct = Actor.query.count()
plog("Inserted Rick, New Count: {0} ".format(ct))


"""
Homework

Now that the new actor 'Rick' is added, remove him and verify that you get back
200 actors in the database
"""

session.delete(rick)
ct = Actor.query.count()
plog("Actor Count via .count(): {0} ".format(ct))
Code example #22
def update_user_book_mapping_record(mapping_obj):
    mapping_obj.is_deleted = DEFAULT_FALSE_FLAG
    session.flush()
    return mapping_obj