Code example #1
def get_data(movie_id):
    url = "https://movie.douban.com/subject/{}".format(movie_id)
    r = requests.get(url, headers=HEADERS)
    if r.status_code != 200:
        raise Exception(r.status_code)
    content = r.text  # decoded page HTML; r.content would be raw bytes on Python 3
    # Locate the release year inside the <span class="year">(YYYY)</span> element.
    s = content.find("<span class=\"year\">")
    s = content.find("(", s + 1)
    e = content.find("</span>", s + 1)
    year = int(content[s + 1:e - 1])
    # The structured metadata is embedded as JSON-LD in a <script> tag.
    s = content.find("<script type=\"application/ld+json\">")
    s = content.find("{", s + 1)
    e = content.find("</script>", s + 1)
    json_data = json.loads(content[s:e - 1], strict=False)
    name = json_data.get(u'name')
    score = json_data.get(u'aggregateRating').get(u'ratingValue')
    if score:
        score = float(score)
    else:
        score = "NULL"
    votes = json_data.get(u'aggregateRating').get(u'ratingCount')
    if votes:
        votes = int(votes)
    else:
        votes = "NULL"
    if db.check_exists(movie_id):
        db.update_data(movie_id, name, year, score, votes)
    else:
        db.insert_data(movie_id, name, year, score, votes)
    logging.info("finish %s %s %s %s %s", movie_id, name, year, score, votes)
    global backup_list
    if len(backup_list) < 100:
        find_next(content)
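
Example #1 relies on a db module (check_exists, insert_data, update_data) and a find_next helper that the snippet does not show. A minimal sketch of what such a db module might look like, assuming a SQLite backend; the table layout and file name are invented for illustration:

import sqlite3

_conn = sqlite3.connect("movies.db")  # hypothetical database file
_conn.execute(
    "CREATE TABLE IF NOT EXISTS movies ("
    "movie_id INTEGER PRIMARY KEY, name TEXT, year INTEGER, score REAL, votes INTEGER)"
)

def check_exists(movie_id):
    # True if the movie has already been stored.
    cur = _conn.execute("SELECT 1 FROM movies WHERE movie_id = ?", (movie_id,))
    return cur.fetchone() is not None

def insert_data(movie_id, name, year, score, votes):
    _conn.execute(
        "INSERT INTO movies (movie_id, name, year, score, votes) VALUES (?, ?, ?, ?, ?)",
        (movie_id, name, year, score, votes),
    )
    _conn.commit()

def update_data(movie_id, name, year, score, votes):
    _conn.execute(
        "UPDATE movies SET name = ?, year = ?, score = ?, votes = ? WHERE movie_id = ?",
        (name, year, score, votes, movie_id),
    )
    _conn.commit()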
Code example #2
File: community.py  Project: diligent2012/edge_bate
def crawl_community_list():

    global street_code_list
    street_code_list = query_data(4)

    try:
        page_urls = generate_page_url()
        for k, page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                for item in special_url:
                    response = request_util(item['page_url'], item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="villagetable").find_all("tr", class_="villagetr")
            for k, item in enumerate(info_list):

                # street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                code = item.contents[0].get_text().encode('utf8')
                name = item.contents[2].get_text().encode('utf8')
                parent_code, parent_name = get_street_code(code)
                level = 5
                print code, name, parent_code, parent_name
                insert_data(code, name, parent_code, parent_name, level)

    except Exception, e:
        print traceback.format_exc()
Code example #3
def crawl_district_list():
    global city_code_list
    city_code_list = query_data(2)

    try:
        page_urls = generate_page_url()
        for k, page_url in enumerate(page_urls):
            print page_url
            response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="countytable").find_all(
                "tr", class_="countytr")
            for k, item in enumerate(info_list):

                if item.contents[0].find('a', {'href': True}):
                    #street_url = common_url_prefix + url_code + item.contents[0].a.attrs['href'].encode('utf8')
                    code = item.contents[0].a.get_text().encode('utf8')

                    name = item.contents[1].a.get_text().encode('utf8')
                    parent_code, parent_name = get_city_code(code)
                    level = 3
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)

                    #crawl_street_detail(street_url)

    except Exception, e:
        print traceback.format_exc()
Code example #4
def isLogin():
    # Check whether we are already logged in by looking at the user's personal info
    timestamp = time.time() * 1000
    url = "https://dig.chouti.com/getTopTenLinksOrComments.json?_=%s" % timestamp
    # Disable redirects; otherwise a failed login redirects to the home page, which still returns 200
    response = session.get(url, headers=headers, allow_redirects=False)
    resp_json = response.json()
    data = resp_json.get("result")
    data = data.get("data")
    # print(data[1])
    chouti_ids = []
    for exits in data:
        chouti_id = exits.get("id")
        chouti_ids.append(chouti_id)
    exits_ids = get_data_exits(*chouti_ids)
    end_ids = list(set(chouti_ids).difference(set(exits_ids)))
    for row in data:
        # Chouti post id
        chouti_id = row.get("id")
        chouti_content = row
        if chouti_id not in end_ids:
            pprint.pprint(chouti_content)
            continue
        # Chouti post content
        chouti_comments = comments(chouti_id)
        # Chouti post comments
        save_data = dict(chouti_id=chouti_id,
                         chouti_content=chouti_content,
                         chouti_comments=chouti_comments)
        insert_data(**save_data)
    login_code = response.status_code
    if login_code == 200:
        return True
    else:
        return False
Code example #5
def crawl_province_list():
    response = request_util(province_url)
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('div', class_="TRS_PreAppend").find_all(
            "p", class_="MsoNormal")
        for k, item in enumerate(info_list):

            code_item = {}
            code = item.find("span", attrs={'lang': 'EN-US'})
            code_item['code'] = code.get_text().strip()
            content_list = item.find_all("span")
            code_item['name'] = content_list[-1].get_text().strip()
            code_list.append(code_item)

        for k, item in enumerate(code_list):
            if item['code'].find("0000") > 0:
                code = item['code'].encode('utf8') + "000000"
                name = item['name'].encode('utf8')
                parent_code = 0
                parent_name = ""
                level = 1
                # p_code_item = {}
                # p_code_item['code'] = code
                # p_code_item['name'] = name
                # province_code_list.append(p_code_item)
                insert_data(code, name, parent_code, parent_name, level)

    except Exception, e:
        print traceback.format_exc()
Code example #6
File: crawler.py  Project: liuny05/Spider_Zhihu
    def output(self):
        '''Store data to MongoDB.'''

        data = {
            'user_url': self.url,
            'user_name': self.user_name,
            'user_gender': self.user_gender,
            'user_location': self.user_location,
            'user_followees': self.user_followees,
            'user_followers': self.user_followers,
            'user_be_agreed': self.user_be_agreed,
            'user_be_thanked': self.user_be_thanked,
            'user_education_school': self.user_education_school,
            'user_education_subject': self.user_education_subject,
            'user_employment': self.user_employment,
            'user_employment_extra': self.user_employment_extra,
            'user_bio': self.user_bio,
            'user_content': self.user_content,
            'user_topics': self.user_topics,
            'user_answers': self.user_answers,
            'user_topics_num': self.user_topics_num,
            'user_questions_num': self.user_questions_num,
            'user_answers_num': self.user_answers_num,
            'user_articles_num': self.user_articles_num,
            'user_favorites_num': self.user_favorites_num
        }

        insert_data(data)
Code example #7
File: street.py  Project: diligent2012/edge_bate
def crawl_street_list():
    # page_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/52/03/520324.html"
    # response = request_util(page_url, 'gb18030')
    # print response
    # return
    global district_code_list
    district_code_list = query_data(3)

    try:
        page_urls = generate_page_url()
        for k, page_item in enumerate(page_urls):
            page_url = page_item['page_url']
            print page_url
            if page_url in special_url_conn:
                for item in special_url:
                    response = request_util(item['page_url'], item['encoding'])
            else:
                response = request_util(page_url, 'gbk')
            soup = BeautifulSoup(response, "lxml")
            info_list = soup.find('table', class_="towntable").find_all("tr", class_="towntr")
            for k, item in enumerate(info_list):

                if item.contents[0].find('a', {'href': True}):
                    # street_url = street_url_prefix + item.contents[0].a.attrs['href'].encode('utf8')
                    code = item.contents[0].a.get_text().encode('utf8')
                    name = item.contents[1].a.get_text().encode('utf8')
                    parent_code, parent_name = get_district_code(code)
                    level = 4
                    print code, name, parent_code, parent_name
                    insert_data(code, name, parent_code, parent_name, level)

    except Exception, e:
        print traceback.format_exc()
Code example #8
def add_user():
    user_data = request.get_json()
    new_user = User(first_name=user_data['first_name'],
                    last_name=user_data['last_name'],
                    email=user_data['email'],
                    password=user_data['password'])
    db.insert_data(new_user)
    return "Connected to the database and added new_user!"
Code example #9
def add_new_stock():
    global conn
    data = {"stock_code": (input("请输入股票代码:\n")),
            "market": (input("请输入股票市场(沪A为1,深A为2):\n")),
            "cost_price": str(input("请输入成本价:\n")),
            "shares_held": str(input("请输入持有份额:\n"))}
    db.insert_data(conn, data)
    print(format.format_light_content("添加成功."))
Code example #10
File: server.py  Project: toptech1989/youke
def queue(id, title):
    if not db.is_exist(id):
        player = kodi()
        db.insert_data({"youtube_id": id, "title": title, "status": "idle"})
        player.GUI.ShowNotification(title=title, message="Successfully Queued", displaytime=20000)
        return "Song Successfully Queued"
    else:
        return "Song is already in the queue"
Code example #11
def index():
    temp = pickle.dumps(np.array([123, 1234, 456]))
    temp_dict = {'data': temp}

    db.insert_data('faces', temp_dict)
    folders = os.listdir(
        '/home/praneet/Downloads/Compressed/images_data/Andrews College/')
    return render_template('index.html', folders=folders)
Code example #12
def register():
    if request.method == 'GET':
        return render_template("register.html")
    else:
        email = request.form['email']
        pwd = request.form['pwd']
        print("전달된값:", email, pwd)
        db.insert_data(email, pwd)
        return '회원가입 데이터(POST)'
Code example #13
File: module.py  Project: wangyu190810/python-skill
def count_worlds_at_url(url):
    resp = requests.get(url, timeout=2)
    create_log(log_name="url")
    data = {"url": url, "status_code": resp.status_code}
    insert_data(data)
    logging.info(resp.status_code)
    if resp.status_code != 200:
        return url
Code example #14
File: classes.py  Project: adri-egea/agenda_python
    def anadir(self):
        print("---------------------")
        print("Añadir nuevo contacto")
        print("---------------------")
        nombre = input("Introduzca el nombre: ")
        apellidos = input("Introduzca el apellido: ")
        telefono = input("Introduzca el teléfono: ")
        email = input("Introduzca el email: ")
        # self.contactos.append({'nombre': nom, 'telf': telf, 'email': email})
        insert_data(nombre, apellidos, telefono, email)
Code example #15
def saveData():

    # get values as json
    values = request.get_json()
    image_type = values.get('type')
    data = values.get('fields')

    db.insert_data(image_type, args_dict=data)

    return jsonify({'status': True})
Code example #16
File: generator.py  Project: Airseai6/test
def write_data():
    table_name = 'data'
    db.create_tables(table_name, 0)

    data = gen_data()
    str_data = ''
    for i in data:
        str_data += '(' + str(i[0]) + ',\'' + i[1] + '\',' + str(
            i[2]) + ',' + str(i[3]) + '),'

    db.insert_data(table_name, str_data[:-1])
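
Example #16 builds the VALUES clause by string concatenation, which only holds up while gen_data never yields quotes or other special characters. If the backend follows the Python DB-API, a parameterized insert is the safer equivalent; a minimal sketch assuming sqlite3 and the same four-column rows (the table and column names here are made up for illustration):

import sqlite3

def write_data_parameterized(rows):
    # rows: iterable of (id, name, x, y) tuples, as produced by gen_data()
    conn = sqlite3.connect("example.db")  # hypothetical database file
    conn.execute(
        "CREATE TABLE IF NOT EXISTS data (id INTEGER, name TEXT, x REAL, y REAL)"
    )
    # executemany lets the driver handle quoting and escaping of every value.
    conn.executemany("INSERT INTO data VALUES (?, ?, ?, ?)", rows)
    conn.commit()
    conn.close()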
Code example #17
File: bot.py  Project: amamonova/dsplabs_testcase
def message_processing(update, context):
    """
    Saves voice messages as wav files (16 kHz sample rate) and saves photos
    when a face is detected in them. All file paths are stored in the `bot`
    database.
    :return: None
    """

    logger.info(f'Waiting for message_processing function for '
                f'{update.effective_user.name} at '
                f'{update.effective_message.date}')

    user_id = update.effective_user.id

    conn, cursor = create_conn()

    if update.message.voice:
        filename, new_filename = download_file(update, context, 'voice')
        new_filename = f'{new_filename}.wav'

        convert(os.path.join(CONFIG['VOICE_FOLDER_PATH'], filename),
                new_filename)

        insert_data(conn, cursor, 'voice', user_id, 'audio_path', new_filename)

        answer_text = 'Thanks, I\'ve saved this voice message to my database.'

    elif update.message.photo:
        filename, new_filename = download_file(update, context, 'photo')
        new_filename = f'{new_filename}.jpg'

        PHOTO_FOLDER_PATH = CONFIG['PHOTO_FOLDER_PATH']

        if check_face(f'{PHOTO_FOLDER_PATH}/{user_id}_photo.jpg',
                      new_filename):
            insert_data(conn, cursor, 'photo', user_id, 'photo_path',
                        new_filename)
            answer_text = ('I saved this photo in the database because I\'ve '
                           'detected the face here.')
        else:
            answer_text = ('I didn\'t save this photo in my database, because '
                           'I haven\'t found the face here.')

    else:
        context.bot.send_sticker(chat_id=update.effective_chat.id,
                                 sticker=CONFIG['STICKER_PATH'])
        answer_text = 'Send me a voice message or a photo, please.'

    context.bot.send_message(parse_mode=ParseMode.MARKDOWN,
                             chat_id=update.effective_chat.id,
                             text=answer_text)
    close_conn(conn)

    logger.info(f'Answer ready for {update.effective_user.name} '
                f'at {update.effective_message.date}')
Code example #18
File: RegisterPeriod.py  Project: jagritiS/PerFem
    def submit_data(self):
        cl = self.cl.get()
        pl = self.pl.get()

        print("check 1111========================")
        sql = "INSERT INTO period_details(user_id,cycle_length,period_length) VALUES (%s,%s,%s)"

        val = (reg_user, cl, pl)
        insert_data(sql, val)
        self.master.destroy()
        os.system('python3 HomePage.py')
        messagebox.showinfo("Details", "Details Added Successfully")
Code example #19
def check_price(URL, desired_price):
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    product_name = soup.find(class_="productName").get_text()
    price = soup.select_one('div[class="Z19v"] strong').get_text(strip=True)

    if ("–" in price):
        price_clean = float(price.replace("–", "00"))
    else:
        price_clean = float(price)

    db.insert_data(product_name, price_clean, desired_price)
Code example #20
File: certificate.py  Project: ttbug/certs-parser
    def run(self):
        #certs = self.get_certs_by_openssl(args.url, args.port)
        #certs = self.get_cert_by_scoket(args.url, args.port)
        certs = self.get_cert_by_ssl(args.url, args.port)

        result = self.parse_certs(certs)
        extresult = self.parse_cert_extension(certs)
        #print extresult
        cert_dict = {'_id': args.url, 'certs': certs}
        cert_dict.update(result)
        cert_dict.update(extresult)
        # insert to mongodb
        insert_data(cert_dict)
Code example #21
File: main.py  Project: chendss/cloneWebPage
def insert_db(href, html):
    p = init_folder(href)
    c = CopyFactory(html, p, href).main()
    data = {
        'id': c.id,
        'path': c.path,
        "title": c.title,
        "cover": c.cover,
        "url": href,
        "description": c.description,
        'text': extract_html_text(c.soup),
    }
    insert_data('data', data)  # write the record to the database
Code example #22
File: spider.py  Project: Hotobun/weather
def get_weather(url):
    # Scrape the page and store the rows in the SQL database
    r = requests.get(url, headers={"user-agent": "mozilla/5.0"})
    print(r.url)
    t = etree.HTML(r.text)
    ul = t.xpath("//ul[@class='lishitable_content clearfix']")[0]
    for i in ul.xpath("./li")[:-1]:
        temp = i.xpath("./div/text()")
        if len(temp) == 4:
            date = i.xpath("./div/a/text()")[0]
            if date:
                temp = [date] + temp
        db.insert_data(temp)
Code example #23
File: bungoo.py  Project: soy-curd/Bungoo
def download():
    link1 = "http://www.aozora.gr.jp/cards/000035/files/236_19996.html"
    link2 = "http://www.aozora.gr.jp/cards/000035/files/1572_19910.html"
    link3 = "http://www.aozora.gr.jp/cards/000035/files/1578_44923.html"

    srcs = [textdownload(x) + [x] for x in [
        link1,
        link2,
        link3,
    ]]

    db.make_table()
    for x in srcs:
        db.insert_data(*x)
Code example #24
def load_ads(scraper, db_conn):
    ads = list()
    GROUP_SIZE = 10
    for adid in db.get_data(db_conn):
        a = scraper.get_ad(adid[0])
        if not (a):
            continue
        ads.append(a.to_tuple())
        if len(ads) == GROUP_SIZE:
            db.insert_data(conn=db_conn, data=ads)
            db.delete_data(conn=db_conn, ids=[(ad[0], ) for ad in ads])
            ads = list()
        time.sleep(1)
    db.insert_data(conn=db_conn, data=ads)
    db.delete_data(conn=db_conn, ids=[(ad[0], ) for ad in ads])
Code example #25
File: pystunden.py  Project: martinfischer/pystunden
    def popup_neuer_eintrag(self):
        """
        Opens the Neuer Eintrag popup and then refreshes the column tree.
        """
        popup = Toplevel(self)
        d = popup_neu.NeuerEintrag(popup)
        popup.update_idletasks()
        popup.update()
        popup.focus_set()
        popup.grab_set()
        popup.wait_window(popup)
        if d.result:
            db.insert_data(d.result)

        self.update_tabelle()
Code example #26
def populate_db():
    for pick in tqdm(glob('dl/pickles/*.pickle')):
        for face in pickle.loads(open(pick, 'rb').read()):
            data = {}
            data['folder_name'] = face['imagePath'].split('/')[-2]
            data['image_name'] = face['imagePath'].split('/')[-1]
            data['location'] = {
                'y1': face['loc'][0],
                'x2': face['loc'][1],
                'y2': face['loc'][2],
                'x1': face['loc'][3]
            }
            data['encoding'] = pickle.dumps(face['encoding'])
            data['tagged'] = False
            db.insert_data('faces', data)
Code example #27
File: main.py  Project: srthurman/load_dynamodb
def run():
    ''' Entry point for the application '''
    place_table = 'Places'
    mapzen_search = {
        'url': "https://search.mapzen.com/v1/nearby",
        'params': {
            'layers': 'venue',
            'key': os.environ.get("MAPZEN_API_KEY"),
            'size': '30',
            'point.lon': '-122.44674682617188',
            'point.lat': '37.75280111220671',
            'boundary.circle.radius': '12',
            'sources': 'gn'
        }
    }
    mapzen_json = fetch_data(mapzen_search)
    mapzen_data_to_load = transform_data(mapzen_json['features'])
    insert_data(place_table, mapzen_data_to_load)
Code example #28
File: crawler.py  Project: linkJX/json
    def run(self):
        """
        Main routine executed by each worker thread.
        """

        if self.field is None:
            self.field = []

        # Initialise the dict that holds one record to be inserted inside the loop
        single_data = {}

        while True:

            # Take one request off the queue
            req = self.que.get_nowait()

            # A None sentinel means the queue is drained, so exit
            if req is None:
                break

            # Pause 0.5 s between requests, send it, and parse the JSON response
            time.sleep(0.5)
            init_content = self.s.send(req)
            content = json.loads(init_content.content)
            logger.info(req.url)

            # Keep only the fields we need
            index = parse(self.field[0], content)[0]
            for i in content[index]:
                try:
                    for f in self.field:
                        single_data[f] = i[f]
                except TypeError:
                    logger.error("请传入列表格式的field")
                    break
                else:
                    insert_data(single_data, info=self.info)
                    # logger.info(single_data)
                    single_data = {}

            # Put the None sentinel back so other workers can exit too
            self.que.put(None)
        return None
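
Example #28 assumes surrounding setup that the snippet does not show: a queue pre-filled with prepared requests plus a None sentinel, a shared requests.Session, and the worker threads themselves. A rough sketch of how that wiring might look; the URL, field names and thread count are guesses:

import queue

import requests

que = queue.Queue()
session = requests.Session()

# Queue up a few prepared requests, then the None sentinel the workers look for.
for page in range(1, 4):
    req = requests.Request("GET", "https://example.com/api", params={"page": page})
    que.put(session.prepare_request(req))
que.put(None)

# The worker class from the snippet would then be instantiated and started, e.g.:
# workers = [Crawler(que, session, field=["data", "id", "name"], info="demo") for _ in range(4)]
# for w in workers:
#     w.start()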
Code example #29
def crawl_city_detail(url):

    response = request_util(url, 'gb2312')
    try:
        soup = BeautifulSoup(response, "lxml")
        info_list = soup.find('table',
                              class_="citytable").find_all("tr",
                                                           class_="citytr")

        for k, item in enumerate(info_list):

            code = item.contents[0].a.get_text().encode('utf8')
            name = item.contents[1].a.get_text().encode('utf8')
            parent_code, parent_name = get_province_code(code)
            level = 2
            insert_data(code, name, parent_code, parent_name, level)

    except Exception, e:
        print traceback.format_exc()
Code example #30
File: scrapped_sites.py  Project: Atheem/E_reputation
def jamaalfna(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    title = soup.find('h1', {"class": "entry-title"})
    date = soup.find('time', {"class": "entry-date"})
    content = soup.find('div', {"class": "entry-content"})
    content_p = content.findAll('p')
    media = soup.find('img', {"class": "wp-post-image"})
    article = "\n"
    for p in content_p:
        article += p.text
    data = {
        'Title': title.text,
        'Date': date.text,
        'Author': 'None',
        'Content': article,
        'Image': media['src']
    }
    db.insert_data(data)
Code example #31
File: scrapped_sites.py  Project: Atheem/E_reputation
def kech24(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    title = soup.find('h2', {"itemprop": "name headline"})
    date = soup.find('time', {"itemprop": "dateCreated datePublished"})
    content = soup.find('div', {"itemprop": "text"})
    content_p = content.findAll('p')
    media = soup.find('img', {"itemprop": "image"})
    autor = soup.find('span', {"itemprop": "author"})
    article = "\n"
    for p in content_p:
        article += p.text
    data = {
        'Title': title.text,
        'Date': date.text,
        'Author': autor.text,
        'Content': article,
        'Image': media['src']
    }
    db.insert_data(data)
Code example #32
File: scrapped_sites.py  Project: Atheem/E_reputation
def alayam24(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    title = soup.find('h1', {"class": "heading"})
    date = soup.find('span', {"class": "date"})
    content = soup.find('div', {"class": "articlecontent"})
    content_p = content.findAll('p')
    media = soup.find('img', {"class": "lazy"})
    autor = soup.find('span', {"class": "writer"})
    article = "\n"
    for p in content_p:
        article += p.text
    data = {
        'Title': title.text,
        'Date': date.text,
        'Author': autor.text,
        'Content': article,
        'Image': media['data-original']
    }
    db.insert_data(data)