Code Example #1
# assumed imports, not shown in the source snippet
import time

import requests
from bs4 import BeautifulSoup

from database import get_sheet  # assumed: the project's MongoDB helper module


# collect the paginated index URLs for each category and store them in MongoDB
def get_all_links():
    sheet = get_sheet('godness', 'links')
    # normal_urls: list of category index pages, assumed to be defined elsewhere in the file
    for url in normal_urls:
        sheet.insert_one({'url': url})
        print(url)
        time.sleep(0.5)
        for i in range(2, 50):
            index_url = url + 'index_' + str(i) + '.html'
            web_data = requests.get(index_url)
            soup = BeautifulSoup(web_data.text, 'lxml')
            if soup.find('a'):
                print(index_url)
                sheet.insert_one({'url': index_url})
            else:
                break
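Every example on this page goes through a project-level database.get_sheet(db_name, sheet_name) helper. A minimal sketch of what that helper could look like, assuming it is a thin wrapper around a pymongo collection (the connection settings are placeholders, not the project's actual values):

# hypothetical database.py sketch -- not the project's actual code
from pymongo import MongoClient

client = MongoClient('localhost', 27017)  # assumed connection settings


def get_sheet(db_name, sheet_name):
    # return the collection `sheet_name` inside database `db_name`
    return client[db_name][sheet_name]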
Code Example #2
#! /usr/bin/env python
# -*- coding: utf-8 -*-


import database
import requests
from bs4 import BeautifulSoup
import time
import pprint

links = database.get_sheet('godness', 'links')
actors_info = database.get_sheet('godness', 'actors_info')


def get_item_info(url):
    web_data = ''
    while web_data == '':
        try:
            web_data = requests.get(url)
        except:
            print('web_site is tired, let us sleep for 5 seconds')
            print('zzZZ')
            time.sleep(5)
            continue
    soup = BeautifulSoup(web_data.content, 'lxml')
    name = soup.find('h1').text if soup.find('h1') else None
    print('{}:{}'.format(name, url))
    # select() already returns an empty list when nothing matches, so the
    # zip() below simply yields nothing instead of crashing on None
    numbers = soup.select('b')
    dates = soup.select('span > date:nth-of-type(2)')
    titles = soup.select('div > div > span > p')
    for number, date, title in zip(numbers, dates, titles):
        # loop body truncated in the source; a plausible completion is to store
        # each record in the actors_info collection declared above
        actors_info.insert_one({'name': name,
                                'number': number.get_text(),
                                'date': date.get_text(),
                                'title': title.get_text()})
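The snippet does not show a driver for get_item_info; a minimal sketch, assuming the links collection filled by get_all_links in example #1 serves as the work queue:

if __name__ == '__main__':
    # crawl the detail page behind every URL stored by get_all_links
    for link in links.find({}, {'_id': 0, 'url': 1}):
        get_item_info(link['url'])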
Code Example #3
#             datas = list(singer_list.find({}, {'_id': 0, 'Fsinger_mid': 1}).limit(100))
#             if len(datas) > 0:
#                 print('if start')
#                 time.sleep(2)
#                 for data in datas:
#                     data_list.append(data['Fsinger_mid'])
#                 pool.map(get_music_info.get_song_album_id, data_list)
#             else:
#                 break
#             count = count + 100
#         except:
#             continue


# assumed imports for this snippet (not shown in the source): multiprocessing.Pool,
# the project's database module, and its get_music_info module
from multiprocessing import Pool

import database
import get_music_info


# get all song information from song and album ids
if __name__ == '__main__':
    song_album_id_list = database.get_sheet('QQMusic', 'song_album_id_list')
    pool = Pool(4)
    count = 0
    while True:
        try:
            print('crawled songs : ', count)
            # fetch 100 records at a time
            datas = list(song_album_id_list.find({}, {'_id': 0}).limit(100))
            if len(datas) > 0:
                pool.map(get_music_info.get_item_info, datas)
            else:
                break
            count = count + 100
        except:
            continue
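For the else: break branch above to ever fire, the worker passed to pool.map has to remove each record it finishes; otherwise find({}).limit(100) keeps returning the same 100 documents. A minimal sketch of that assumed delete-after-processing pattern (crawl_and_store is a hypothetical placeholder, not the project's code):

def get_item_info(data):
    # hypothetical worker sketch: crawl the song/album described by `data`,
    # store the result, then drop the source record so the batch loop above
    # eventually sees an empty query result and breaks
    crawl_and_store(data)                    # hypothetical helper
    song_album_id_list.delete_one(data)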
Code Example #4
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import requests
import json
import database
import time

singer_list = database.get_sheet('QQMusic', 'singer_list')
headers = {
    'user-agent':
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '41.0.2272.118 Safari/537.36',
    'referer':
    'https://y.qq.com/portal/singer_list.html'
}

url = 'https://c.y.qq.com/v8/fcg-bin/v8.fcg?channel=singer&page=list&key=all_all_all&pagesize=100&pagenum={}&g_tk=1445151743&jsonpCallback=GetSingerListCallback&loginUin=707813012&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0'
urls = [url.format(i) for i in range(1, 5527)]


# get all singers in one page and insert into database
def get_singers(url):
    print('crawling url : ' + url)
    web_data = ''
    while web_data == '':
        try:
            web_data = requests.get(url, headers=headers)
        except:
            print('bad network, let us sleep for 5 seconds')
            print('uncrawled url : ' + url)
            time.sleep(5)  # assumed: matches the 5-second message and the retry pattern in the other crawlers
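    # --- continuation not shown in the source; hedged sketch only ---
    # the endpoint returns JSONP (jsonpCallback=GetSingerListCallback), so the
    # callback wrapper has to be stripped before json can parse the body;
    # the response field names below are assumptions, not taken from the project
    text = web_data.text
    payload = json.loads(text[text.index('(') + 1:text.rindex(')')])
    for singer in payload.get('data', {}).get('list', []):
        singer_list.insert_one({'Fsinger_mid': singer.get('Fsinger_mid')})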
Code Example #5
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import json
import database
import time

song_album_id_list = database.get_sheet('QQMusic', 'song_album_id_list')
delete_singer = database.get_sheet('QQMusic', 'singer_list')
song_item_info = database.get_sheet('QQMusic', 'song_item_info')


# get all songs from one singer and insert into database
def get_song_album_id(Fsinger_mid):
    print(Fsinger_mid)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'referer': 'https://y.qq.com/n/yqq/singer/{}.html'.format(Fsinger_mid)
    }
    begin = 0
    all_songs = []
    while True:
        # one page url with 30 songs or less
        url = 'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg?g_tk=1964444483&jsonpCallback=MusicJsonCallbacksinger_track&loginUin=707813012&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0&singermid={}&order=listen&begin={}&num=30&songstatus=1'.format(Fsinger_mid, begin)
        web_data = ''
        while web_data == '':
            try:
                web_data = requests.get(url, headers=headers)
            except:
                print('bad network, let us sleep for 5 seconds')
                time.sleep(5)
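        # --- continuation not shown in the source; hedged sketch only ---
        # parse web_data here and append the page's songs to all_songs; the
        # helper below is hypothetical, not part of the project
        songs = parse_song_page(web_data.text)  # hypothetical JSONP/JSON parsing helper
        all_songs.extend(songs)
        if len(songs) < 30:  # fewer than a full page of 30 means the last page was reached
            break
        begin += 30  # request the next page of 30 songs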
Code Example #6
File: trim_base.py  Project: AllenDown/godness
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import database
import pprint

# database.delete('walden', 'sheet', {'index': 107})

sheet = database.get_sheet('walden', 'sheet')
sheet_backup = database.get_sheet('walden', 'new_sheet')

# delete_one with an empty filter removes a single (arbitrary) document from the backup sheet
sheet_backup.delete_one({})
Code Example #7
File: report.py  Project: taylorhickem/td-projects
# UI_SHEET, db, and get_reporting_config are module-level names defined
# elsewhere in report.py (not shown in this snippet)
def load_config():
    ''' loads the gsheet user interface config
    '''
    global UI_CONFIG
    config_tbl = db.get_sheet(UI_SHEET, 'config')
    UI_CONFIG = get_reporting_config(config_tbl)