def get_specific_element(element, text):
    """
    Pull one configured piece of information out of an html source.

    The entry stored under ``element`` in ./__configure__/configure.xml is
    loaded on every call; it must unpack into exactly two values, which are
    forwarded to ``get_res`` together with ``text``.

    :param element: the info to be crawled (tag name in the configure file)
    :param text: the html source
    :return: the value of the info, as produced by ``get_res``
    """
    parser = ConfigureParser('./__configure__/configure.xml')
    rule = parser.get_configure_by_tag_name(element)
    # Unpacking raises if the configure entry is not a two-item sequence.
    choice, pattern = rule
    return get_res(choice, pattern, text)
def jd_search(keys):
    """
    Search every key and append "key, tab, count" lines to the output file.

    For each key the page fetched with ``get_html(handle_type(key))`` is
    queried with the xpath taken from the configure file, and the first match
    is converted with ``num_trans``. When the extraction or conversion fails
    the count is recorded as '0'. The result is printed and appended to the
    output file right after each key, so progress made before an interruption
    is kept on disk.

    :param keys: iterable of key strings to search for
    :return: None
    """
    conf = ConfigureParser('./__configure__/configure.xml')
    output_file = conf.get_configure_by_tag_name('output_file')
    xpath = conf.get_configure_by_tag_name('xpath')
    for key in keys:
        # Build the request target once; it is reused for the progress print.
        query = handle_type(key)
        source = get_html(query)
        try:
            matches = Selector(text=source).xpath(xpath).extract()
            num = str(num_trans(matches[0]))
        except Exception:
            # Best effort: a page without a usable match counts as zero.
            # Deliberately not a bare ``except`` so that KeyboardInterrupt
            # and SystemExit still stop the crawl.
            num = '0'
        # Single-argument parenthesised form prints the same text whether
        # ``print`` is a statement or a function.
        print("%s %s" % (query, num))
        with open(output_file, 'ab') as f:
            f.write(key + '\t' + num + '\n')
def get_keys():
    """
    Return the keys from the input file that still have to be processed.

    Every line of the input file (stripped) is a key. If the output file
    already exists, each of its lines is stripped and everything before the
    last tab is treated as an already finished key; those keys are left out of
    the result. The order of the input file is preserved.

    :return: a list of keys that do not appear in the output file yet
    """
    conf = ConfigureParser('./__configure__/configure.xml')
    input_file = conf.get_configure_by_tag_name('input_file')
    output_file = conf.get_configure_by_tag_name('output_file')

    with open(input_file, 'rb') as f:
        keys = [each.strip() for each in f]

    if not os.path.exists(output_file):
        return keys

    with open(output_file, 'rb') as f:
        # A set keeps the membership test below O(1) per key instead of
        # scanning a list of finished keys for every input key.
        finished = set(each.strip().rsplit('\t', 1)[0] for each in f)
    return [each for each in keys if each not in finished]
import time
import random

from __spider__.doubanJudge import DoubanJudge
from __configure__.ConfigureParser import ConfigureParser

# Script entry point: run one DoubanJudge crawl per key listed in the
# configured input file, pausing a few seconds between crawls.
if __name__ == '__main__':
    conf = ConfigureParser('./__configure__/configure.xml')
    input_file = conf.get_configure_by_tag_name_simple('input_file')
    output_file = conf.get_configure_by_tag_name_simple('output_file')
    with open(input_file, 'r') as f:
        for each in f:
            # The key is everything before the last tab of the line.
            key = each.rsplit('\t', 1)[0].strip()
            if not key:
                # Blank lines carry no key; do not crawl (or sleep) for them.
                continue
            # NOTE(review): unicode(key) decodes with the interpreter's
            # default encoding; non-ASCII keys will raise unless that default
            # has been changed elsewhere -- confirm the input file encoding.
            crawler = DoubanJudge(unicode(key), output_file)
            crawler.handle()
            # Random 3-5 second pause between consecutive crawls.
            time.sleep(random.randint(3, 5))
#!/usr/bin/python # -*-coding:utf-8-*- import urllib2 import json import requests import datetime import re import sys from __util__.library import handle_data, mail_notification, TransException, my_retry from __configure__.ConfigureParser import ConfigureParser reload(sys) sys.setdefaultencoding('utf8') Configure = ConfigureParser('./__configure__/configure.xml') admin = Configure.get_configure_by_tag_name('admin') def get_now_time(): """ to get the last date when there is a music rank list on QQ music the rank list may update late, so give a timedelta 4 days. :return: a string of date such as: 2016-8-20 """ now = datetime.datetime.now() - datetime.timedelta(3) return now.strftime('%Y-%m-%d') def get_current_nums(): """ get the current number of the rank list :return: a string of year and the number such as: 2016_37, presenting the 37th rank list of 2016
# -*-coding:utf-8-*- import datetime import time from __util__.library import create_post_urls, mail_notification from __configure__.ConfigureParser import ConfigureParser from __spider__.spidermovie import handle_movie from __spider__.spidermusic import handle_music from __spider__.spiderseries import handle_series from __spider__.netEaseMusic import handle_netease_music Configure = ConfigureParser('./__configure__/configure.xml') # get interval time from the configure file interval = int(eval(Configure.get_configure_by_tag_name('update_duration'))) def manage(func): """ a decorator to print executing info and set sleep time :param func: the function being executed :return: an auxiliary function """ def wraps(*arg, **args): while 1: print "Start executing at {}".format(datetime.datetime.now()) func(*arg, **args) print "End executing at {}".format(datetime.datetime.now()) time.sleep(interval) return wraps @manage