def __init__(self, url): self.cnx = databaseConfig.dbconn("") self.url = url self.headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'referer': 'http://www.asahi.com/information/service/rss.html' }
def __init__(self, url): self.cnx = databaseConfig.dbconn("") self.url = url self.headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'referer': 'https://techcrunch.com' }
def __init__(self, url): # super(appleDailyTw, self).__init__() self.cnx = databaseConfig.dbconn("") self.url = url # self.createdDate = createdDate self.headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'referer': 'http://www.appledaily.com.tw/' }
def __init__(self, platform_id, token, title, content, href, created_time, updated_time): super(migration, self).__init__() self.cnx = databaseConfig.dbconn("") self.platform_id = platform_id self.token = token self.title = title self.content = content self.href = href self.created_time = created_time self.updated_time = updated_time if self.token is None: self.token = hashlib.sha256(self.href).hexdigest()
def __init__(self, url): # super(pingTester, self).__init__() self.url = url self.count = 1 self.cnx = databaseConfig.dbconn("") self.domain = "http://www.aastocks.com" # self.t = int(time.time()) # self.condition = self.t - 3600 ##Test # self.t = datetime.strptime("2017-07-13 05:00", "%Y-%m-%d %H:00") # self.condition = time.mktime(self.t.timetuple()) - 3600 self.t = int(time.time()) self.condition = self.t - 3600
def call(self, token, content): ''' Key Word List :return: ''' cnx = databaseConfig.dbconn("") cursor = cnx.cursor() query = "SELECT tag FROM crawler.feeds where token=%s;" cursor.execute(query, (token, )) results = cursor.fetchall() for i in results: print str(i) tag = ["None"] if results is not None: return tag keyWord = ["性侵", "裸體", "新北市"] url = "https://language.googleapis.com/v1/documents:analyzeEntities?key=AIzaSyApD2KOIDycoDrmvsrzp6BrasIIRCDNawQ" data = { 'encodingType': 'UTF8', 'document': { 'type': 'PLAIN_TEXT', 'content': content } } json_data = json.dumps(data) response = requests.post(url, json_data) responseJson = response.json() tag = [] for i in keyWord: if i in content: tag.append(i) for i in responseJson["entities"]: if len(tag) > 10: continue if len(i["name"]) > 30 or i["name"] in tag or len(i["name"]) < 1: continue if i["salience"] > 0.03: tag.append(i['name']) if i["type"] == "LOCATION" and len(i["name"]) > 1: tag.append(i['name']) tagJson = json.dumps(tag, encoding='UTF-8', ensure_ascii=False) print tagJson
from datetime import datetime from bs4 import BeautifulSoup from tabulate import tabulate import re import hashlib import sys import databaseConfig from contextlib import closing import random import urllib2 import json reload(sys) sys.setdefaultencoding('utf8') cnx = databaseConfig.dbconn("") ##threading.Thread class migration(threading.Thread): def __init__(self, platform_id, token, title, content, href, created_time, updated_time): super(migration, self).__init__() self.cnx = databaseConfig.dbconn("") self.platform_id = platform_id self.token = token self.title = title self.content = content self.href = href self.created_time = created_time self.updated_time = updated_time
def run(self): headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36', 'referer': 'https://www.daydaycook.com' } cnx = databaseConfig.dbconn("") for a in range(122, 123): display = Display(visible=0, size=(800, 600)) display.start() browser = webdriver.Firefox() # browser.get('https://www.towngascooking.com/tc/Recipe/') browser.get( 'http://www.daydaycook.com/daydaycook/hk/website/recipe/list.do' ) #It can go to other page but wrong page # element = browser.find_element_by_id("currentPage") # element.send_keys(3) # browser.find_element_by_id("pageListForm").submit() # browser.find_element_by_class_name("dinggou").click() ## # browser.execute_script('return $(\"(.pagination a\").eq(3).click();') time.sleep(5) # browser.execute_script("getList(4);") browser.execute_script("pageSkip(" + str(a) + ");") time.sleep(1) # handles = set(browser.window_handles) # handles.remove(browser.current_window_handle) # browser.switch_to_window(handles.pop()) htmlCode = browser.page_source browser.close() display.popen.terminate() soup = BeautifulSoup(htmlCode, "lxml") items = soup.find("div", {"class": "resultList justify three"}) for item in items.findAll("div", {"class": "box"}): href = item.a.get("href") token = hashlib.sha256(href).hexdigest() r = requests.get(href, headers=headers) soup = BeautifulSoup(r.content, "lxml") title = soup.find("div", {"class": "title"}).b.text print title # ingredients Start ingredients = soup.find("div", { "class": "table" }).table.find_all("tr") tag = {} for tr in ingredients: _ingredients = tr.find_all('td')[0].text _grading = re.sub('[\s+]', '', tr.find_all('td')[1].text) tag[_ingredients] = _grading ingredientJson = json.dumps(tag, encoding='UTF-8', ensure_ascii=False) # ingredients End ##Step start stepList = soup.find("div", { "class": "stepList" }).find_all("div", {"class": "list justify"}) stepNo = 0 stepArr = [] stepImgArr = [] for _step in stepList: stepImgArr.append(_step.img.get("data-src")) stepArr.append(_step.pre.text) stepZip = zip(stepArr, stepImgArr) stepJson = json.dumps(stepZip, encoding='UTF-8', ensure_ascii=False) ##Step end ##Time cookingTime = soup.find("div", { "class": "timeLen" }).find_all("span")[1].text try: with closing(cnx.cursor()) as cursor: cursor.execute( "INSERT INTO recipes(`token`,`title`,`ingredients`,`cooking_time`,`display_content`,`source`,`article_url`,`created_time`,`updated_time`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)ON DUPLICATE KEY UPDATE `title` = %s, `updated_time` = %s", ( token, title, ingredientJson, cookingTime, stepJson, 'dayCook', href, int(time.time()), int(time.time()), title, int(time.time()), )) cnx.commit() except TypeError as e: print(e) print "error" try: with closing(cnx.cursor()) as cursor: cursor.execute( "INSERT INTO recipes_details(`token`,`content`,`created_time`,`updated_time`) VALUES (%s,%s,%s,%s)ON DUPLICATE KEY UPDATE `updated_time` = %s", ( token, r.content, int(time.time()), int(time.time()), int(time.time()), )) cnx.commit() except TypeError as e: print(e) print "error" time.sleep(2) cnx.close()
def __init__(self, country): super(bloomberg, self).__init__() self.cnx = databaseConfig.dbconn("") self.country = country