Example 1
 def __init__(self, url):
     """Remember the feed URL, open a DB connection, and build
     browser-like request headers (Asahi RSS page as referer)."""
     self.url = url
     # DB handle from the project-local config helper.
     self.cnx = databaseConfig.dbconn("")
     # Desktop Chrome UA so the site serves its normal markup.
     ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
           'AppleWebKit/537.36 (KHTML, like Gecko) '
           'Chrome/39.0.2171.95 Safari/537.36')
     self.headers = {
         'User-Agent': ua,
         'referer': 'http://www.asahi.com/information/service/rss.html'
     }
Example 2
 def __init__(self, url):
     """Remember the article URL, open a DB connection, and build
     browser-like request headers (TechCrunch as referer)."""
     self.url = url
     # DB handle from the project-local config helper.
     self.cnx = databaseConfig.dbconn("")
     # Desktop Chrome UA so the site serves its normal markup.
     ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
           'AppleWebKit/537.36 (KHTML, like Gecko) '
           'Chrome/39.0.2171.95 Safari/537.36')
     self.headers = {
         'User-Agent': ua,
         'referer': 'https://techcrunch.com'
     }
Example 3
 def __init__(self, url):
     """Remember the article URL, open a DB connection, and build
     browser-like request headers (Apple Daily TW as referer)."""
     self.url = url
     # DB handle from the project-local config helper.
     self.cnx = databaseConfig.dbconn("")
     # Desktop Chrome UA so the site serves its normal markup.
     ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
           'AppleWebKit/537.36 (KHTML, like Gecko) '
           'Chrome/39.0.2171.95 Safari/537.36')
     self.headers = {
         'User-Agent': ua,
         'referer': 'http://www.appledaily.com.tw/'
     }
Example 4
 def __init__(self, platform_id, token, title, content, href, created_time,
              updated_time):
     """Capture one crawled article's fields on the thread instance.

     When *token* is None, a stable token is derived by hashing the
     article URL, so the same link always maps to the same row key.
     """
     super(migration, self).__init__()
     self.cnx = databaseConfig.dbconn("")
     self.platform_id = platform_id
     self.title = title
     self.content = content
     self.href = href
     self.created_time = created_time
     self.updated_time = updated_time
     # Fall back to a URL-derived token when the caller supplied none.
     if token is None:
         self.token = hashlib.sha256(self.href).hexdigest()
     else:
         self.token = token
Example 5
 def __init__(self, url):
     """Set up the AAStocks crawler: target URL, DB connection, and a
     one-hour look-back window anchored at the current time."""
     self.url = url
     self.count = 1
     self.cnx = databaseConfig.dbconn("")
     self.domain = "http://www.aastocks.com"
     # Only consider items newer than one hour before "now".
     self.t = int(time.time())
     self.condition = self.t - 3600
Example 6
def call(self, token, content):
    '''
        Key Word List
        :return:
        '''
    cnx = databaseConfig.dbconn("")
    cursor = cnx.cursor()
    query = "SELECT tag FROM crawler.feeds where token=%s;"
    cursor.execute(query, (token, ))
    results = cursor.fetchall()
    for i in results:
        print str(i)
    tag = ["None"]
    if results is not None:
        return tag

    keyWord = ["性侵", "裸體", "新北市"]

    url = "https://language.googleapis.com/v1/documents:analyzeEntities?key=AIzaSyApD2KOIDycoDrmvsrzp6BrasIIRCDNawQ"
    data = {
        'encodingType': 'UTF8',
        'document': {
            'type': 'PLAIN_TEXT',
            'content': content
        }
    }

    json_data = json.dumps(data)

    response = requests.post(url, json_data)
    responseJson = response.json()
    tag = []
    for i in keyWord:
        if i in content:
            tag.append(i)

    for i in responseJson["entities"]:
        if len(tag) > 10:
            continue
        if len(i["name"]) > 30 or i["name"] in tag or len(i["name"]) < 1:
            continue
        if i["salience"] > 0.03:
            tag.append(i['name'])
        if i["type"] == "LOCATION" and len(i["name"]) > 1:
            tag.append(i['name'])

    tagJson = json.dumps(tag, encoding='UTF-8', ensure_ascii=False)
    print tagJson
Example 7
import hashlib
import json
import random
import re
import sys
import threading
import urllib2
from contextlib import closing
from datetime import datetime

from bs4 import BeautifulSoup
from tabulate import tabulate

import databaseConfig

# Python 2 hack: re-expose sys.setdefaultencoding and switch the default
# codec to UTF-8 so implicit str/unicode conversions of CJK text do not
# raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')

# Shared module-level DB handle ("" presumably selects the default
# connection profile — TODO confirm against databaseConfig.dbconn).
cnx = databaseConfig.dbconn("")


# migration extends threading.Thread so each record can be handled in
# its own worker thread.
class migration(threading.Thread):
    """Worker thread carrying one crawled article (platform, token,
    title, content, link and timestamps) for migration into the DB."""

    def __init__(self, platform_id, token, title, content, href, created_time,
                 updated_time):
        # Initialise the Thread machinery before attaching our state.
        super(migration, self).__init__()
        self.cnx = databaseConfig.dbconn("")
        self.token = token
        self.platform_id = platform_id
        self.href = href
        self.title = title
        self.content = content
        self.created_time = created_time
        self.updated_time = updated_time
Example 8
    def run(self):
        """Scrape recipe pages from daydaycook.com and upsert them into
        the `recipes` / `recipes_details` tables.

        Drives a headless Firefox (via a virtual X display) to render
        the JS-paginated listing, then fetches each recipe page with
        requests and parses out title, ingredients, steps and cooking
        time before writing both the parsed row and the raw HTML.
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'referer': 'https://www.daydaycook.com'
        }
        cnx = databaseConfig.dbconn("")
        # NOTE(review): only listing page 122 is crawled; widen the range
        # to re-crawl more pages.
        for a in range(122, 123):
            # Headless X display so Firefox can run on a server.
            display = Display(visible=0, size=(800, 600))
            display.start()

            browser = webdriver.Firefox()
            browser.get(
                'http://www.daydaycook.com/daydaycook/hk/website/recipe/list.do'
            )
            # Let the JS listing load, then jump to page `a` through the
            # site's own pagination function.
            time.sleep(5)
            browser.execute_script("pageSkip(" + str(a) + ");")
            time.sleep(1)
            htmlCode = browser.page_source
            browser.close()
            display.popen.terminate()

            soup = BeautifulSoup(htmlCode, "lxml")
            items = soup.find("div", {"class": "resultList justify three"})
            for item in items.findAll("div", {"class": "box"}):
                href = item.a.get("href")
                # Stable primary key derived from the recipe URL.
                token = hashlib.sha256(href).hexdigest()
                r = requests.get(href, headers=headers)
                soup = BeautifulSoup(r.content, "lxml")
                title = soup.find("div", {"class": "title"}).b.text
                print title
                # -- ingredients: name -> quantity mapping --
                ingredients = soup.find("div", {
                    "class": "table"
                }).table.find_all("tr")
                tag = {}
                for tr in ingredients:
                    _ingredients = tr.find_all('td')[0].text
                    # Strip all whitespace from the quantity cell.
                    _grading = re.sub('[\s+]', '', tr.find_all('td')[1].text)
                    tag[_ingredients] = _grading
                ingredientJson = json.dumps(tag,
                                            encoding='UTF-8',
                                            ensure_ascii=False)

                # -- steps: list of (text, image URL) pairs --
                stepList = soup.find("div", {
                    "class": "stepList"
                }).find_all("div", {"class": "list justify"})
                stepNo = 0
                stepArr = []
                stepImgArr = []
                for _step in stepList:
                    stepImgArr.append(_step.img.get("data-src"))
                    stepArr.append(_step.pre.text)
                stepZip = zip(stepArr, stepImgArr)
                stepJson = json.dumps(stepZip,
                                      encoding='UTF-8',
                                      ensure_ascii=False)
                # The second <span> in the timeLen block holds the value.
                cookingTime = soup.find("div", {
                    "class": "timeLen"
                }).find_all("span")[1].text

                # Upsert the parsed recipe summary row.
                try:
                    with closing(cnx.cursor()) as cursor:
                        cursor.execute(
                            "INSERT INTO recipes(`token`,`title`,`ingredients`,`cooking_time`,`display_content`,`source`,`article_url`,`created_time`,`updated_time`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)ON DUPLICATE KEY UPDATE `title` = %s, `updated_time` = %s",
                            (
                                token,
                                title,
                                ingredientJson,
                                cookingTime,
                                stepJson,
                                'dayCook',
                                href,
                                int(time.time()),
                                int(time.time()),
                                title,
                                int(time.time()),
                            ))
                    cnx.commit()
                except TypeError as e:
                    print(e)
                    print "error"
                # Upsert the raw page HTML for later re-parsing.
                try:
                    with closing(cnx.cursor()) as cursor:
                        cursor.execute(
                            "INSERT INTO recipes_details(`token`,`content`,`created_time`,`updated_time`) VALUES (%s,%s,%s,%s)ON DUPLICATE KEY UPDATE `updated_time` = %s",
                            (
                                token,
                                r.content,
                                int(time.time()),
                                int(time.time()),
                                int(time.time()),
                            ))
                    cnx.commit()
                except TypeError as e:
                    print(e)
                    print "error"
                # Be polite to the server between recipe fetches.
                time.sleep(2)

        cnx.close()
Example 9
 def __init__(self, country):
     """Remember which country edition to crawl and open a DB handle."""
     super(bloomberg, self).__init__()
     self.country = country
     self.cnx = databaseConfig.dbconn("")