Ejemplo n.º 1
0
    def parse_page(self, response):
        """Parse one FxNews article page into a populated NewscrawlItem."""
        self.log('Fetch FxNews page: %s' % response.url)
        item = NewscrawlItem()

        # Headline text and the canonical article URL.
        raw_title = response.xpath(
            '/html/body/div[7]/div[1]/div[2]/text()').extract()[0]
        item['title'] = raw_title.encode('utf-8')
        item['URL'] = response.url

        # The page renders the timestamp with CJK date markers
        # (U+5E74 year, U+6708 month, U+65E5 day).  Normalize those to
        # "YYYY-MM-DD HH:MM", append ":00" for seconds, then convert to
        # a Unix epoch integer.
        raw_stamp = response.xpath(
            '/html/body/div[7]/div[1]/div[3]/span[1]/text()').extract()[0]
        normalized = raw_stamp.replace(u'\u5e74', u'-')
        normalized = normalized.replace(u'\u6708', u'-')
        normalized = normalized.replace(u'\u65e5', '')
        time_common = normalized.encode('utf-8') + ':00'
        item['time'] = int(
            time.mktime(time.strptime(time_common, '%Y-%m-%d %H:%M:%S')))

        # First body paragraph (raw markup) and the comma-joined tag list.
        item['content'] = response.xpath(
            '/html/body/div[7]/div[1]/div[4]/div[1]/p[1]').extract()[0].encode(
                'utf-8')
        tag_texts = response.xpath(
            '/html/body/div[7]/div[1]/div[4]/div[1]/div[2]/div[2]/ul/li/a/text()'
        ).extract()
        item['tags'] = ','.join(tag_texts).encode('utf-8')

        # classifier() returns exactly 28 flags in this fixed order;
        # unpack them straight into the per-instrument item fields.
        (item['JPY_news'], item['JPY_norm'], item['JPY_analy'],
         item['CHF_news'], item['CHF_norm'], item['CHF_analy'],
         item['USD_news'], item['USD_norm'], item['USD_analy'],
         item['EUR_news'], item['EUR_norm'], item['EUR_analy'],
         item['GBP_news'], item['GBP_norm'], item['GBP_analy'],
         item['AUD_news'], item['AUD_norm'], item['AUD_analy'],
         item['CAD_news'], item['CAD_norm'], item['CAD_analy'],
         item['RMB_news'], item['RMB_norm'], item['RMB_analy'],
         item['gold'], item['silver'], item['crude'],
         item['bond']) = classifier(item['title'], item['tags'], kinds_flag)
        item['importance'] = 0

        return item
Ejemplo n.º 2
0
def process():
    total, results = scrape()
    file_read = open('./TOTAL', 'rb')
    TOTAL_PREVIOUS = int(file_read.read().strip())
    file_read.close()

    if detect(total, TOTAL_PREVIOUS):
        file_write = open('./TOTAL', 'wb')
        file_write.write(str(total))
        file_write.close()
        conn = MySQLdb.connect(host="localhost",
                               user="******",
                               passwd="123456",
                               db="newsPool",
                               charset='utf8',
                               use_unicode=True)
        cursor = conn.cursor()

        for news in results:
            Time = str(int(time.time()))
            keys_list = [
                'id', 'time', 'title', 'content', 'JPY_news', 'JPY_norm',
                'JPY_analy', 'CHF_news', 'CHF_norm', 'CHF_analy', 'USD_news',
                'USD_norm', 'USD_analy', 'EUR_news', 'EUR_norm', 'EUR_analy',
                'GBP_news', 'GBP_norm', 'GBP_analy', 'AUD_news', 'AUD_norm',
                'AUD_analy', 'CAD_news', 'CAD_norm', 'CAD_analy', 'RMB_news',
                'RMB_norm', 'RMB_analy', 'gold', 'silver', 'crude', 'bond',
                'importance'
            ]
            keys = ','.join(keys_list)

            values_str_list_tmp = [
                news[u'id'], Time, news[u'title'], news[u'contentHtml']
            ]
            value_str_list = [
                "\"" + x.encode('utf-8').replace("\"", "\\\"") + "\""
                for x in values_str_list_tmp
            ]
            kinds_list_tmp = classifier(news[u'title'].encode('utf-8'), '',
                                        kinds_flag) + [0]
            kinds_list = [str(y) for y in kinds_list_tmp]
            value_list = value_str_list + kinds_list
            values = ','.join(value_list)

            sql = "insert into wallstreet_realtime_news (%s) values(%s)" % (
                keys, values)
            try:
                cursor.execute(sql)
            except MySQLdb.IntegrityError:
                pass
        cursor.close()
        conn.commit()
        conn.close()
        print "update has been appended"
    else:
        print "there is no update"
Ejemplo n.º 3
0
    def parse_page(self, response):
        """Extract a single FxNews article from *response* into a NewscrawlItem."""
        self.log("Fetch FxNews page: %s" % response.url)
        item = NewscrawlItem()

        # Absolute XPaths into the fixed FxNews page layout.
        title_xp = "/html/body/div[7]/div[1]/div[2]/text()"
        date_xp = "/html/body/div[7]/div[1]/div[3]/span[1]/text()"
        body_xp = "/html/body/div[7]/div[1]/div[4]/div[1]/p[1]"
        tags_xp = "/html/body/div[7]/div[1]/div[4]/div[1]/div[2]/div[2]/ul/li/a/text()"

        item["title"] = response.xpath(title_xp).extract()[0].encode("utf-8")
        item["URL"] = response.url

        # Swap the CJK year/month markers for "-" and drop the day marker
        # so the stamp parses as "%Y-%m-%d %H:%M:%S"; the page omits
        # seconds, hence the appended ":00".
        stamp = response.xpath(date_xp).extract()[0]
        for marker, repl in ((u"\u5e74", u"-"), (u"\u6708", u"-"), (u"\u65e5", "")):
            stamp = stamp.replace(marker, repl)
        time_common = stamp.encode("utf-8") + ":00"
        item["time"] = int(
            time.mktime(time.strptime(time_common, "%Y-%m-%d %H:%M:%S")))

        item["content"] = response.xpath(body_xp).extract()[0].encode("utf-8")
        item["tags"] = ",".join(response.xpath(tags_xp).extract()).encode("utf-8")

        # classifier() yields 28 flags in this exact order; unpack them
        # into the per-instrument fields in one assignment.
        (item["JPY_news"], item["JPY_norm"], item["JPY_analy"],
         item["CHF_news"], item["CHF_norm"], item["CHF_analy"],
         item["USD_news"], item["USD_norm"], item["USD_analy"],
         item["EUR_news"], item["EUR_norm"], item["EUR_analy"],
         item["GBP_news"], item["GBP_norm"], item["GBP_analy"],
         item["AUD_news"], item["AUD_norm"], item["AUD_analy"],
         item["CAD_news"], item["CAD_norm"], item["CAD_analy"],
         item["RMB_news"], item["RMB_norm"], item["RMB_analy"],
         item["gold"], item["silver"], item["crude"],
         item["bond"]) = classifier(item["title"], item["tags"], kinds_flag)
        item["importance"] = 0

        return item
Ejemplo n.º 4
0
def process():
    total, results = scrape()
    file_read = open('./TOTAL', 'rb')
    TOTAL_PREVIOUS = int(file_read.read().strip())
    file_read.close()

    if detect(total, TOTAL_PREVIOUS):
        file_write = open('./TOTAL', 'wb')
        file_write.write(str(total))
        file_write.close()
        conn = MySQLdb.connect(host="localhost",user="******", passwd="123456",
                             db="newsPool", charset='utf8', use_unicode=True)
        cursor = conn.cursor()

        for news in results:
            Time = str(int(time.time()))
            keys_list = ['id','time','title','content',
                        'JPY_news','JPY_norm','JPY_analy','CHF_news','CHF_norm','CHF_analy','USD_news','USD_norm','USD_analy',
                        'EUR_news','EUR_norm','EUR_analy','GBP_news','GBP_norm','GBP_analy','AUD_news','AUD_norm','AUD_analy',
                        'CAD_news','CAD_norm','CAD_analy','RMB_news','RMB_norm','RMB_analy','gold','silver','crude','bond','importance']
            keys = ','.join(keys_list)

            values_str_list_tmp = [news[u'id'], Time, news[u'title'], news[u'contentHtml']]
            value_str_list = ["\""+x.encode('utf-8').replace("\"", "\\\"")+"\"" for x in values_str_list_tmp]
            kinds_list_tmp = classifier(news[u'title'].encode('utf-8'), '', kinds_flag) + [0]
            kinds_list = [str(y) for y in kinds_list_tmp]
            value_list = value_str_list + kinds_list
            values = ','.join(value_list)

            sql = "insert into wallstreet_realtime_news (%s) values(%s)" % (keys, values)
            try:
                cursor.execute(sql)
            except MySQLdb.IntegrityError:
                pass
        cursor.close()
        conn.commit()
        conn.close()
        print "update has been appended"
    else:
        print "there is no update"