def parse_page(self, response):
    self.log('Fetch FxNews page: %s' % response.url)
    item = NewscrawlItem()
    # Title and source URL of the article.
    item['title'] = response.xpath(
        '/html/body/div[7]/div[1]/div[2]/text()').extract()[0].encode('utf-8')
    item['URL'] = response.url
    # Normalize the Chinese date string (year/month/day markers) into
    # '%Y-%m-%d %H:%M:%S' form, then convert it to a Unix timestamp.
    time_common = response.xpath(
        '/html/body/div[7]/div[1]/div[3]/span[1]/text()').extract()[0].replace(
            u'\u5e74', u'-').replace(u'\u6708', u'-').replace(
            u'\u65e5', '').encode('utf-8') + ':00'
    item['time'] = int(
        time.mktime(time.strptime(time_common, '%Y-%m-%d %H:%M:%S')))
    # First paragraph of the article body and the comma-joined tag list.
    item['content'] = response.xpath(
        '/html/body/div[7]/div[1]/div[4]/div[1]/p[1]').extract()[0].encode('utf-8')
    item['tags'] = ','.join(response.xpath(
        '/html/body/div[7]/div[1]/div[4]/div[1]/div[2]/div[2]/ul/li/a/text()'
    ).extract()).encode('utf-8')
    # classifier() returns one flag per column below; unpack them into the item.
    [item['JPY_news'], item['JPY_norm'], item['JPY_analy'],
     item['CHF_news'], item['CHF_norm'], item['CHF_analy'],
     item['USD_news'], item['USD_norm'], item['USD_analy'],
     item['EUR_news'], item['EUR_norm'], item['EUR_analy'],
     item['GBP_news'], item['GBP_norm'], item['GBP_analy'],
     item['AUD_news'], item['AUD_norm'], item['AUD_analy'],
     item['CAD_news'], item['CAD_norm'], item['CAD_analy'],
     item['RMB_news'], item['RMB_norm'], item['RMB_analy'],
     item['gold'], item['silver'], item['crude'], item['bond'],
     ] = classifier(item['title'], item['tags'], kinds_flag)
    item['importance'] = 0
    return item
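# The timestamp above depends on first rewriting the Chinese year/month/day
# markers (u'\u5e74', u'\u6708', u'\u65e5') into a '%Y-%m-%d %H:%M:%S' string
# before time.strptime is called. A minimal standalone sketch of that
# conversion; the sample string is hypothetical and stands in for the text
# scraped from the page.
import time

raw = u'2017\u5e743\u67085\u65e5 14:30'            # i.e. "2017<year>3<month>5<day> 14:30"
normalized = raw.replace(u'\u5e74', u'-').replace(u'\u6708', u'-').replace(u'\u65e5', '')
time_common = normalized.encode('utf-8') + ':00'   # '2017-3-5 14:30:00'
timestamp = int(time.mktime(time.strptime(time_common, '%Y-%m-%d %H:%M:%S')))
print timestamp                                     # seconds since the epoch, local time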
def process():
    total, results = scrape()
    # Read the item count recorded by the previous run.
    file_read = open('./TOTAL', 'rb')
    TOTAL_PREVIOUS = int(file_read.read().strip())
    file_read.close()
    if detect(total, TOTAL_PREVIOUS):
        # New items have appeared: persist the new total, then insert the news.
        file_write = open('./TOTAL', 'wb')
        file_write.write(str(total))
        file_write.close()
        conn = MySQLdb.connect(host="localhost", user="******", passwd="123456",
                               db="newsPool", charset='utf8', use_unicode=True)
        cursor = conn.cursor()
        for news in results:
            Time = str(int(time.time()))
            keys_list = [
                'id', 'time', 'title', 'content',
                'JPY_news', 'JPY_norm', 'JPY_analy', 'CHF_news', 'CHF_norm', 'CHF_analy',
                'USD_news', 'USD_norm', 'USD_analy', 'EUR_news', 'EUR_norm', 'EUR_analy',
                'GBP_news', 'GBP_norm', 'GBP_analy', 'AUD_news', 'AUD_norm', 'AUD_analy',
                'CAD_news', 'CAD_norm', 'CAD_analy', 'RMB_news', 'RMB_norm', 'RMB_analy',
                'gold', 'silver', 'crude', 'bond', 'importance'
            ]
            keys = ','.join(keys_list)
            # Quote and escape the string fields by hand before splicing them
            # into the INSERT statement.
            values_str_list_tmp = [news[u'id'], Time, news[u'title'], news[u'contentHtml']]
            value_str_list = ["\"" + x.encode('utf-8').replace("\"", "\\\"") + "\""
                              for x in values_str_list_tmp]
            # Classify the title into the per-column flags; importance defaults to 0.
            kinds_list_tmp = classifier(news[u'title'].encode('utf-8'), '', kinds_flag) + [0]
            kinds_list = [str(y) for y in kinds_list_tmp]
            values = ','.join(value_str_list + kinds_list)
            sql = "insert into wallstreet_realtime_news (%s) values(%s)" % (keys, values)
            try:
                cursor.execute(sql)
            except MySQLdb.IntegrityError:
                # Duplicate key: the item was already stored on an earlier run.
                pass
        cursor.close()
        conn.commit()
        conn.close()
        print "update has been appended"
    else:
        print "there is no update"
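# process() escapes double quotes by hand and splices the values straight into
# the INSERT statement. MySQLdb's cursor.execute() also accepts a parameterized
# form in which the driver fills %s placeholders itself, so no manual quoting
# is needed. A minimal sketch under the same table name, with a trimmed column
# list and placeholder credentials and sample row:
import time
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='******', passwd='123456',
                       db='newsPool', charset='utf8', use_unicode=True)
cursor = conn.cursor()

keys = 'id,time,title,content,importance'                     # trimmed column list
row = (u'12345', str(int(time.time())), u'sample "title"', u'<p>sample body</p>', 0)

# %%s survives the string formatting as a literal %s placeholder for the driver.
sql = 'insert into wallstreet_realtime_news (%s) values (%%s, %%s, %%s, %%s, %%s)' % keys
try:
    cursor.execute(sql, row)
except MySQLdb.IntegrityError:
    pass                                                      # row already stored
cursor.close()
conn.commit()
conn.close()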
def parse_page(self, response): self.log("Fetch FxNews page: %s" % response.url) item = NewscrawlItem() item["title"] = response.xpath("/html/body/div[7]/div[1]/div[2]/text()").extract()[0].encode("utf-8") item["URL"] = response.url time_common = ( response.xpath("/html/body/div[7]/div[1]/div[3]/span[1]/text()") .extract()[0] .replace(u"\u5e74", u"-") .replace(u"\u6708", u"-") .replace(u"\u65e5", "") .encode("utf-8") + ":00" ) item["time"] = int(time.mktime(time.strptime(time_common, "%Y-%m-%d %H:%M:%S"))) item["content"] = response.xpath("/html/body/div[7]/div[1]/div[4]/div[1]/p[1]").extract()[0].encode("utf-8") item["tags"] = ",".join( response.xpath("/html/body/div[7]/div[1]/div[4]/div[1]/div[2]/div[2]/ul/li/a/text()").extract() ).encode("utf-8") [ item["JPY_news"], item["JPY_norm"], item["JPY_analy"], item["CHF_news"], item["CHF_norm"], item["CHF_analy"], item["USD_news"], item["USD_norm"], item["USD_analy"], item["EUR_news"], item["EUR_norm"], item["EUR_analy"], item["GBP_news"], item["GBP_norm"], item["GBP_analy"], item["AUD_news"], item["AUD_norm"], item["AUD_analy"], item["CAD_news"], item["CAD_norm"], item["CAD_analy"], item["RMB_news"], item["RMB_norm"], item["RMB_analy"], item["gold"], item["silver"], item["crude"], item["bond"], ] = classifier(item["title"], item["tags"], kinds_flag) item["importance"] = 0 return item
def process(): total, results = scrape() file_read = open('./TOTAL', 'rb') TOTAL_PREVIOUS = int(file_read.read().strip()) file_read.close() if detect(total, TOTAL_PREVIOUS): file_write = open('./TOTAL', 'wb') file_write.write(str(total)) file_write.close() conn = MySQLdb.connect(host="localhost",user="******", passwd="123456", db="newsPool", charset='utf8', use_unicode=True) cursor = conn.cursor() for news in results: Time = str(int(time.time())) keys_list = ['id','time','title','content', 'JPY_news','JPY_norm','JPY_analy','CHF_news','CHF_norm','CHF_analy','USD_news','USD_norm','USD_analy', 'EUR_news','EUR_norm','EUR_analy','GBP_news','GBP_norm','GBP_analy','AUD_news','AUD_norm','AUD_analy', 'CAD_news','CAD_norm','CAD_analy','RMB_news','RMB_norm','RMB_analy','gold','silver','crude','bond','importance'] keys = ','.join(keys_list) values_str_list_tmp = [news[u'id'], Time, news[u'title'], news[u'contentHtml']] value_str_list = ["\""+x.encode('utf-8').replace("\"", "\\\"")+"\"" for x in values_str_list_tmp] kinds_list_tmp = classifier(news[u'title'].encode('utf-8'), '', kinds_flag) + [0] kinds_list = [str(y) for y in kinds_list_tmp] value_list = value_str_list + kinds_list values = ','.join(value_list) sql = "insert into wallstreet_realtime_news (%s) values(%s)" % (keys, values) try: cursor.execute(sql) except MySQLdb.IntegrityError: pass cursor.close() conn.commit() conn.close() print "update has been appended" else: print "there is no update"