Example #1
class NewsCounter:
    def __init__(self,sleep_time = 60):
        self.counter_thread = threading.Thread(target = self.run,
                                               name = 'Counter')
        self.recent_data = None
        self.sleep_time = sleep_time
        
        self._init_counter()
        
    def _init_counter(self):
        self.engine_obj = DatabaseEngine('xiaoyi')
        self.engine = self.engine_obj.get_engine()
        self.session = self.engine_obj.get_session()
        
    def start(self):
        self.counter_thread.start()
        
    def join(self):
        self.counter_thread.join()
        
    def run(self):
        print 'News Counter starts to run...'
        counter_birth_time = time.time()
        while time.time() - counter_birth_time < SPIDER_LIVE_TIME:
            self.recent_data = pd.read_sql(SQL_GET_MACRO_NEWS_TOP,self.engine)
            self.recent_data_group = self.recent_data.groupby('unique_flag',
                                                         sort = False)
            self.news_repeat = self.recent_data_group.count()
            self.news_repeat = self.news_repeat.reset_index()
            self.news_repeat.rename(columns = {'news_source':'repeat_times'},
                                    inplace = True)
            self.news_repeat.drop_duplicates(subset = ['unique_flag'],
                                             inplace = True)
            # write
            self.news_repeat.to_sql('news_repeat',self.engine,index = False,
                                    if_exists = 'replace',
                                    dtype = {'unique_flag':VARCHAR(256),
                                             'repeat_times':INT})
            self.session.execute(SQL_MERGE_MACRO_NEWS_REPEAT_TIMES)
            self.session.commit()            
            time.sleep(self.sleep_time)
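
A minimal usage sketch, assuming the module already provides the names NewsCounter relies on (threading, time, pandas, DatabaseEngine, SPIDER_LIVE_TIME and the SQL/type constants):

counter = NewsCounter(sleep_time=60)  # recompute repeat counts roughly every 60 seconds
counter.start()                       # launch the 'Counter' thread
counter.join()                        # block until SPIDER_LIVE_TIME has elapsed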
Example #2
    SQL_GET_SW_INDEX_STOCK_COMPONENTS, SQL_GET_FUND_INFER_INDUSTRY,
    SQL_GET_FUND_BASIC, SQL_GET_FUND_SCORE, SQL_GET_FUND_NET_VALUE,
    SQL_GET_TRADE_CALENDAR, SQL_GET_FUND_INFER_INDUSTRY_CURRENT,
    SQL_GET_FUNDS_NET_VALUE, SQL_GET_FUND_MANAGER, SQL_GET_MANAGER_FUND,
    SQL_GET_STOCK_BASICS, SQL_GET_STOCK_DAILY_DATA1, SQL_GET_STOCK_DAILY_DATA2,
    SQL_GET_STOCK_MIN_CLOSE, SQL_GET_STOCK_FEATURES, SQL_GET_FUNDS_DAILY_RET,
    SQL_GET_ALL_FUNDS_DAILY_RET, SQL_GET_INDEX_DAILY, SQL_GET_RISK_FREE_RATE,
    SQL_GET_STOCK_ADJFACTOR)
from consts import SW_INDUSTRY_FIRST_CODE

engine_ld_obj = DatabaseEngine('ld')
engine_ld = engine_ld_obj.get_engine()

engine_xiaoyi_obj = DatabaseEngine('xiaoyi')
engine_xiaoyi = engine_xiaoyi_obj.get_engine()
session_xiaoyi = engine_xiaoyi_obj.get_session()

engine_wind_obj = DatabaseEngine('wind')
engine_wind = engine_wind_obj.get_engine()

engine_gb_obj = DatabaseEngine('geniusbar')
engine_gb = engine_gb_obj.get_engine()
session_gb = engine_gb_obj.get_session()

engine_xiaoyi40_obj = DatabaseEngine('xiaoyi40')
engine_xiaoyi40 = engine_xiaoyi40_obj.get_engine()
session_xiaoyi40 = engine_xiaoyi40_obj.get_session()

engine_guosen_obj = DatabaseEngine('guosen')
engine_guosen = engine_guosen_obj.get_engine()
session_guosen = engine_guosen_obj.get_session()
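
A hedged sketch of how one of these engines might be consumed; which engine each SQL constant targets is not shown in this snippet, so the pairing below is illustrative only:

import pandas as pd

# Hypothetical usage: load the trade calendar through one of the engines created above.
trade_calendar = pd.read_sql(SQL_GET_TRADE_CALENDAR, engine_wind)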
Example #3
class NewsSpider(object):
    def __init__(self,spider_name,source_name,title_url,lock,
                 update_seconds = 30):
        '''
        News spider factory (base class).
        Workflow for subclasses:
            1. Implement _parse_titles_response to build self.additions.
            2. Implement _parse_content_response to add the article content to self.additions.

        Parameters
        ----------
        spider_name
            spider name, ASCII
        source_name
            news source name, unicode
        title_url
            URL of the title (listing) page
        lock
            lock shared across spider threads
        update_seconds
            int, base update interval in seconds
        if_header
            int, 0 or 1, whether this is a headline spider
            (not a constructor argument; initialized to 0 and overridden by subclasses)
        '''
        self.spider_name = spider_name
        self.source_name = source_name
        self.title_url = title_url
        self.lock = lock
        self.update_seconds = update_seconds
        self.if_header = 0
        
    def init_spider(self,if_connect = True): 

        self.header_generator = header_generator
        
        if if_connect:
            self.engine_obj = DatabaseEngine('xiaoyi')
            self.engine = self.engine_obj.get_engine()
            self.session = self.engine_obj.get_session()
        
        self.additions = pd.DataFrame()
        self.spider_thread = threading.Thread(target = self.spider_data,
                                              name = self.spider_name)
        
        # marker of the latest update
        self.last_flag = None # latest update marker used for incremental checks; can be a timestamp etc.
        print 'Spider %s initialized successfully'%self.spider_name
        
    def spider_data(self):
        print 'Spider %s starts to run...'%self.spider_name
        spider_birth_time = time.time()
        while time.time() - spider_birth_time < SPIDER_LIVE_TIME:
            # fetch the title (listing) page
            try:
                status_code = self._get_titles_response()
                
                if status_code != 200:
                    continue
                
                # ----------- parse the title page and determine the incremental additions (self.additions) -----------
                # additions.columns = [title,href,news_time,content,news_source]
                # make sure additions has all of the columns above; fill missing values with None
                try:
                    self._parse_titles_response()
                except Exception as e:
                    print 'Spider %s failed to parse TITLE'%self.spider_name
                    print 'Error:'
                    print e
                    print '*' * 40
                    self._have_a_short_rest()
                    continue
                # ------------------------------------------------------------------
                
                
                if len(self.additions) == 0:
                    self._have_a_rest()
                    continue
                    
                for idx,row in self.additions.iterrows():
                    href = row['href']
                    status_code = self._get_content_response(href)
                    if status_code != 200:
                        self._have_a_short_rest()
                        continue
                    try:
                        self._parse_content_response(idx)
                    except Exception as e:
                        print 'Spider %s failed to parse content'%self.spider_name
                        print 'The href is %s'%href
                        print 'Error:'
                        print e
                        print '*' * 40
                    
                
                self._add_unique_flag()
                self._check_additions()
                
                if len(self.additions) == 0:
                    self._have_a_rest()
                    continue
                
                self._add_header_flag()            
    
                self._write_into_db()
                self._have_a_rest()
            except Exception as e:
                print '[%s]Error in %s'%(dt.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
                       self.spider_name)            
                print 'Error: %s'%e
                break
        

    def _get_titles_response(self):
        '''
        Fetch the title (listing) page response.
        '''
        self.titles_response = requests.get(self.title_url,headers = self.header_generator())
        return self.titles_response.status_code
    
    def _get_content_response(self,href):
        '''
        Fetch the response for a specific news article.
        '''
        try:
            self.content_response = requests.get(href,headers = self.header_generator())
            return self.content_response.status_code
        except Exception as e:
            print 'Spider {sp} failed to get content from {web} because: {error}'.format(
                    sp = self.spider_name,web = href,error = str(e))
            return 0
    
    def _parse_titles_response(self):
        '''
        Parse the title page response and build additions (the incremental updates),
        including title, link, time (optional) and source.
        '''
        pass
        
    def _parse_content_response(self,idx):
        '''
        Parse the content page response.
        '''
        pass
    
    def _check_additions(self):
        '''
        Validate self.additions before writing it to the database.
        '''
        # drop duplicate titles
        self.additions = self.additions.sort_values('news_time',
                                                    ascending = False)
        self.additions = self.additions.drop_duplicates(subset = 'unique_flag')
        
        # drop rows with empty content
        self.additions = self.additions.dropna(subset = ['content'])
        
        # drop records whose content is too short
        self.additions['content_len'] = self.additions['content'].apply(lambda x: len(x))
        self.additions = self.additions.loc[self.additions['content_len'] > 3]
        
        self.additions = self.additions.drop('content_len',axis = 1)
        
    def _add_unique_flag(self):
        '''
        Add a uniqueness flag to each news item so that duplicates can be removed.
        '''
        punctuation_list = [u'?',u'.',u',',u'!',u':',u'"',u' ',u'…',u'%',u'<',u'>',
                           u'?', u'。',u',',u'!',u':',u'”',u'“',u'、',u'《',u'\u3000',
                           u'》']
        def delete_punctuations(title):
            new_title = title
            for punc in punctuation_list:
                new_title = new_title.replace(punc,'')
            return new_title
        self.additions['unique_flag'] = self.additions['title'].apply(delete_punctuations)
        
    def _add_header_flag(self):
        '''
        Add a field indicating whether the item is a headline.
        '''
        self.additions.loc[:,'if_header'] = self.if_header
                
    def _write_into_db(self):
        '''
        Write self.additions into the database.
        '''

        self.lock.acquire() # prevent concurrent writes to the database
        # ---------------------
        # 1.
        self.additions.to_sql('news_macro_source',self.engine,
                              index = False,if_exists = 'replace',
                         dtype = {'title':VARCHAR(256),
                                  'news_time':VARCHAR(64),
                                  'href':VARCHAR(256),
                                  'content':NVARCHAR(),
                                  'update_datetime':DATETIME,
                                  'news_source':VARCHAR(32),
                                  'unique_flag':VARCHAR(256),
                                  'if_header':INT})
        try:
            if not self.if_header:
                self.session.execute(SQL_MERGE_MACRO_NEWS)
                self.session.commit()
            else:
                self.session.execute(SQL_MERGE_MACRO_NEWS_FOR_HEADER)
                self.session.commit()
                
            self.session.execute(SQL_MERGE_MACRO_NEWS_FOR_ALL)
            self.session.commit()
        except pymssql.OperationalError:
            print 'Error of %s: Writing into database failed'%self.spider_name
            
        # ---------------------
        self.lock.release() # release the lock
    
    def _have_a_rest(self):
        time.sleep(self.update_seconds + random.randint(0,60))
        
    def _have_a_short_rest(self):
        time.sleep(3)
        
    def start(self):
        self.spider_thread.start()
    
    def join(self):
        self.spider_thread.join()
    
    def test_spider(self,if_write = False):
        self.header_generator = header_generator
        self.last_flag = None
        self.additions = pd.DataFrame()
        status_code = self._get_titles_response()    
        print 'TITLE RESPONSE STATUS: %s'%(str(status_code))
        # ----------- parse the title page and determine the incremental additions (self.additions) -----------
        # additions.columns = [title,href,news_time,content,news_source]
        # make sure additions has all of the columns above; fill missing values with None
        self._parse_titles_response()
        # ------------------------------------------------------------------
            
        for idx,row in self.additions.iterrows():
            href = row['href']
            status_code = self._get_content_response(href)
            print 'CONTENT RESPONSE STATUS: %s'%(str(status_code))
            self._parse_content_response(idx)
            
        if if_write:
            self.engine_obj = DatabaseEngine('xiaoyi')
            self.engine = self.engine_obj.get_engine()
            self.session = self.engine_obj.get_session()
            self.additions.to_sql('news_macro_source',self.engine,
                                  index = False,if_exists = 'replace',
                             dtype = {'title':VARCHAR(256),
                                      'news_time':VARCHAR(64),
                                      'href':VARCHAR(256),
                                      'content':NVARCHAR(),
                                      'update_datetime':DATETIME,
                                      'news_source':VARCHAR(32)})
            self.session.execute(SQL_MERGE_MACRO_NEWS)
            self.session.commit()
        self.last_flag = None
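
A hedged sketch of the subclassing workflow described in the constructor docstring; the CSS selectors and parsing logic below are purely illustrative, not taken from any real site:

import datetime as dt
import pandas as pd
from bs4 import BeautifulSoup

class ExampleNewsSpider(NewsSpider):
    '''Illustrative subclass; selectors are hypothetical.'''
    def _parse_titles_response(self):
        soup = BeautifulSoup(self.titles_response.content, 'html.parser')
        rows = []
        for a in soup.find_all('a', class_='news-title'):  # hypothetical selector
            rows.append({'title': a.text, 'href': a.attrs['href'],
                         'news_time': None, 'content': None,
                         'news_source': self.source_name})
        self.additions = pd.DataFrame(rows,
                                      columns=['title', 'href', 'news_time',
                                               'content', 'news_source'])
        # matches the dtype mapping used in _write_into_db
        self.additions['update_datetime'] = dt.datetime.today()

    def _parse_content_response(self, idx):
        soup = BeautifulSoup(self.content_response.content, 'html.parser')
        body = soup.find('div', class_='article-body')  # hypothetical selector
        self.additions.loc[idx, 'content'] = body.text if body is not None else None

A subclass built this way would then be driven with init_spider(), start() and join(), mirroring the lifecycle of the base class.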
Example #4
punctuation_list = ['\xa0'.decode('latin1'),u'?',u'.',u',',u'!',u':',u'"',u' ',u'…',u'%',u'<',u'>',
                           u'?', u'。',u',',u'!',u':',u'”',u'“',u'、',u'《',
                           u'》']

def delete_punctuations(title):
    new_title = title
    for punc in punctuation_list:
        new_title = new_title.replace(punc,'')
    return new_title
data['unique_flag'] = data['title'].apply(delete_punctuations)
data = data[['title','unique_flag']]


engine_obj = DatabaseEngine('xiaoyi')
engine = engine_obj.get_engine()
session = engine_obj.get_session()
data.to_sql('news_macro_source1',engine,
                      index = False,if_exists = 'replace',
                 dtype = {'title':VARCHAR(256),
                          'unique_flag':VARCHAR(256)})
    
    
tmp_sql = '''
MERGE INTO news_macro as T
USING news_macro_source1 as S
ON T.title = S.title
WHEN MATCHED
THEN UPDATE SET
unique_flag = S.unique_flag;
'''
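
Presumably the snippet then applies the MERGE through the session opened above; a minimal sketch, mirroring the session.execute/commit pattern used by the spiders:

# Hypothetical follow-up: run the MERGE built above and commit.
session.execute(tmp_sql)
session.commit()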
Example #5
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 13 10:23:56 2019

@author: ldh
"""

# api.py

from __future__ import print_function
from database_engine import DatabaseEngine

engine_guosen_obj = DatabaseEngine('guosen')
engine_guosen = engine_guosen_obj.get_engine()
session_guosen = engine_guosen_obj.get_session()


def index_daily(index_code, start_date, end_date):
    '''
    Daily index quotes.

    Parameters
    ----------
    index_code
        index code
    start_date
        start date
    end_date
        end date
        
    Returns
Example #6
def collect_data_from_ths():
    engine_obj = DatabaseEngine('xiaoyi')
    engine = engine_obj.get_engine()
    session = engine_obj.get_session()
    url_ths = 'http://news.10jqka.com.cn/cjzx_list/index_1.shtml'
    last_news_time = None

    print 'THS collector starts RUNNING...'
    while True:

        # fetch the title page
        response = requests.get(url_ths, headers=header_generator())

        if response.status_code == 200:
            pass
        else:
            time.sleep(retry_seconds + random.randint(0, 30))
            continue

        # parse the title page
        soup = BeautifulSoup(response.content, 'html.parser')

        content = soup.find_all(class_='content-1200')
        content = content[1]
        content = content.find_all(class_='module-l fl')
        content = content[0]
        content = content.find_all(class_='list-con')
        content = content[0]
        content_titles = content.find_all(class_='arc-title')

        titles = []
        news_time = []
        hrefs = []

        for each in content_titles:
            tmp = each.find('a')
            titles.append(tmp.attrs['title'])
            news_time.append(each.find('span').text)
            hrefs.append(tmp.attrs['href'])

        news_time_adj = map(ths_news_time_convertor, news_time)

        # check whether there are new items to write
        news_df = pd.DataFrame([titles, news_time_adj, hrefs],
                               index=['title', 'news_time', 'href']).T
        news_df['news_source'] = u'同花顺'
        if last_news_time is None:
            last_news_time = max(news_time_adj)
            additions = news_df
        else:
            latest_news_time = max(news_time_adj)
            if latest_news_time <= last_news_time:
                time.sleep(update_seconds + random.randint(0, 30))
                continue  # nothing new
            else:
                # determine the incremental additions
                additions = news_df.loc[news_df['news_time'] > last_news_time]
                additions['content'] = None
                last_news_time = latest_news_time

        # crawl the new items
        for idx, row in additions.iterrows():
            link = row['href']
            link_response = requests.get(link, headers=header_generator())
            if link_response.status_code == 404:
                continue
            tmp_soup = BeautifulSoup(link_response.content, 'html.parser')
            main_text = tmp_soup.find_all(class_='main-text atc-content')
            main_text = main_text[0]
            main_text_list = main_text.find_all('p')
            main_text_content = [each.text for each in main_text_list][1:-1]
            tmp_content = '\n'.join(main_text_content)
            last_stop_point = tmp_content.rfind(u'。')
            filtered_content = tmp_content[:last_stop_point + 1]
            additions.loc[idx, 'content'] = filtered_content

        # write the crawled increments into the database
        additions['update_datetime'] = dt.datetime.today()
        lock.acquire()  # prevent concurrent writes to the database
        additions.to_sql('news_macro_source',
                         engine,
                         index=False,
                         if_exists='replace',
                         dtype={
                             'title': VARCHAR(256),
                             'news_time': VARCHAR(64),
                             'href': VARCHAR(256),
                             'content': TEXT,
                             'update_datetime': DATETIME,
                             'news_source': VARCHAR(32)
                         })
        session.execute(SQL_MERGE_MACRO_NEWS)
        session.commit()
        lock.release()

        # sleep
        time.sleep(update_seconds + random.randint(0, 30))
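
collect_data_from_ths relies on module-level names that are not part of this snippet (lock, update_seconds, retry_seconds, header_generator, ths_news_time_convertor, SQL_MERGE_MACRO_NEWS). A minimal sketch of plausible values for the simple ones, purely as an assumption:

import threading

# Hypothetical module-level configuration assumed by the collectors.
lock = threading.Lock()  # shared so only one collector writes news_macro_source at a time
update_seconds = 30      # base polling interval, jittered with random.randint(0, 30)
retry_seconds = 60       # back-off after a non-200 title-page response
# header_generator, ths_news_time_convertor and SQL_MERGE_MACRO_NEWS are assumed
# to come from the project's helper modules.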
Example #7
def collect_data_from_jrj():
    '''
    Crawl news data from JRJ (finance.jrj.com.cn).
    '''
    engine_obj = DatabaseEngine('xiaoyi')
    engine = engine_obj.get_engine()
    session = engine_obj.get_session()

    url_jrj = 'http://finance.jrj.com.cn/list/guoneicj.shtml'
    last_news_title = None
    print 'JRJ collector starts RUNNING...'

    while True:

        # fetch the title page
        response = requests.get(url_jrj, headers=header_generator())
        if response.status_code == 200:
            pass
        else:
            time.sleep(retry_seconds + random.randint(0, 30))
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        content = soup.find('ul', class_='list2')

        # parse the title page
        titles = []
        hrefs = []

        content_list = content.find_all('li')

        for tag in content_list:
            if 'class' in tag.attrs.keys():
                continue
            tag1 = tag.find('a')

            titles.append(tag1.text)
            hrefs.append(tag1.attrs['href'])

        news_df = pd.DataFrame([titles, hrefs], index=['title', 'href']).T
        news_df['news_source'] = u'金融界'

        # check for new items (based on the title)
        if last_news_title is None:
            additions = news_df
        else:
            title_list = news_df['title'].tolist()
            try:
                last_title_idx = title_list.index(last_news_title)
                if last_title_idx == 0:
                    time.sleep(update_seconds + random.randint(0, 30))
                    continue
                additions = news_df.loc[:last_title_idx]
            except ValueError:  # last title no longer on the page
                additions = news_df
        last_news_title = additions['title'].iloc[0]
        additions['content'] = None
        additions['news_time'] = None

        # crawl the new items
        for idx, row in additions.iterrows():
            link = row['href']
            link_response = requests.get(link, headers=header_generator())
            if link_response.status_code == 404:
                continue
            tmp_soup = BeautifulSoup(link_response.content, 'html.parser')

            # extract the text
            main_text = tmp_soup.find('div', class_='texttit_m1')
            filtered_content = main_text.text
            last_stop_point = filtered_content.rfind(u'。')
            filtered_content = filtered_content[:last_stop_point + 1]
            additions.loc[idx, 'content'] = filtered_content

            # extract the publication time
            news_time = tmp_soup.find(
                text=re.compile(r'\d{4}-\d{2}-\d{2}'))[2:]
            additions.loc[idx, 'news_time'] = news_time[:-3]

        # write the crawled increments into the database
        additions['update_datetime'] = dt.datetime.today()

        lock.acquire()  # prevent concurrent writes to the database
        additions.to_sql('news_macro_source',
                         engine,
                         index=False,
                         if_exists='replace',
                         dtype={
                             'title': VARCHAR(256),
                             'news_time': VARCHAR(64),
                             'href': VARCHAR(256),
                             'content': TEXT,
                             'update_datetime': DATETIME,
                             'news_source': VARCHAR(32)
                         })
        session.execute(SQL_MERGE_MACRO_NEWS)
        session.commit()
        lock.release()
        # sleep
        time.sleep(update_seconds + random.randint(0, 30))
Example #8
def collect_data_from_cnfol():
    '''
    Crawl news data from CNFOL (news.cnfol.com).
    '''
    engine_obj = DatabaseEngine('xiaoyi')
    engine = engine_obj.get_engine()
    session = engine_obj.get_session()

    url_cnfol = 'http://news.cnfol.com/guoneicaijing/'
    last_news_time = None
    print 'CNFOL collector starts RUNNING...'

    while True:

        # fetch the title page
        response = requests.get(url_cnfol, headers=header_generator())
        if response.status_code == 200:
            pass
        else:
            time.sleep(retry_seconds + random.randint(0, 30))
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        content = soup.find('ul', class_='TList')

        # parse the title page
        titles = []
        hrefs = []
        news_time = []

        content_list = content.find_all('li')

        for tag in content_list:
            tag1 = tag.find('a')

            titles.append(tag1.attrs['title'])
            hrefs.append(tag1.attrs['href'])
            news_time.append(cnfol_news_time_convertor(tag.find('span').text))

        news_df = pd.DataFrame([titles, news_time, hrefs],
                               index=['title', 'news_time', 'href']).T
        news_df['news_source'] = u'中金在线'

        # check whether there are new items to write
        if last_news_time is None:
            last_news_time = max(news_time)
            additions = news_df
        else:
            latest_news_time = max(news_time)
            if latest_news_time <= last_news_time:
                time.sleep(update_seconds + random.randint(0, 30))
                continue  # nothing new
            else:
                # determine the incremental additions
                additions = news_df.loc[news_df['news_time'] > last_news_time]
                additions['content'] = None
                last_news_time = latest_news_time

        # crawl the new items
        for idx, row in additions.iterrows():
            link = row['href']
            link_response = requests.get(link, headers=header_generator())
            if link_response.status_code == 404:
                continue
            tmp_soup = BeautifulSoup(link_response.content, 'html.parser')
            main_text = tmp_soup.find('div', class_='Article')
            filtered_content = main_text.text
            additions.loc[idx, 'content'] = filtered_content

        # write the crawled increments into the database
        additions['update_datetime'] = dt.datetime.today()
        lock.acquire()  # prevent concurrent writes to the database
        additions.to_sql('news_macro_source',
                         engine,
                         index=False,
                         if_exists='replace',
                         dtype={
                             'title': VARCHAR(256),
                             'news_time': VARCHAR(64),
                             'href': VARCHAR(256),
                             'content': TEXT,
                             'update_datetime': DATETIME,
                             'news_source': VARCHAR(32)
                         })
        session.execute(SQL_MERGE_MACRO_NEWS)
        session.commit()
        lock.release()

        # sleep
        time.sleep(update_seconds + random.randint(0, 30))
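
Since each collector blocks in an infinite loop, they would presumably each run in their own thread, sharing the lock used around the database writes; a hedged launcher sketch:

import threading

# Hypothetical launcher: run the three collectors concurrently.
collectors = [collect_data_from_ths, collect_data_from_jrj, collect_data_from_cnfol]
threads = [threading.Thread(target=fn, name=fn.__name__) for fn in collectors]
for t in threads:
    t.start()
for t in threads:
    t.join()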