Example #1
def remove_by_url():
    """ 删除208 sum 字段有影响的文本记录, 通过 handle_163_text函数产生的文件中的url """
    count = 0
    verbose_dicts = {}
    files_path = 'D:/temp/data/20151119/'
    coll_208 = Mongodb('192.168.250.208', 27017, 'news', 'topnews_analyse')

    # Map url -> _id for every 163.com record on the 208 server.
    query_cond = {'url': {'$regex': re.compile(r'163\.com')}}
    for _docs in coll_208.query(query_cond, {'url': 1}):
        verbose_dicts[_docs['url']] = _docs['_id']

    keys_set = set(verbose_dicts)  # membership-test set over the collected urls
    for _c, filename in enumerate(os.listdir(files_path), 1):
        with open(files_path + filename) as fp:
            url = fp.readline().strip()

            if url in keys_set:
                count += 1
                object_id = verbose_dicts[url]
                coll_208.update({'_id': object_id}, setdata={'stat': 0})
                print('Order: {}, count: {}, id: {}'.format(
                    _c, count, object_id))
                time.sleep(0.4)
                # break

    coll_208.disconnect()
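All of these examples call a shared `Mongodb` wrapper whose source is not included. Here is a minimal sketch of the interface the call sites assume, built on pymongo; the method names, the `setdata`/`kwargs` keywords, and the projection argument are inferred from usage, not taken from the original implementation:

import pymongo

class Mongodb(object):
    """Hypothetical stand-in for the wrapper used throughout these examples."""

    def __init__(self, host, port, db, collection):
        self._client = pymongo.MongoClient(host, port)
        self._coll = self._client[db][collection]

    def query(self, cond=None, kwargs=None):
        # cond is a filter document, kwargs a projection such as {'url': 1};
        # the returned cursor supports .sort() and .count() as used above.
        return self._coll.find(cond or {}, kwargs)

    def get(self, cond, projection=None):
        return self._coll.find_one(cond, projection)

    def insert(self, doc):
        return self._coll.insert_one(doc).inserted_id

    def update(self, cond, setdata=None):
        return self._coll.update_one(cond, {'$set': setdata or {}})

    def disconnect(self):
        self._client.close()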
Example #2
def get_base_stock_code(stock_code):
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    try:
        # Return the code of the earliest-created match (crt ascending).
        for d in coll.query({'tick': stock_code}).sort([('crt', 1)]):
            return d.get('code')
    except Exception as e:
        print 'failed to get code from base_stock. Error:', e
Example #3
def base_stock_code(self, stock_code):
    # Method variant of get_base_stock_code in Example #2 (extracted from its class).
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    try:
        for d in coll.query({'tick': stock_code}).sort([('crt', 1)]):
            return d.get('code')
    except Exception as e:
        print 'failed to get code from base_stock. Error:', e
Example #4
class FundMapping(object):
    def __init__(self):
        self._collection = Mongodb('192.168.250.200', 27017, 'fund', 'base_fund')
        self._url = 'http://fund.csrc.gov.cn/web/classification_show.organization'

    def get_fund_mapping(self):
        # sub_code, sub_name, main_code, main_name
        sub_to_main_mapping = []
        html = requests.get(self._url, timeout=30.0).content
        document = PyQuery(unicode(html, 'utf-8'))

        fund_blocks = [document.items('.aa'), document.items('.dd')]
        for each_block in fund_blocks:
            for class_tag in each_block:
                items_list = [item.text() for item in class_tag.items('td')]
                sub_to_main_mapping.append((items_list[1], items_list[3]))
        return dict(sub_to_main_mapping)

    def update_to_mongo(self):
        fund_mapping = self.get_fund_mapping()

        for item in self._collection.query(kwargs={'code': 1}).sort([('_id', 1)]):
            key = item['code'][:6]
            main_fund_code = fund_mapping.get(key)
            if main_fund_code is not None:
                regex = re.compile(main_fund_code)  # match codes containing the main fund code
                main_fund_sid = self._collection.get({'code': regex}, {'sid': 1})
                print 'main:', main_fund_sid
                _main = (main_fund_sid or {}).get('sid', '')
                self._collection.update({'_id': item['_id']}, setdata={'main': _main})
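A hypothetical driver for the class above: get_fund_mapping scrapes the CSRC classification page into a sub-code to main-code dict, and update_to_mongo writes the matching main fund's sid onto each base_fund document.

if __name__ == '__main__':
    mapper = FundMapping()
    fund_mapping = mapper.get_fund_mapping()
    print 'sub funds mapped:', len(fund_mapping)
    mapper.update_to_mongo()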
Example #5
def handle_163_text():
    """ 找出208 sum 字段有影响的文本记录 """
    # NetEase ad slogan ("NetEase Finance, the client that makes you money"), with and without a space.
    pattern = re.compile(r'网易财经会赚钱的客户端|网易财经 会赚钱的客户端')
    query_cond = {'url': {'$regex': re.compile(r'163\.com')}, 'ratio': 0}

    coll = Mongodb('192.168.0.223', 27017, 'news_crawl', 'hot_news')
    # print coll.query({'content': {'$regex': re.compile(r'%s' % text)}}).count()
    for k, doc in enumerate(coll.query(query_cond), 1):
        content = doc['content']

        if pattern.search(content) is not None:
            url = doc['url']
            title = doc['title'].split('重点推荐', 1)[0]  # drop the '重点推荐' (featured picks) suffix
            auth = doc['author']
            pub_date = doc['date']
            cat = doc.get('source')
            ratio = doc.get('ratio')

            # Keep only the text before the ad slogan and before the 'div,h1' style residue.
            new_content = pattern.split(content, 1)[0]
            new_content = new_content.split('div,h1', 1)[0].strip('#&# ')

            if cat and new_content:
                lines = [
                    url, title, auth,
                    str(pub_date), new_content, cat,
                    str(ratio)
                ]
                write(path + '20151119/', str(pub_date), lines)

            print 'id:', doc['_id'], k
    coll.disconnect()
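The `write` helper used above is not shown. Judging by remove_by_url in Example #1, which reads the url back from the first line of each file, it presumably looks something like this sketch; the filename scheme is an assumption.

import os

def write(directory, name, lines):
    # One record per file: url on the first line, then title, author, date,
    # cleaned content, category and ratio, matching the `lines` list above.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, name + '.txt'), 'w') as fp:
        fp.write('\n'.join(lines))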
Example #6
def get_rr_research_org_code(origin):
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'rr_research_org')
    try:
        for doc in coll.query({'abbr.szh': {'$regex': origin}}):
            if doc['abbr']['szh'] == origin or origin in doc['rs']:
                return doc['code']
    except Exception as e:
        print 'failed to get code by origin. Error:', e
Example #7
def rr_research_org_code(self, origin):
    # Method variant of get_rr_research_org_code in Example #6 (extracted from its class).
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'rr_research_org')
    try:
        for doc in coll.query({'abbr.szh': {'$regex': origin}}):
            if doc['abbr']['szh'] == origin or origin in doc['rs']:
                return doc['code']
    except Exception as e:
        print 'failed to get code by origin. Error:', e
Example #8
def csf_news():
    coll200 = Mongodb('192.168.250.208', 27017, 'news', 'new_keyword_dict')
    coll_csf = Mongodb('192.168.250.208', 27017, 'news', 'csf_dict')

    for k, doc in enumerate(coll200.query(), 1):
        word = doc['word']
        coll_csf.insert({'word': word, 'nat': 0, 'stat': 2, 'w': 1000})
        print k
    coll200.disconnect()
    coll_csf.disconnect()
Example #9
def statistics(months=None, weeks=None, days=None):
    # Build the YYYYMMDD lower bound for the 'dt' query below.
    if months:
        query_range = str(datetime.now() - timedelta(days=30)).replace('-', '')[:8]
    elif weeks:
        query_range = str(datetime.now() - timedelta(days=7)).replace('-', '')[:8]
    elif days:
        # Left unimplemented in the source; if only days is set, query_range
        # stays unbound and the query below raises a NameError.
        pass

    coll_from = Mongodb('192.168.250.208', 27017, 'news', 'hotnews_analyse')
    coll_to = Mongodb('192.168.250.208', 27017, 'news', 'statistics')

    all_ind = {
        _ind
        for _doc in coll_from.query(kwargs={'ind': 1})
        for _ind in _doc.get('ind', [])
    }

    for ind in all_ind:
        counter = Counter()
        query_cond = {
            'ind': {
                '$in': [ind]
            },
            'dt': {
                '$gte': query_range + '000000'
            }
        }

        for doc in coll_from.query(query_cond, {'kw': 1}):
            counter.update(doc.get('kw', []))

        data = {
            'ind': ind,
            'count': counter.most_common(100),
            'dt': query_range
        }
        coll_to.insert(data)
    coll_from.disconnect()
    coll_to.disconnect()
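Assumed invocation, treating months/weeks/days as mutually exclusive flags:

statistics(months=True)   # keyword counts per industry over the last 30 days
statistics(weeks=True)    # the same over the last 7 days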
Example #10
def update_item_sipo():
    coll = Mongodb('192.168.0.223', 27017, 'py_crawl', 'sipo_typ')
    headers_fmgb = [
        'tit', 'sqgbh(申请公布号)', 'sqgbr(申请公布日)', 'sqh(申请号)', 'sqr_day(申请日)',
        'sqr_person(申请人)', 'fmr(发明人)', 'dz(地址)', 'flh(分类号)', 'zy(摘要)',
        'zldljg(专利代理机构)', 'dlr(代理人)', 'yxq(优先权)', 'PCTjrgjjdr(PCT进入国家阶段日)',
        'PCTsqsj(PCT申请数据)', 'PCTgbsj(PCT公布数据)', 'gzwxcbr(更正文献出版日)',
        'swbc(生物保藏)', 'faysq(分案原申请)', 'bgyxq(本国优先权)'
    ]

    headers_syxx = [
        'tit', 'sqggh(授权公告号)', 'sqggr(授权公告日)', 'sqh(申请号)', 'sqr_day(申请日)',
        'zlqr(专利权人)', 'fmr(发明人)', 'dz(地址)', 'flh(分类号)', 'zy(摘要)',
        'zldljg(专利代理机构)', 'dlr(代理人)', 'yxq(优先权)', 'PCTjrgjjdr(PCT进入国家阶段日)',
        'PCTsqsj(PCT申请数据)', 'PCTgbsj(PCT公布数据)', 'gzwxcbr(更正文献出版日)',
        'faysq(分案原申请)', 'bgyxq(本国优先权)'
    ]
    open_book_fmgb = XlsxWriter(path + 'sipo_fmgb.xlsx', 'fmgb', headers_fmgb)
    keys_fmsq = [
        'tit', 'sqgbh', 'sqgbr', 'sqh', 'sqr_day', 'sqr_person', 'fmr', 'dz',
        'flh', 'zy', 'zldljg', 'dlr', 'yxq', 'PCTjrgjjdr', 'PCTsqsj',
        'PCTgbsj', 'gzwxcbr', 'swbc', 'faysq', 'bgyxq'
    ]
    for k, dct in enumerate(coll.query({'type': 'fmgb'}).sort([('_id', 1)])):
        open_book_fmgb.write([dct.get(key, '') for key in keys_fmsq])
        print 'fmgb:', k
    open_book_fmgb.close()

    open_book_syxx = XlsxWriter(path + 'sipo_syxx.xlsx', 'syxx', headers_syxx)
    keys_syxx = [
        'tit', 'sqggh', 'sqggr', 'sqh', 'sqr_day', 'zlqr', 'fmr', 'dz', 'flh',
        'zy', 'zldljg', 'dlr', 'yxq', 'PCTjrgjjdr', 'PCTsqsj', 'PCTgbsj',
        'gzwxcbr', 'faysq', 'bgyxq'
    ]
    for ks, dct in enumerate(coll.query({'type': 'syxx'}).sort([('_id', 1)])):
        open_book_syxx.write([dct.get(key, '') for key in keys_syxx])
        print 'syxx', ks
    open_book_syxx.close()
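The `XlsxWriter` helper likely wraps the xlsxwriter package. A minimal sketch of the interface the example assumes, in which the constructor writes the header row and write appends one data row at a time:

import xlsxwriter

class XlsxWriter(object):
    """Hypothetical stand-in inferred from the call sites above."""

    def __init__(self, filename, sheet_name, headers):
        self._book = xlsxwriter.Workbook(filename)
        self._sheet = self._book.add_worksheet(sheet_name)
        self._sheet.write_row(0, 0, headers)  # header row
        self._row = 1

    def write(self, values):
        self._sheet.write_row(self._row, 0, values)
        self._row += 1

    def close(self):
        self._book.close()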
Example #11
def filter_titles(db_path):
    month = str(date.today()).split('-')[1]
    year_mon = ''.join(str(date.today()).split('-')[:-1])
    # Last day of each month; February is hard-coded to 28, so leap years are ignored.
    days = {
        '01': '31',
        '02': '28',
        '03': '31',
        '04': '30',
        '05': '31',
        '06': '30',
        '07': '31',
        '08': '31',
        '09': '30',
        '10': '31',
        '11': '30',
        '12': '31'
    }
    coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all')
    condition = {
        'date': {
            '$gte': long(year_mon + '01000000'),
            '$lte': long(year_mon + days.get(month) + '235959')  # end of the month's last day
        }
    }

    ########################################################
    print 'loading `title` hashes from the db, please wait ...'
    ########################################################

    filedb = FileBsd('hash', db_path)
    for k, doc in enumerate(coll.query(condition)):
        try:
            filedb.put(md5(doc['title']))
        except Exception as e:
            print 'filter_titles error:', e
    coll.disconnect()
    filedb.close()

    #####################################################
    print 'title filter loading finished'
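The `md5` helper is not shown either; presumably a thin hashlib wrapper along these lines, where the utf-8 encoding step is an assumption:

import hashlib

def md5(text):
    # Hash the title to a fixed-length hex key for the Berkeley DB filter.
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()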