Example #1
0
def generate_samples(output_fname):
    """Write labelled price samples from the "stock"/"au99" connection.

    Records whose ``date`` is greater than 20130101 are read sorted by
    ``date`` and numbered from 0 in that order. One line is written per
    record::

        <index> <label> <date> <price> <after_price>

    ``after_price`` is the highest price among the next ``AFTER_DAYS``
    records. ``label`` is 1 when that maximum exceeds the record's own price
    and 0 otherwise. Records that do not have ``AFTER_DAYS`` successors get
    label -1 and an ``after_price`` of 0.00.

    Args:
        output_fname: Path of the text file to create or overwrite.
    """
    conn = MongoSource().get_connection("stock", "au99")

    # The position in the sorted result is the sample index, so a plain list
    # is enough; no index-keyed dict (and no re-sorting of it) is needed.
    samples = [
        (item["date"], get_price(item))
        for item in conn.find({"date": {"$gt": 20130101}}).sort("date")
    ]
    total = len(samples)
    # Single-argument print gives the same text whether print is a statement
    # or a function; the two spaces match the original comma-separated output.
    print("Samples Count:  %s" % total)

    with open(output_fname, "w") as writer:
        for idx, (date, price) in enumerate(samples):
            if 0 <= idx + AFTER_DAYS < total:
                # Look only at the AFTER_DAYS records following this one.
                window = samples[idx + 1:idx + AFTER_DAYS + 1]
                after_price = max(later_price for _, later_price in window)
                label = 1 if after_price > price else 0
            else:
                # Not enough future records to decide a label.
                label, after_price = -1, 0.0
            writer.write(
                "%s %s %s %s %.2f\n" % (idx, label, date, price, after_price)
            )
Example #2
0
def run_seg(pid, core_cnt, output_fname):
    """Segment this worker's share of the finance pages and write them out.

    Pages are read from the "golden_pages" and then the "usa_pages"
    connection of "finance". Within each connection only pages whose position
    ``i`` satisfies ``i % core_cnt == pid`` are processed. Each processed
    page produces one tab-separated line: the page id, the article date with
    its last four characters removed, and the space-joined jieba tokens of
    the title followed by those of the content.

    Args:
        pid: Index of this worker; a value outside ``range(core_cnt)``
            matches no page.
        core_cnt: Number of workers the pages are divided among.
        output_fname: Path of the text file to create or overwrite.
    """
    import numbers  # local import: only needed for the integer-date check

    mg_conn = MongoSource()
    conn_lst = [
        mg_conn.get_connection('finance', 'golden_pages'),
        mg_conn.get_connection('finance', 'usa_pages'),
    ]

    cnt = 0
    with open(output_fname, 'w') as fp:
        for conn in conn_lst:
            for i, page in enumerate(conn.find()):
                if i % core_cnt != pid:
                    continue
                # Progress report roughly every 20000 pages handled here.
                if cnt % 20000 == pid:
                    print('Process %s dealed %s pages' % (pid, cnt))
                cnt += 1

                page_id = page['_id']
                raw_date = page['article_date']
                # isinstance is required: comparing type(value) with
                # type(int) tests against `type` itself and never matches an
                # integer, which would then fail on .encode().
                if isinstance(raw_date, numbers.Integral):
                    article_date = str(raw_date)
                else:
                    article_date = raw_date.encode('utf8')
                # Drop the trailing four characters of the date string.
                article_date = article_date[:-4]
                title = strip_words(page['title'])
                content = strip_words(page['content'])

                cut_words = list(jieba.cut(title))
                cut_words.extend(jieba.cut(content))

                output_str = '%s\t%s\t%s\n' % (
                    page_id.encode('utf8'),
                    article_date,
                    ' '.join(cut_words).encode('utf8'),
                )
                fp.write(output_str)
Example #3
0
 def __load_source_words(self):
     """Return the distinct UTF-8 encoded ``source`` values of golden pages.

     Every record returned by ``find()`` on the "finance"/"golden_pages"
     connection contributes its ``source`` field; a record without that
     field contributes the empty string.
     """
     pages_conn = MongoSource().get_connection("finance", "golden_pages")
     sources = set()
     for page in pages_conn.find():
         sources.add(page.get("source", "").encode("utf8"))
     return sources