class MongoDBTempInterface(TempArticleInterface):
    """Page-by-page reader over the temporary article collection.

    The constructor obtains three collection handles from a single
    ``MongoDBConnection`` (temp, positive and negative);
    ``extractPagination`` then serves the temp collection one page at a time.
    """

    def __init__(self, page_list=100):
        """Open the collection handles and reset the paging state.

        Args:
            page_list: Number of documents per page; used as both the page
                stride and the ``limit`` in ``extractPagination``.
        """
        self.obj = MongoDBConnection()
        self.temp_collection = self.obj.dbConnectionTemp()
        # NOTE(review): "postive" is a typo, but the attribute name is kept
        # because code outside this block may read it.
        self.collection_postive = self.obj.dbConnection()
        self.collection_negative = self.obj.dbConnectionNegative()
        self.page_list = page_list
        # Becomes True once a page shorter than ``page_list`` has been served.
        self.stop = False
        self.dict_time = {}
        self.loop = 2
        self.filename = db_settings.MONGO_LOG_FILE + str(datetime.date.today())

    def extractPagination(self, pagenumber):
        """Return a cursor over one page of the temp collection.

        Args:
            pagenumber: Zero-based page index; ``pagenumber * page_list``
                documents are skipped.

        Returns:
            A cursor limited to ``page_list`` documents, or ``None`` when a
            previous call already served a short (last) page or when the
            query failed (the error is passed to ``self.writeerrorlog``).

        The cursor is created with ``no_cursor_timeout=True``, so the caller
        must exhaust or close it; the server will not expire it on its own.
        """
        try:
            if self.stop:
                return None

            offset = pagenumber * self.page_list
            data = (
                self.temp_collection
                .find({}, no_cursor_timeout=True)
                .hint([("article_publish_time", pymongo.ASCENDING)])
                .skip(offset)
                .limit(self.page_list)
            )

            # Counting with skip/limit applied gives the size of THIS page;
            # a short page means there is nothing left after it.
            if data.count(with_limit_and_skip=True) < self.page_list:
                self.stop = True
            return data
        except Exception as error:
            # ``Exception`` rather than ``BaseException`` so Ctrl-C and
            # interpreter shutdown are not silently turned into ``None``.
            self.writeerrorlog(error)
            return None
# Example #2
# 0
def syns_data(startTimeStr):
    """Sync articles from MongoDB into Elasticsearch, one publish day at a time.

    Walks every day from ``startTimeStr`` up to the current time. For each
    day it fetches the documents whose ``article_publish_time`` equals that
    day's ``YYYY-MM-DD`` string and PUTs each one to the local Elasticsearch
    URL, keyed by the document's MongoDB ``_id``.

    Args:
        startTimeStr: First day to sync, formatted ``YYYY-MM-DD``.

    Returns:
        None. Ordinary exceptions are logged with ``logging.error`` and end
        the sync early; they are not re-raised.
    """
    es_base_url = "http://localhost:9200/news_spider_db/articles_testN/"
    try:
        mongoDB = MongoDBConnection("articles_testN")
        logging.info("[sysn_data] startTimeStr:" + startTimeStr)
        endTime = datetime.datetime.now()
        endTimeStr = endTime.strftime('%Y-%m-%d')
        curTime = datetime.datetime.strptime(startTimeStr, "%Y-%m-%d")

        print("startTime: %s" % curTime)
        print("endTime %s" % endTimeStr)
        es = ES()
        while curTime <= endTime:
            curTimeStr = curTime.strftime('%Y-%m-%d')
            coll = mongoDB.dbConnection()
            try:
                logging.info("[sysn_data] ** time: " + curTimeStr)
                print("******  time: %s" % curTimeStr)
                data_list = coll.find({'article_publish_time': curTimeStr},
                                      no_cursor_timeout=True)
                try:
                    for one_data in data_list:
                        # ``_id`` goes into the URL, not into the ES body.
                        doc_id = str(one_data.pop("_id"))
                        print(doc_id)
                        result = es.put(es_base_url + doc_id, one_data)
                        print(result)
                finally:
                    # A no_cursor_timeout cursor is never reaped by the
                    # server, so release it even if a PUT fails mid-way.
                    data_list.close()
            finally:
                # Close only after the cursor has been consumed; the cursor
                # is lazy and still needs the connection while iterating.
                mongoDB.dbClose()
            curTime = curTime + datetime.timedelta(days=1)
    except Exception as e:
        logging.error(e)
# Example #3
# 0
    def __init__(self, re_spider_log, collection_name=""):
        """Create the MongoDB connection helper and initialise the ``operation`` base.

        Args:
            re_spider_log: Passed straight through to ``operation.__init__``.
            collection_name: Accepted but not used anywhere in this constructor.
        """

        self.obj = MongoDBConnection()

        operation.__init__(self, re_spider_log)

        # Starts cleared; nothing in this constructor sets it.
        self.stop = False
class DataUtil(object):
    """Builds labelled training data from article documents in MongoDB."""

    # Index value that the original author's comment calls "unknow"; it is
    # stripped from every processed sequence.
    UNKNOWN_INDEX = 511001

    def __init__(self, label):
        """Create the connection helper.

        Args:
            label: Passed unchanged to ``MongoDBConnection``.
        """
        self.mongoDB = MongoDBConnection(label)

    def filter_data(self, data_list, label):
        """Reduce raw documents to cleaned, labelled training records.

        Args:
            data_list: Iterable of mappings that each contain
                ``processed_content`` and ``processed_title`` sequences.
            label: Value stored under ``"label"`` in every output record.

        Returns:
            A list of dicts with exactly the keys ``processed_content``,
            ``processed_title`` (both with ``UNKNOWN_INDEX`` removed) and
            ``label``.
        """
        return [
            {
                "processed_content": self.del_unknowIndex(
                    data['processed_content']),
                "processed_title": self.del_unknowIndex(
                    data['processed_title']),
                "label": label,
            }
            for data in data_list
        ]

    def del_unknowIndex(self, enum):
        """Return ``enum`` as a list with every ``UNKNOWN_INDEX`` entry removed."""
        return [x for x in enum if x != self.UNKNOWN_INDEX]

    def get_data(self):
        """Fetch all positive examples and an equal number of negative ones.

        Only documents with ``article_label_state >= 1`` are considered.

        Returns:
            ``{'pos': [...], 'neg': [...]}`` on success; ``[]`` when there
            are no positive examples; ``{}`` when an error occurred (the
            error is logged). All failure values are falsy.
        """
        try:
            logging.info("get_data()")
            coll = self.mongoDB.dbConnection()
            try:
                positive_data = coll.find({
                    'article_label': 1,
                    'article_label_state': {
                        '$gte': 1
                    }
                })
                pos_data = self.filter_data(positive_data, 1)
                pos_length = len(pos_data)
                logging.info("pos_length=[" + str(pos_length) + ']')
                print("positive data num: %s" % pos_length)
                if pos_length == 0:
                    return []

                # Cap the negatives at the number of positives so the two
                # classes are balanced.
                negative_data = coll.find({
                    'article_label': 0,
                    'article_label_state': {
                        '$gte': 1
                    }
                }).limit(pos_length)
                neg_data = self.filter_data(negative_data, 0)
            finally:
                # Runs on the early return and on errors too, so the
                # connection is always released.
                self.mongoDB.dbClose()
            return {'pos': pos_data, 'neg': neg_data}
        except Exception as e:
            logging.error(e)
            return {}
class ArticleDAO(object):
    """Read access to article documents for the paged search view."""

    # Fields removed from every article before it is returned.
    _HIDDEN_FIELDS = (
        'article_discuss',
        'processed_content',
        'processed_title',
        'article_attend_number',
        'article_discuss_number',
    )

    def __init__(self, label):
        """Create the connection helper.

        Args:
            label: Passed unchanged to ``MongoDBConnection``.
        """
        self.mongoDB = MongoDBConnection(label)

    def article_search_list(self, search_condition):
        """Return one page of articles matching ``search_condition``.

        Args:
            search_condition: Mapping with the required keys
                ``article_source`` (list; empty means all sources),
                ``article_db`` (``1`` restricts to ``is_repeate == 0``),
                ``article_label_state``, ``startTime``, ``endTime``,
                ``current_page`` (zero-based) and ``page_size``. An optional
                ``update_student`` key adds an equality filter.

        Returns:
            A list of article dicts with ``_id`` stringified and copied to
            ``id``, the hidden fields removed and ``article_content``
            stripped of HTML and cut to 240 characters. ``None`` on error
            (the error is logged).
        """
        try:
            article_source = search_condition['article_source']
            article_db = search_condition['article_db']
            article_label_state = search_condition['article_label_state']
            startTime = search_condition['startTime']
            endTime = search_condition['endTime']
            current_page = search_condition['current_page']
            page_size = search_condition['page_size']
            coll = self.mongoDB.dbConnection()

            try:
                condition = {
                    "article_label_state": article_label_state,
                    'article_publish_time': {
                        '$gte': startTime,
                        '$lte': endTime
                    }
                }
                if article_source:
                    # An empty list means "news from every site".
                    condition['article_source'] = {'$in': article_source}

                if article_db == 1:
                    condition['is_repeate'] = 0

                if "update_student" in search_condition:
                    condition['update_student'] = search_condition[
                        'update_student']

                result = coll.find(condition).skip(
                    page_size * current_page).limit(page_size)

                article_list = []
                for article in result:
                    article['_id'] = str(article['_id'])
                    article['id'] = article['_id']
                    # pop() with a default: the original popped
                    # 'article_discuss' twice, so the second pop raised
                    # KeyError and every non-empty page came back as None.
                    for field in self._HIDDEN_FIELDS:
                        article.pop(field, None)
                    plain_text = self._remove_htmlTags(
                        article['article_content'])
                    article['article_content'] = plain_text[0:240]
                    article_list.append(article)
            finally:
                # Close only after the lazy cursor has been fully read.
                self.mongoDB.dbClose()
            return article_list
        except Exception as e:
            logging.error(e)
            return None
    def __init__(self,page_list = 100):
        """Open the temp/positive/negative collection handles and reset paging state.

        Args:
            page_list: Stored on ``self.page_list``; defaults to 100.
        """

        self.obj = MongoDBConnection()
        # All three handles come from the same MongoDBConnection instance.
        self.temp_collection = self.obj.dbConnectionTemp()
        self.collection_postive = self.obj.dbConnection()
        self.collection_negative = self.obj.dbConnectionNegative()
        #self.last_objectid = None
        self.page_list = page_list
        self.stop = False
        self.dict_time = {}
        self.loop = 2
        # Log file path: configured prefix followed by today's date.
        self.filename = db_settings.MONGO_LOG_FILE + str(datetime.date.today())
# Example #7
# 0
class TagsDAO(object):
    """Write access to tag documents (connection created with ``'articles_tags'``)."""

    def __init__(self):
        """Create the connection helper."""
        self.mongoDB = MongoDBConnection('articles_tags')

    def addTag(self, tag):
        """Store ``tag`` unless ``self.find(tag)`` reports it already exists.

        Args:
            tag: Value stored under the ``"tag"`` key of the new document.

        Returns:
            True when the tag already existed or was inserted; False when an
            error occurred (the error is logged).
        """
        try:
            if self.find(tag):
                return True
            information = {"tag": tag}
            coll = self.mongoDB.dbConnection()
            try:
                # NOTE(review): Collection.insert is deprecated since
                # PyMongo 3.0; switch to insert_one once the deployed driver
                # version is confirmed.
                coll.insert(information)
            finally:
                # Release the connection even when the insert fails.
                self.mongoDB.dbClose()
            return True
        except Exception as e:
            logging.error(e)
            return False
class SystemSetting:
    """Access to key/value system settings (connection created with ``"system_setting"``)."""

    def __init__(self):
        """Create the connection helper."""
        self.mongoDB = MongoDBConnection("system_setting")

    def lock_model_training(self):
        """Set the ``classifier_model_training_lock`` setting to True.

        The lock is held while the model trains so that training is not
        triggered several times at once, which would drive memory and CPU
        usage too high.

        Returns:
            True when the setting was written; False when an error occurred
            (the error is logged). The return value does not say whether the
            lock was already held.
        """
        try:
            coll = self.mongoDB.dbConnection()
            try:
                # One atomic upsert replaces the find-then-insert/update
                # sequence, which could race with a concurrent caller. When
                # no document matches, the inserted document is
                # {"key": ..., "value": True}, same as before.
                coll.update(
                    {"key": "classifier_model_training_lock"},
                    {"$set": {"value": True}},
                    upsert=True)
            finally:
                self.mongoDB.dbClose()
            return True
        except Exception as e:
            logging.error(e)
            return False
# Example #9
# 0
class news_operation(operation):
    def __init__(self, re_spider_log, collection_name=""):
        """Create the MongoDB connection helper and initialise the ``operation`` base.

        Args:
            re_spider_log: Passed straight through to ``operation.__init__``.
            collection_name: Accepted but not used anywhere in this constructor.
        """

        self.obj = MongoDBConnection()

        operation.__init__(self, re_spider_log)

        # search_news_page sets this once it has served a short page.
        self.stop = False

    def reset_para(self):
        """Clear the stop flag so that ``search_news_page`` serves pages again."""

        self.stop = False

    def search_news_page(
        self,
        condition,
        page_number=10,
        page=0,
    ):

        try:
            if self.stop:
                return None
            self.collection_negative = self.obj.dbConnectionNegative()
            data = self.collection_negative.find(
                condition, {
                    "_id": 1,
                    "article_url": 1
                },
                no_cursor_timeout=True).hint([
                    ("article_publish_time", pymongo.ASCENDING)
                ]).skip(page_number * page).limit(page_number)

            if data.count(True) < page_number:
                self.stop = True
            return data
        except BaseException, error:
            self.write_log(error)
            return None
        finally:
 def __init__(self, label):
     """Build a ``MongoDBConnection`` from ``label`` and keep it on ``self.mongoDB``."""
     self.mongoDB = MongoDBConnection(label)
# Example #11
# 0
 def __init__(self):
     """Build a ``MongoDBConnection`` with ``'articles_tags'`` and keep it on ``self.mongoDB``."""
     self.mongoDB = MongoDBConnection('articles_tags')
     # No-op; the assignment above is the whole constructor.
     pass
 def __init__(self):
     """Build a ``MongoDBConnection`` with ``'user'`` and keep it on ``self.mongoDB``."""
     self.mongoDB = MongoDBConnection('user')
class UserDAO(object):
    """CRUD helpers for user documents (connection created with ``'user'``).

    Every method opens a collection handle through ``self.mongoDB`` and
    releases it with ``dbClose()`` when done, including when the operation
    raises.

    NOTE(review): passwords are stored and compared exactly as supplied on
    the ``user`` object; confirm they are hashed upstream.
    """

    def __init__(self):
        """Create the connection helper."""
        self.mongoDB = MongoDBConnection('user')

    def addUser(self, user):
        """Insert a new user document.

        Args:
            user: Object exposing ``username``, ``password`` and ``role``.

        Returns:
            True when the insert succeeded; False when it failed (the error
            is logged instead of being dropped silently).
        """
        # Local import: this block cannot see whether the module already
        # imports logging at the top.
        import logging
        try:
            information = {
                "username": user.username,
                "password": user.password,
                "role": user.role
            }
            coll = self.mongoDB.dbConnection()
            try:
                coll.insert(information)
            finally:
                self.mongoDB.dbClose()
            return True
        except Exception as e:
            logging.error(e)
            return False

    def isexist(self, user):
        """Return True when a document with ``user.username`` exists."""
        coll = self.mongoDB.dbConnection()
        try:
            query_result = coll.find_one({"username": user.username})
        finally:
            self.mongoDB.dbClose()
        return bool(query_result)

    def login(self, user):
        """Look up the user by username and password.

        Returns:
            The matching document with ``_id`` converted to ``str``, or
            ``None`` when no document matches.
        """
        coll = self.mongoDB.dbConnection()
        try:
            query_result = coll.find_one({
                "username": user.username,
                'password': user.password
            })
        finally:
            self.mongoDB.dbClose()
        # The original printed the whole record here, which wrote the stored
        # password to stdout; that debug print has been removed.
        if query_result:
            query_result['_id'] = str(query_result['_id'])
            return query_result
        return None

    def userlist(self):
        """Return every user document, with ``_id`` stringified and copied to ``id``."""
        coll = self.mongoDB.dbConnection()
        userList = []
        try:
            # Read the lazy cursor to the end BEFORE closing the connection.
            for item in coll.find():
                item['_id'] = str(item['_id'])
                item['id'] = item['_id']
                userList.append(item)
        finally:
            self.mongoDB.dbClose()
        return userList

    def deluser(self, userId):
        """Delete the user whose ``_id`` is ``ObjectId(userId)``.

        Returns:
            Always True; the result does not say whether a document was
            actually removed. An invalid ``userId`` propagates the exception
            raised by ``ObjectId``.
        """
        coll = self.mongoDB.dbConnection()
        try:
            coll.remove({"_id": ObjectId(userId)})
        finally:
            self.mongoDB.dbClose()
        return True
 def __init__(self):
     """Build a ``MongoDBConnection`` with ``"system_setting"`` and keep it on ``self.mongoDB``."""
     self.mongoDB = MongoDBConnection("system_setting")
class ArticleDAO(object):
    """Read access to article documents for the filterable search view."""

    def __init__(self, label):
        """Create the connection helper.

        Args:
            label: Passed unchanged to ``MongoDBConnection``.
        """
        self.mongoDB = MongoDBConnection(label)

    def article_search_list(self, search_condition):
        """Return one page of articles matching ``search_condition``.

        Args:
            search_condition: Mapping with the required keys
                ``article_source``, ``article_db``, ``article_label_state``,
                ``startTime``, ``endTime``, ``current_page`` (zero-based),
                ``page_size``, ``article_label``, ``timerange_check``,
                ``search_type`` and ``tags``; ``update_student`` is optional.
                Filters are applied only when ``search_type`` equals
                ``"filter_search"``; any other value queries with an empty
                condition. Empty list-valued filters are skipped.

        Returns:
            A list of article dicts with ``_id`` stringified and copied to
            ``id``, and ``article_content`` stripped of HTML and cut to 240
            characters. ``[]`` on error (the error is printed and logged).
        """
        try:
            logging.info("article_search_list()")
            article_source = search_condition['article_source']
            article_db = search_condition['article_db']
            article_label_state = search_condition['article_label_state']
            startTime = search_condition['startTime']
            endTime = search_condition['endTime']
            current_page = search_condition['current_page']
            page_size = search_condition['page_size']
            article_label = search_condition['article_label']
            timerange_check = search_condition["timerange_check"]
            search_type = search_condition["search_type"]
            tags = search_condition["tags"]

            coll = self.mongoDB.dbConnection()
            try:
                condition = {}
                if search_type == "filter_search":
                    if timerange_check == 1:
                        condition['article_publish_time'] = {
                            '$gte': startTime,
                            '$lte': endTime
                        }

                    if article_label_state:
                        condition['article_label_state'] = {
                            '$in': article_label_state
                        }

                    if article_label:
                        condition["article_label"] = {'$in': article_label}

                    if article_source:
                        # An empty list means "news from every site".
                        condition['article_source'] = {'$in': article_source}

                    if tags:
                        # Match articles carrying ANY of the requested tags.
                        condition['$or'] = [
                            {"tags": tag.strip()} for tag in tags
                        ]

                    if article_db == 1:
                        condition['is_repeate'] = 0

                    if "update_student" in search_condition:
                        condition['update_student'] = search_condition[
                            'update_student']

                print("mongodb condition: %s" % (condition,))
                result = coll.find(condition).skip(
                    page_size * current_page).limit(page_size)

                article_list = []
                for article in result:
                    article['_id'] = str(article['_id'])
                    article['id'] = article['_id']
                    plain_text = self._remove_htmlTags(
                        article['article_content'])
                    article['article_content'] = plain_text[0:240]
                    article_list.append(article)
            finally:
                # Close only after the lazy cursor has been fully read.
                self.mongoDB.dbClose()

            return article_list
        except Exception as e:
            print(e)
            # print_exc() writes the traceback itself and returns None; the
            # original wrapped it in ``print``, which added a stray "None".
            traceback.print_exc()
            logging.error(e)
            return []