class MongoDBTempInterface(TempArticleInterface):
    """Paginated reader over the temporary article collection.

    Pages are served in ``article_publish_time`` ascending order; once a
    page shorter than ``page_list`` is returned the reader is considered
    exhausted and ``extractPagination`` returns None from then on.
    """

    def __init__(self, page_list=100):
        # page_list: number of documents fetched per page.
        self.obj = MongoDBConnection()
        self.temp_collection = self.obj.dbConnectionTemp()
        # NOTE(review): 'postive' spelling kept -- external code may reference it.
        self.collection_postive = self.obj.dbConnection()
        self.collection_negative = self.obj.dbConnectionNegative()
        self.page_list = page_list
        # Flipped to True once the final (short) page has been served.
        self.stop = False
        self.dict_time = {}
        self.loop = 2
        # Daily error-log file used by writeerrorlog (defined elsewhere).
        self.filename = db_settings.MONGO_LOG_FILE + str(datetime.date.today())

    def extractPagination(self, pagenumber):
        """Return page ``pagenumber`` as a pymongo cursor, or None when
        exhausted or on error.

        NOTE(review): the cursor is opened with no_cursor_timeout=True, so
        the caller must exhaust or close it, otherwise it stays alive on
        the server.
        """
        try:
            if self.stop:
                return None
            data = self.temp_collection.find(
                {}, no_cursor_timeout=True
            ).hint(
                [("article_publish_time", pymongo.ASCENDING)]
            ).skip(pagenumber * self.page_list).limit(self.page_list)
            # count(True) honours skip/limit: a short page means end of data.
            if data.count(True) < self.page_list:
                self.stop = True
            return data
        # Broad on purpose: any DB failure is logged, never raised to callers.
        except BaseException as error:
            self.writeerrorlog(error)
            return None
def syns_data(startTimeStr): # 将数据同步到es中 try: mongoDB = MongoDBConnection("articles_testN") logging.info("[sysn_data] startTimeStr:" + startTimeStr) endTime = datetime.datetime.now() endTimeStr = endTime.strftime('%Y-%m-%d') curTime = datetime.datetime.strptime(startTimeStr, "%Y-%m-%d") print "startTime:", curTime print "endTime", endTimeStr es = ES() while curTime <= endTime: curTimeStr = curTime.strftime('%Y-%m-%d') coll = mongoDB.dbConnection() logging.info("[sysn_data] ** time: " + curTimeStr) print "****** time:", curTimeStr data_list = coll.find({'article_publish_time': curTimeStr}, no_cursor_timeout=True) mongoDB.dbClose() for one_data in data_list: url = "http://localhost:9200/news_spider_db/articles_testN/" + str( one_data["_id"]) print one_data["_id"] one_data.pop("_id") result = es.put(url, one_data) print result curTime = curTime + datetime.timedelta(days=1) except BaseException, e: logging.error(e)
def __init__(self, re_spider_log, collection_name=""):
    """Open a MongoDB connection factory and initialise the ``operation`` base.

    re_spider_log: forwarded verbatim to operation.__init__.
    collection_name: accepted but unused in this body -- TODO confirm callers.
    """
    self.obj = MongoDBConnection()
    operation.__init__(self, re_spider_log)
    # Pagination end-flag consumed by the paging helpers of the class.
    self.stop = False
class DataUtil(object):
    """Builds a balanced (positive/negative) training set from MongoDB."""

    def __init__(self, label):
        # label selects the database handled by MongoDBConnection.
        self.mongoDB = MongoDBConnection(label)

    def filter_data(self, data_list, label):
        """Project raw article documents to training rows.

        Each row keeps the unknown-filtered token ids of content and title
        plus the supplied class label.
        """
        new_datalist = []
        for data in data_list:
            new_datalist.append({
                "processed_content": self.del_unknowIndex(
                    data['processed_content']),
                "processed_title": self.del_unknowIndex(
                    data['processed_title']),
                "label": label,
            })
        return new_datalist

    def del_unknowIndex(self, enum):
        """Drop the unknown-word token id (511001) from a token sequence."""
        return [x for x in enum if x != 511001]

    def get_data(self):
        """Return {'pos': [...], 'neg': [...]} with negatives capped at the
        positive count; {} when there are no positives or on error."""
        try:
            logging.info("get_data()")
            coll = self.mongoDB.dbConnection()
            positive_data = coll.find({
                'article_label': 1,
                'article_label_state': {'$gte': 1}
            })
            pos_data = self.filter_data(positive_data, 1)
            pos_length = len(pos_data)
            logging.info("pos_length=[" + str(pos_length) + ']')
            # Was a bare debug print; route through logging like the rest
            # of the module.
            logging.info("positive data num: %s", pos_length)
            if pos_length == 0:
                # Was `return []`: use {} so the empty, success and error
                # paths all share one return type (all falsy when empty).
                return {}
            # Cap negatives at the positive count to keep the classes balanced.
            negative_data = coll.find({
                'article_label': 0,
                'article_label_state': {'$gte': 1}
            }).limit(pos_length)
            neg_data = self.filter_data(negative_data, 0)
            self.mongoDB.dbClose()
            data = {}
            data['pos'] = pos_data
            data['neg'] = neg_data
            return data
        except BaseException as e:
            logging.error(e)
            return {}
class ArticleDAO(object):
    """Read-side DAO for paged article listings."""

    def __init__(self, label):
        # label selects the database handled by MongoDBConnection.
        self.mongoDB = MongoDBConnection(label)

    def article_search_list(self, search_condition):
        """Return one page of articles matching ``search_condition``.

        Expected keys: article_source (list), article_db,
        article_label_state, startTime/endTime, current_page, page_size,
        optional update_student.  Returns a list of trimmed article dicts,
        or None on error.
        """
        try:
            article_source = search_condition['article_source']
            article_db = search_condition['article_db']
            article_label_state = search_condition['article_label_state']
            startTime = search_condition['startTime']
            endTime = search_condition['endTime']
            current_page = search_condition['current_page']
            page_size = search_condition['page_size']
            coll = self.mongoDB.dbConnection()
            condition = {
                "article_label_state": article_label_state,
                'article_publish_time': {
                    '$gte': startTime,
                    '$lte': endTime
                }
            }
            if len(article_source) != 0:
                # An empty source list means "all sites".
                condition['article_source'] = {'$in': article_source}
            if article_db == 1:
                condition['is_repeate'] = 0
            # `in` replaces the Python-2-only dict.has_key().
            if "update_student" in search_condition:
                condition['update_student'] = search_condition[
                    'update_student']
            result = coll.find(condition).skip(page_size *
                                               current_page).limit(page_size)
            self.mongoDB.dbClose()
            article_list = []
            for article in result:
                article['_id'] = str(article['_id'])
                article['id'] = article['_id']
                # pop(key, None): the original popped 'article_discuss' twice
                # without a default, so the second pop raised KeyError and the
                # whole page was discarded by the except handler below.
                article.pop('article_discuss', None)
                article.pop('processed_content', None)
                article.pop('processed_title', None)
                article.pop('article_attend_number', None)
                article.pop('article_discuss_number', None)
                article['article_content'] = self._remove_htmlTags(
                    article['article_content'])
                article['article_content'] = article['article_content'][0:240]
                article_list.append(article)
            return article_list
        except BaseException as e:
            logging.error(e)
            return None
def __init__(self, page_list=100):
    """Open the temp/positive/negative collections and init paging state.

    page_list: number of documents served per page -- TODO confirm against
    the paging methods of the enclosing class (not visible in this chunk).
    """
    self.obj = MongoDBConnection()
    self.temp_collection = self.obj.dbConnectionTemp()
    # NOTE(review): 'postive' spelling kept -- external code may reference it.
    self.collection_postive = self.obj.dbConnection()
    self.collection_negative = self.obj.dbConnectionNegative()
    #self.last_objectid = None
    self.page_list = page_list
    # Flipped to True once the final (short) page has been served.
    self.stop = False
    self.dict_time = {}
    self.loop = 2
    # Daily log file name, e.g. <MONGO_LOG_FILE>YYYY-MM-DD.
    self.filename = db_settings.MONGO_LOG_FILE + str(datetime.date.today())
class TagsDAO(object):
    """DAO for the 'articles_tags' collection."""

    def __init__(self):
        self.mongoDB = MongoDBConnection('articles_tags')

    def addTag(self, tag):
        """Insert ``tag`` unless it already exists; True on success.

        NOTE(review): duplicate check relies on self.find(tag), which is
        defined outside this chunk -- confirm it returns truthy on a hit.
        """
        try:
            if self.find(tag):
                # Already present: treat the add as an idempotent success.
                return True
            information = {"tag": tag}
            coll = self.mongoDB.dbConnection()
            coll.insert(information)
            self.mongoDB.dbClose()
            return True
        except BaseException as e:
            logging.error(e)
            return False
class SystemSetting(object):
    """Key/value flags stored in the 'system_setting' database.

    (Made a new-style class for consistency with the other DAO classes.)
    """

    def __init__(self):
        self.mongoDB = MongoDBConnection("system_setting")

    def lock_model_training(self):
        """Set the classifier-training lock flag; True on success.

        Guards against concurrent model training (memory/CPU heavy) by
        upserting {key: 'classifier_model_training_lock', value: True}.
        NOTE(review): the read-then-write is not atomic -- two racing
        callers can both pass; a real mutex would need an atomic upsert
        such as find_and_modify.
        """
        try:
            coll = self.mongoDB.dbConnection()
            result = coll.find_one({"key": "classifier_model_training_lock"})
            if result is None:
                coll.insert({"key": "classifier_model_training_lock",
                             "value": True})
            else:
                coll.update({"key": "classifier_model_training_lock"},
                            {"$set": {"value": True}})
            self.mongoDB.dbClose()
            return True
        except BaseException as e:
            logging.error(e)
            return False
class news_operation(operation):
    # NOTE(review): the trailing `finally:` of search_news_page has no body in
    # this chunk -- the definition is truncated here and is not valid syntax
    # as shown; the cleanup it performed (likely closing the notimeout cursor)
    # is not visible.  Code below is kept byte-for-byte.

    def __init__(self, re_spider_log, collection_name=""):
        # Connection factory plus base-class init; collection_name is accepted
        # but unused in this body.
        self.obj = MongoDBConnection()
        operation.__init__(self, re_spider_log)
        # Flipped to True once the final (short) page has been served.
        self.stop = False

    def reset_para(self):
        # Re-arm pagination after a previous sweep finished.
        self.stop = False

    def search_news_page(
        self,
        condition,
        page_number=10,
        page=0,
    ):
        # Returns one page (projection: _id and article_url only) of negative
        # articles matching `condition`, ordered by article_publish_time;
        # None once exhausted or on error.  The no_cursor_timeout cursor must
        # be closed by the consumer or it stays alive on the server.
        try:
            if self.stop:
                return None
            self.collection_negative = self.obj.dbConnectionNegative()
            data = self.collection_negative.find(
                condition, {
                    "_id": 1,
                    "article_url": 1
                },
                no_cursor_timeout=True).hint([
                    ("article_publish_time", pymongo.ASCENDING)
                ]).skip(page_number * page).limit(page_number)
            # A page shorter than page_number marks the end of the data set.
            if data.count(True) < page_number:
                self.stop = True
            return data
        except BaseException, error:
            self.write_log(error)
            return None
        finally:
def __init__(self, label):
    """Bind this object to the database selected by ``label``."""
    self.mongoDB = MongoDBConnection(label)
def __init__(self):
    """Bind this object to the 'articles_tags' database."""
    self.mongoDB = MongoDBConnection('articles_tags')
    # NOTE(review): this `pass` is redundant.
    pass
def __init__(self):
    """Bind this object to the 'user' database."""
    self.mongoDB = MongoDBConnection('user')
class UserDAO(object): def __init__(self): self.mongoDB = MongoDBConnection('user') def addUser(self, user): try: information = { "username": user.username, "password": user.password, "role": user.role } coll = self.mongoDB.dbConnection() coll.insert(information) self.mongoDB.dbClose() return True except: return False def isexist(self, user): coll = self.mongoDB.dbConnection() query_result = coll.find_one({"username": user.username}) self.mongoDB.dbClose() if query_result: return True else: return False def login(self, user): coll = self.mongoDB.dbConnection() query_result = coll.find_one({ "username": user.username, 'password': user.password }) print query_result self.mongoDB.dbClose() if query_result: query_result['_id'] = str(query_result['_id']) return query_result else: return None def userlist(self): coll = self.mongoDB.dbConnection() query_result = coll.find() self.mongoDB.dbClose() userList = [] for item in query_result: item['_id'] = str(item['_id']) item['id'] = item['_id'] #print item userList.append(item) return userList def deluser(self, userId): coll = self.mongoDB.dbConnection() query_result = coll.remove({"_id": ObjectId(userId)}) self.mongoDB.dbClose() return True
def __init__(self):
    """Bind this object to the 'system_setting' database."""
    self.mongoDB = MongoDBConnection("system_setting")
class ArticleDAO(object): def __init__(self, label): self.mongoDB = MongoDBConnection(label) def article_search_list(self, search_condition): try: logging.info("article_search_list()") article_source = search_condition['article_source'] article_db = search_condition['article_db'] article_label_state = search_condition['article_label_state'] startTime = search_condition['startTime'] endTime = search_condition['endTime'] current_page = search_condition['current_page'] page_size = search_condition['page_size'] article_label = search_condition['article_label'] timerange_check = search_condition["timerange_check"] search_type = search_condition["search_type"] tags = search_condition["tags"] coll = self.mongoDB.dbConnection() condition = {} if search_type == "filter_search": if timerange_check == 1: condition['article_publish_time'] = { '$gte': startTime, '$lte': endTime } if len(article_label_state) != 0: condition['article_label_state'] = { '$in': article_label_state } if len(article_label) != 0: condition["article_label"] = {'$in': article_label} if len(article_source) != 0: # 如果列表长度为0则查询所有网站新闻 condition['article_source'] = {'$in': article_source} if len(tags) != 0: orlist = [] for tag in tags: orlist.append({"tags": tag.strip()}) condition['$or'] = orlist if article_db == 1: condition['is_repeate'] = 0 if search_condition.has_key("update_student"): condition['update_student'] = search_condition[ 'update_student'] print "mongodb condition:", condition result = coll.find(condition).skip(page_size * current_page).limit(page_size) self.mongoDB.dbClose() article_list = [] for article in result: article['_id'] = str(article['_id']) article['id'] = article['_id'] article['article_content'] = self._remove_htmlTags( article['article_content']) article['article_content'] = article['article_content'][0:240] article_list.append(article) return article_list except BaseException, e: print e print traceback.print_exc() logging.error(e) return []