def get_summary_corpus(self, start_time, end_time):
    """
    Fetch the corpus for the line-marking ("is_lined") model.

    Joins ``news_abstract_result`` with ``base_data`` and keeps the rows whose
    ``publishtime`` lies in ``[start_time, end_time)`` and that have
    ``language_type = 1`` and ``is_lined = 1``.

    :param start_time: inclusive lower bound for ``bd.publishtime``; it is
        interpolated directly into the SQL text
    :param end_time: exclusive upper bound for ``bd.publishtime``; it is
        interpolated directly into the SQL text
    :return: list of ``ClusterMessageObj``, one per fetched row, built from
        the row id, the UTF-8 encoded abstract, the formatted publish time,
        an empty string and the UTF-8 encoded site name
    """
    query = (
        " SELECT bd.id, bd.publishtime, bd.site_name, nar.abstract"
        " FROM news_abstract_result nar, base_data bd"
        " WHERE nar.bd_id = bd.id"
        " AND bd.publishtime >= '%s'"
        " AND bd.publishtime < '%s'"
        " AND nar.language_type = 1"
        " AND nar.is_lined = 1"
    ) % (start_time, end_time)

    # Constructor arguments are evaluated left to right, i.e. id, abstract,
    # publish time, site name.
    return [
        ClusterMessageObj(
            record['id'],
            record['abstract'].encode('utf-8'),
            record['publishtime'].strftime("%Y-%m-%d %H:%M:%S").decode('utf-8'),
            '',
            record['site_name'].encode('utf-8'))
        for record in self.fetch_all(query)
    ]
def parse_corpus_records(records, language_type=None):
    """
    Convert fetched rows into ``ClusterMessageObj`` instances, optionally
    keeping only those whose content passes a language check.

    :param records: iterable of row mappings with the keys ``id``, ``title``,
        ``content``, ``publishtime`` and ``site_name``
    :param language_type: when not ``None``, a row is kept only if
        ``BaseDataView.is_valid_language(language_type, content)`` is truthy
        for its UTF-8 encoded content; when ``None`` every row is kept
    :return: list of ``ClusterMessageObj`` in the order of ``records``
    """
    messages = []
    for record in records:
        message_id = record['id']
        encoded_title = record['title'].encode('utf-8')
        encoded_content = record['content'].encode('utf-8')
        published_at = record['publishtime'].strftime(
            "%Y-%m-%d %H:%M:%S").decode('utf-8')
        encoded_site = record['site_name'].encode('utf-8')
        message = ClusterMessageObj(message_id, encoded_title, published_at,
                                    encoded_content, encoded_site)

        # Language filtering: skip rows that fail the check, but only when a
        # language type was requested.
        if language_type is not None and not BaseDataView.is_valid_language(
                language_type, encoded_content):
            continue
        messages.append(message)
    return messages
def parse_corpus_records(records):
    """
    Convert fetched rows into ``ClusterMessageObj`` instances.

    :param records: iterable of row mappings with the keys ``id``, ``title``,
        ``content``, ``publishtime`` and ``site_name``
    :return: list of ``ClusterMessageObj`` in the order of ``records``
    """

    def to_message(record):
        # Fields are read in the order id, title, content, publish time,
        # site name; text fields are UTF-8 encoded.
        message_id = record['id']
        encoded_title = record['title'].encode('utf-8')
        encoded_content = record['content'].encode('utf-8')
        published_at = record['publishtime'].strftime(
            "%Y-%m-%d %H:%M:%S").decode('utf-8')
        encoded_site = record['site_name'].encode('utf-8')
        return ClusterMessageObj(message_id, encoded_title, published_at,
                                 encoded_content, encoded_site)

    return [to_message(record) for record in records]
def query_string(querystring):
    """
    Fetch every document matching ``querystring`` from the SaaS platform HTTP
    API and wrap each one in a ``ClusterMessageObj``.

    A first request reads the ``total`` field of the JSON response; the
    documents are then requested page by page (100 per request) and every
    entry of each response's ``doc`` array becomes one message object.

    :param querystring: query expression, inserted verbatim into the request
        path
    :return: list of ``ClusterMessageObj`` built from each document's ``id``,
        ``title`` and ``pubtime``
    """
    # Number of documents requested per page.
    page_size = 100

    # Common prefix of every request URI.
    prefix_req_uri = ("http://saas1:5000/enterprise_saas_platform/saas_platform/"
                      + querystring + "/")
    started_at = datetime.now()
    # One placeholder per argument: a mismatch between the two raises
    # TypeError at runtime, before any request is sent.
    logger.info(
        'starting query_string, {prefix_req_uri: %s, start_time: %s}'
        % (prefix_req_uri, started_at.strftime('%Y-%m-%d %H:%M:%S')))

    # Ask for the total number of matching documents.
    total_resp = requests.get(prefix_req_uri + "display=id/1/1")
    total = json.loads(total_resp.text)["total"]

    # Walk the result set page by page and collect the documents.
    messages = []
    display_uri_data = "display=id&title&pubtime/"
    for offset in range(0, total, page_size):
        data_uri = (prefix_req_uri + display_uri_data
                    + str(offset) + "/" + str(page_size))
        data_resp = requests.get(data_uri)
        for doc in json.loads(data_resp.text)["doc"]:
            messages.append(
                ClusterMessageObj(messageId=doc["id"],
                                  messageTitle=doc["title"],
                                  messagePublishtime=doc["pubtime"]))

    # total_seconds() covers the whole elapsed time; timedelta.seconds alone
    # drops whole days.
    elapsed_seconds = int((datetime.now() - started_at).total_seconds())
    logger.info(
        'end query_string: {prefix_req_uri: %s, total: %d, lost_seconds: %ds}'
        % (prefix_req_uri, total, elapsed_seconds))
    return messages
def dic_clusterobj(dic):
    """
    Build a ``ClusterMessageObj`` whose attributes come from ``dic``.

    The new object's ``__dict__`` is rebound to ``dic`` itself (no copy is
    made), so the object and the caller's dictionary share the same storage.

    :param dic: dictionary mapping attribute names to values
    :return: the ``ClusterMessageObj`` backed by ``dic``
    """
    cluster_message = ClusterMessageObj()
    setattr(cluster_message, '__dict__', dic)
    return cluster_message
def get_involved_china_corpus(start_time, end_time, language_type, group_id):
    """
    Fetch the clustering corpus: rows of ``base_data_view`` flagged
    ``involved_china = 1``, grouped by ``titlehash``.

    Query errors are logged and swallowed, so the rows converted before the
    failure (possibly none) are returned. A failure to connect or to create
    the cursor propagates to the caller.

    :param start_time: inclusive lower bound for ``publishtime``
    :param end_time: exclusive upper bound for ``publishtime``
    :param language_type: value compared against ``language_type``
    :param group_id: group ids for the ``IN`` clause. A list, tuple or set is
        bound as query parameters (an empty one yields an empty result
        without querying). Any other value is spliced into the SQL text as a
        pre-rendered fragment such as ``"(1, 2)"``.
    :return: list of ``ClusterMessageObj``
    """
    data = []
    start_timestamp = time.time()

    if isinstance(group_id, (list, tuple, set, frozenset)):
        group_ids = tuple(group_id)
        if not group_ids:
            # "IN ()" is not valid SQL, and an empty group list matches no row.
            return data
        # One placeholder per id. Formatting the Python sequence itself
        # renders "(1,)" or "[1, 2]", neither of which is valid SQL.
        group_clause = "(" + ", ".join(["%s"] * len(group_ids)) + ")"
    else:
        # Pre-rendered fragment: keep it as text and escape "%" so the
        # driver's parameter substitution leaves it alone.
        group_ids = ()
        group_clause = ("%s" % (group_id,)).replace("%", "%%")

    # NOTE(review): the original comment says only the first paragraph of the
    # content is taken; the query selects the whole ``content`` column, so
    # presumably the view handles that. Confirm.
    sql = (
        " SELECT id, title, content, publishtime, site_name"
        " FROM base_data_view"
        " WHERE publishtime >= %s"
        " AND publishtime < %s"
        " AND language_type = %s"
        " AND group_id IN " + group_clause +
        " AND involved_china = 1"
        " group by titlehash "
    )
    params = (start_time, end_time, language_type) + group_ids
    logger.debug(
        "starting get_involved_china_corpus, {sql: %s, params: %s}."
        % (sql, params))

    def encode_utf8(value, field_name):
        # Best-effort UTF-8 encoding: on failure, log and keep the raw value.
        try:
            return value.encode('utf-8')
        except Exception as e:
            logger.debug("Error: %s.encode('utf-8'), {exception: %s}"
                         % (field_name, e))
            return value

    # Open the database connection.
    # Internal address: 10.30.248.210, external address: 47.93.162.134.
    # NOTE(review): host and credentials are hard-coded; move them to
    # configuration / secret storage.
    db = MySQLdb.Connection(host='10.30.248.210', user='******',
                            passwd='Wi$eWeb123', db='wjbdb', charset='utf8',
                            port=5720)
    try:
        cursor = db.cursor()
        try:
            # Values are bound by the driver instead of being formatted into
            # the SQL text.
            cursor.execute(sql, params)
            for row in cursor:
                message_id = row[0]
                title = encode_utf8(row[1], 'title')
                content = encode_utf8(row[2], 'content')
                publish_time = row[3].strftime(
                    "%Y-%m-%d %H:%M:%S").decode('utf-8')
                site_name = encode_utf8(row[4], 'site_name')
                data.append(ClusterMessageObj(message_id, title, publish_time,
                                              content, site_name))
            logger.debug(
                "ending get_involved_china_corpus, {data length: %s, cost_times: %ds}"
                % (len(data), time.time() - start_timestamp))
        except Exception as e:
            logger.error(
                "Error: get_involved_china_corpus, {exception: %s}" % e)
        finally:
            cursor.close()
    finally:
        # Close the connection even when the cursor could not be created.
        db.close()
    return data