def connect_to_mysql(self):
    """Create and return a MySQL connection object."""
    conn = MySQLConnector()
    conn.open(self.mysql_connection['host'],
              self.mysql_connection['name'],
              self.mysql_connection['user'],
              self.mysql_connection['pass'])
    return conn
def OutterChecker(predictGroups, SD, ED, HRZID, Code, downGasAmount, upGasAmount):
    # Build a minute-level time index and DataFrame spanning the start and end dates
    validIndex = pd.date_range(SD, ED, freq='1min')
    validDataFrame = pd.DataFrame(index=validIndex)
    # Open the database connection
    conn = MySQLConnector()
    conn.openConnector()
    # Query the gas-flow series within this control period
    SQL_Gas = u'SELECT create_time,瞬时流量 FROM bd_xinkou_2 WHERE create_time>"%s" and create_time<"%s"' % (
        SD, ED)
    conn.cursor.execute(SQL_Gas)
    realGasSeries = np.array(conn.cursor.fetchall())
    if len(realGasSeries) == 0:
        return 1, [1], np.random.randn(len(validIndex)), np.random.randn(
            len(validIndex))
    realGasDataFrame = pd.DataFrame(np.array(realGasSeries[:, 1], dtype=np.float32),
                                    index=realGasSeries[:, 0],
                                    columns=[u'瞬时流量'])
    realGasDataFrame = realGasDataFrame.resample('1min').mean()
    validDataFrame[u'瞬时流量'] = realGasDataFrame[u'瞬时流量']
    realGasMean = realGasSeries[:, 1].mean()
    # Query the heat-exchange station's primary return-water temperature
    SQL_HFBT = u'SELECT create_time,一次回温度 FROM bd_xinkou_hrz WHERE project_sub_station_id=%d and code="%s" and create_time>"%s" and create_time<"%s"' % (
        HRZID, Code, SD, ED)
    conn.cursor.execute(SQL_HFBT)
    realHFBTArray = np.array(conn.cursor.fetchall())
    if len(realHFBTArray) == 0:
        return 1, [1], np.random.randn(len(validIndex)), np.random.randn(
            len(validIndex))
    realHFBTSeries = pd.Series(np.array(realHFBTArray[:, 1], dtype=np.float32),
                               index=realHFBTArray[:, 0],
                               name=u'一次回温度')
    realHFBTSeries = realHFBTSeries.resample('1min').mean()
    validDataFrame[u'一次回温度'] = realHFBTSeries
    # Determine the appropriate index into the prediction groups
    try:
        assert realGasMean > downGasAmount
        assert realGasMean < upGasAmount
        predictGroupsIndex = int(realGasMean // 100 - downGasAmount // 100)
    except AssertionError:
        if realGasMean <= downGasAmount:
            predictGroupsIndex = 0
        else:
            predictGroupsIndex = -1
    targetPredict = predictGroups[predictGroupsIndex]
    validDataFrame[u'预测回水温度'] = targetPredict
    validDataFrame = validDataFrame.fillna(method='ffill').fillna(
        method='bfill')
    score = evalueHFBT(validDataFrame[u'瞬时流量'].as_matrix().ravel(),
                       validDataFrame[u'一次回温度'].as_matrix().ravel(),
                       validDataFrame[u'预测回水温度'].as_matrix().ravel())
    GasDis = [1]
    print u'Actual gas consumption during this control period: %.1f' % realGasMean
    return score, GasDis, validDataFrame[u'一次回温度'].as_matrix().ravel(), \
        validDataFrame[u'预测回水温度'].as_matrix().ravel()
def configSQL(self):
    connector = MySQLConnector(LOGIN_INFO_PATH)
    self.sqlConn = connector.connect()
    # create the database if it does not exist
    if DATABASE_NAME not in self.sqlConn.getDatabaseList():
        self.sqlConn.createDatabase(DATABASE_NAME)
    db = self.sqlConn.getDatabase(DATABASE_NAME)
    # create the tables if they do not exist
    if SYMBOL_DETAIL_TABLE_NAME not in db.getTableList():
        db.createTable(SYMBOL_DETAIL_TABLE_NAME, SYMBOL_DETAIL_TABLE)
        db.commit()
    self.sqlConn.close()
def __init__(self, date):
    self.date = date
    self.connector = MySQLConnector()
    self.logger = logging.getLogger('analyzer')
    self.logger.setLevel(level=logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    self.logger.addHandler(ch)
    self.logger.info("Finished initialization.")
def updateHistData(symbols: list):
    connector = MySQLConnector(LOGIN_INFO_PATH)
    sqlConn = connector.connect()
    saveCount = 0
    for i in range(len(symbols)):
        print("updating... {}/{}".format(i + 1, len(symbols)))
        symbol = symbols[i]
        yf = yahoofinance.YFData(symbol)
        updateHistPrice(sqlConn, yf, symbol)
        updateDividend(sqlConn, yf, symbol)
        updateStocksplit(sqlConn, yf, symbol)
        checkAdjclose(sqlConn, yf, symbol)
        saveCount += 1
        # commit in batches of 100 symbols
        if saveCount == 100:
            sqlConn.commit()
            saveCount = 0
    sqlConn.commit()
    sqlConn.close()
def getSymbolList() -> list:
    connector = MySQLConnector(LOGIN_INFO_PATH)
    sqlConn = connector.connect()
    sql = """
        SELECT `symbols`.`symbols`.`symbol`,
               `symbols`.`symbols`.`enable`,
               `symbols`.`symbol_details`.`marketCap`
        FROM `symbols`.`symbols`
        INNER JOIN `symbols`.`symbol_details`
            ON `symbols`.`symbols`.`symbol`=`symbols`.`symbol_details`.`symbol`
        WHERE `symbols`.`symbols`.`enable` = 1
            and `symbols`.`symbol_details`.`marketCap` > 0;
    """
    data = sqlConn.execute(sql)
    symbols = data["symbol"]
    sql = """
        SELECT `symbol` FROM `symbols`.`symbol_details`
        WHERE `quoteType` = "INDEX";
    """
    data = sqlConn.execute(sql)
    symbols.extend(data["symbol"])
    sqlConn.close()
    return symbols
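# Usage sketch (an assumption, not part of the original snippets): getSymbolList()
# above and updateHistData() from the previous snippet appear to belong to the same
# project, so a periodic refresh might simply chain them.
if __name__ == '__main__':
    symbols = getSymbolList()   # enabled symbols with marketCap > 0, plus index symbols
    updateHistData(symbols)     # refresh prices, dividends, splits and adjusted close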
def __init__(self, last_date=None, this_date=None):
    self.connector = MySQLConnector()
    self.logger = logging.getLogger('crawler')
    self.logger.setLevel(level=logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)  # log-level threshold for console output
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    self.logger.addHandler(ch)
    # load media list
    if last_date is None:
        last_date = self.get_last_date()
    self.last_date = datetime.datetime.strptime(last_date, '%Y%m%d').replace(tzinfo=pytz.utc)
    if this_date is not None:
        self.today = this_date
    else:
        self.today = time.strftime("%Y%m%d", time.localtime())
    self.logger.info("Finished initialization.")
def __init__(self, date):
    self.date = datetime.datetime.strptime(date, '%Y%m%d')
    self.connector = MySQLConnector()
    leancloud.init("U83hlMObhFRFRS4kX3lOxSlq-gzGzoHsz", "Jw2Y6KFFsjI5kEz1qYqQ62da")
    logging.basicConfig(level=logging.DEBUG)
    self.logger = logging.getLogger('analyzer')
    self.logger.setLevel(level=logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    self.logger.addHandler(ch)
    self.logger.info("Finished initialization.")
def save_to_trips(registers):
    """
    Saves a list of registers to local>taxi>trips

    Input:
        registers: list of suitable dictionaries
    Output:
        None
    """
    # Create MySQL connection
    cnx = MySQLConnector()
    # Insert registers into trips table
    for reg in registers:
        cnx.insert(reg)
    # Close connection
    cnx.close()
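# Usage sketch (hypothetical field names — the actual trips schema is not shown in
# this snippet): build one dictionary per trip record and pass the list to
# save_to_trips(), which inserts each one and closes the connection.
example_registers = [
    {"pickup_datetime": "2016-01-01 00:12:22", "passenger_count": 1, "trip_distance": 2.5},
    {"pickup_datetime": "2016-01-01 00:14:05", "passenger_count": 2, "trip_distance": 0.9},
]
save_to_trips(example_registers)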
class Analyzer:
    def __init__(self, date):
        self.date = date
        self.connector = MySQLConnector()
        self.logger = logging.getLogger('analyzer')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
        self.logger.info("Finished initialization.")

    def get_sentiment_score(self, title, text):
        sent_analyzer = SentimentIntensityAnalyzer()
        title_score = sent_analyzer.polarity_scores(title)['compound']
        title_score = int(round((title_score + 1) * 10))
        sents = sent_tokenize(text)
        scores = []
        for sent in sents:
            if len(word_tokenize(sent)) > 1:  # sentence contains more than one word
                score = sent_analyzer.polarity_scores(sent)['compound']
                scores.append(score)
        text_score = np.mean(scores)
        text_score = int(round((text_score + 1) * 40))
        final_score = title_score + text_score
        return final_score

    def preprocess_text(self, text):
        text = text.replace("'", "''")
        return text

    def get_summary(self, title, text):
        article = Article(url='')
        article.title = title
        article.text = text
        article.download_state = ArticleDownloadState.SUCCESS
        article.is_parsed = True
        article.nlp()
        return self.preprocess_text(article.summary)

    def analyze(self):
        # load articles
        cursor = self.connector.connect()
        sql = "SELECT articleIndex, title, text FROM news.article WHERE downloadDate=%s and groupIndex is not null" % self.date
        # sql = "SELECT articleIndex, title, text FROM news.article WHERE groupIndex is not null"
        try:
            cursor.execute(sql)
            self.articles = cursor.fetchall()
            # print(self.articles)
        except:
            self.logger.info("Error: unable to fetch data")
        for article in self.articles:
            # generate summary
            summary = self.get_summary(article[1], article[2])
            # sentiment analysis
            sentiment_score = self.get_sentiment_score(article[1], article[2])
            # print(article[0] + '\t' + article[1] + '\t' + str(sentiment_score))
            # upload to DB
            sql = "UPDATE news.article SET sentimentScore='%d',summary='%s' WHERE articleIndex = '%s'" % (
                sentiment_score, summary, article[0])
            # sql = "UPDATE news.article SET sentimentScore='%d' WHERE articleIndex = '%s'" % (sentiment_score, article[0])
            try:
                cursor.execute(sql)
                self.connector.db.commit()
                self.logger.info(
                    "Successfully uploaded article %s with score=%d",
                    article[0], sentiment_score)
            except:
                self.logger.info("Unable to update summary and score!")
                self.connector.db.rollback()
        self.connector.disconnect()


# analyzer = Analyzer(date='20200801')
# analyzer.analyze()
class Crawler:
    def __init__(self, last_date=None, this_date=None):
        self.connector = MySQLConnector()
        self.logger = logging.getLogger('crawler')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)  # log-level threshold for console output
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
        # load media list
        if last_date is None:
            last_date = self.get_last_date()
        self.last_date = datetime.datetime.strptime(last_date, '%Y%m%d').replace(tzinfo=pytz.utc)
        if this_date is not None:
            self.today = this_date
        else:
            self.today = time.strftime("%Y%m%d", time.localtime())
        self.logger.info("Finished initialization.")

    def if_covid19_related(self, text):
        '''Check whether the article contains COVID-19 keywords.'''
        text_lower = text.lower()
        for kw in covid_19_keywords:
            if kw in text_lower:
                return True
        return False

    def if_url_satisfied(self, media, url):
        flag = False
        for must in media['Must']:
            if must in url:
                flag = True
        if not flag:
            return False
        if media['MustNot'] is not None:
            for mustnot in media['MustNot']:
                if mustnot in url:
                    return False
        return True

    def preprocess_text(self, text):
        text = text.replace("'", "''")
        return text

    def crawl(self):
        '''Crawl new articles (published after the last crawl time).'''
        self.cursor = self.connector.connect()
        cur_count = self.get_cur_count()
        for media in media_list:
            urls = media['URL']
            for idx, url in enumerate(urls):
                self.logger.info("Start crawl %s - %d ..." % (media['Name'], idx))
                media_website = newspaper.build(url, language='en', memoize_articles=True)
                sum = len(media_website.articles)
                self.logger.info("Found %d articles" % sum)
                for i, article in enumerate(media_website.articles):
                    print(i, article.url)
                    if not self.if_url_satisfied(media, article.url):
                        # URL must match one of the given category prefixes
                        continue
                    # print(article.url)
                    try:
                        article.download()
                        print("...download")
                        article.parse()
                        print("...parsed")
                    except:
                        continue
                    # skip articles that are too old
                    if article.publish_date is None:
                        article.publish_date = datetime.datetime.now()
                    article.publish_date = article.publish_date.replace(tzinfo=pytz.utc)
                    if article.publish_date < self.last_date:
                        continue
                    # skip articles unrelated to COVID-19
                    if not (self.if_covid19_related(article.title) or self.if_covid19_related(article.text)):
                        continue
                    # upload to the MySQL database
                    # article.nlp()
                    self.upload_article(cur_count, media['Index'], article)
                    self.logger.info("Successfully downloaded %d / %d th article %s \n %s" % (i, sum, article.url, article.title))
                    cur_count += 1
        self.connector.disconnect()

    def upload_article(self, count, media_index, article):
        # self.cursor = self.connector.connect()
        idArticle = '%s%05d' % (self.today, count)
        title = self.preprocess_text(article.title)
        url = self.preprocess_text(article.url)
        # keywords = str(article.keywords).replace("'", '"')
        # summary = self.preprocess_text(article.summary)
        text = self.preprocess_text(article.text)
        sql = """INSERT INTO `news`.`article`
                 (`articleIndex`, `title`, `mediaIndex`, `publishDate`, `downloadDate`, `image`, `url`, `text`)
                 VALUES ('%s', '%s', '%d', '%s', '%s', '%s', '%s', '%s');
              """ % (
            idArticle, title, media_index,
            article.publish_date.strftime("%Y-%m-%d %H:%M:%S"), self.today,
            article.top_image, url, text)
        try:
            self.cursor.execute(sql)
            self.connector.db.commit()
        except MySQLdb._exceptions.OperationalError:
            self.connector.db.rollback()
            self.logger.info("Unable to upload article to DB!")
        # self.connector.disconnect()

    def get_cur_count(self):
        # self.cursor = self.connector.connect()
        sql = '''select MAX(articleIndex) from news.article where articleIndex REGEXP '%s';''' % (self.today)
        try:
            self.cursor.execute(sql)
            max_idx = self.cursor.fetchall()
        except:
            self.logger.info("Unable to get current count!")
        # print(max_idx)
        # self.connector.disconnect()
        if max_idx[0][0] is None:
            count = 0
        else:
            count = int(max_idx[0][0][-5:]) + 1
        self.logger.info("Successfully get current count = %d ." % count)
        return count

    def get_last_date(self):
        self.cursor = self.connector.connect()
        sql = '''SELECT max(downloadDate) FROM news.article'''
        try:
            self.cursor.execute(sql)
            fetch_result = self.cursor.fetchall()
        except:
            self.logger.info("Unable to get last date!")
        self.connector.disconnect()
        if fetch_result[0][0] is None:
            last_date = "20200101"
        else:
            last_date = datetime.date.strftime(fetch_result[0][0], '%Y%m%d')
        print(last_date)
        self.logger.info("Successfully get last date '%s'." % last_date)
        return last_date


# crawler = Crawler(last_date='20200708', this_date='20200709')
# crawler = Crawler()
# crawler.crawl()
class Uploader:
    def __init__(self, date):
        self.date = date
        self.articles = []
        self.connector = MySQLConnector()
        leancloud.init("U83hlMObhFRFRS4kX3lOxSlq-gzGzoHsz", "Jw2Y6KFFsjI5kEz1qYqQ62da")
        logging.basicConfig(level=logging.DEBUG)

    def upload_to_old_groups(self):
        '''If newly downloaded articles fall into existing news groups, upload them directly.'''
        pass

    def upload_new_groups(self):
        '''Upload new news groups.'''
        cursor = self.connector.connect()
        sql = "SELECT articleIndex,groupIndex,title,publishDate,image,url,mediaObjId,sentimentScore,summary \
               FROM news.article INNER JOIN news.media ON article.mediaIndex = media.index \
               where groupIndex REGEXP '%s'" % self.date
        try:
            cursor.execute(sql)
            articles = cursor.fetchall()
            # print(self.articles)
        except:
            print("Error: unable to fetch data")
        self.connector.disconnect()

        self.groups = {}
        for article in articles:
            groupIndex = article[1]
            if groupIndex not in self.groups:
                self.groups[groupIndex] = []
            self.groups[groupIndex].append(article)

        for groupIndex in self.groups:
            # upload articles
            article_obj_ids = []
            group_img = None
            for article in self.groups[groupIndex]:
                Article = leancloud.Object.extend('Article')
                Media = leancloud.Object.extend('Media')
                article_obj = Article()
                article_obj.set('ArticleIndex', article[0])
                article_obj.set('GroupIndex', article[1])
                article_obj.set('Title', article[2])
                article_obj.set('Date', article[3])
                if article[4] != '':
                    article_obj.set('ImageURL', article[4])
                    group_img = article[4]
                article_obj.set('Link', article[5])
                article_obj.set('Media', Media.create_without_data(article[6]))  # Pointer
                article_obj.set('SentimentScore', article[7])
                article_obj.set('Summary', article[8])
                article_obj.save()
                article_obj_ids.append(article_obj.id)
            # upload group info
            rank_score = self.cal_group_rank_score(self.groups[groupIndex])
            # print(groupIndex + '\t' + str(rank_score))
            # print(groupIndex, rank_score)
            NewsGroup = leancloud.Object.extend('NewsGroup')
            group_obj = NewsGroup()
            group_obj.set('Title', self.groups[groupIndex][0][2])  # use the first article's title as the group title
            group_obj.set('GroupIndex', groupIndex)
            group_obj.set('RankScore', rank_score)
            if group_img is not None:
                group_obj.set('ImageURL', group_img)
            group_obj.set('Articles', article_obj_ids)
            group_obj.save()

    def cal_group_rank_score(self, articles):
        article_num = len(articles)
        if article_num == 3:
            num_score = 2
        elif 4 <= article_num <= 8:
            num_score = 3
        else:
            num_score = 1
        senti_scores = []
        media_set = set()
        for article in articles:
            senti_scores.append(article[7])
            media_set.add(article[6])
        media_num = len(media_set)
        if media_num == 1:
            media_score = 0
        elif media_num == 2:
            media_score = 1
        elif media_num <= 4:
            media_score = 2
        else:
            media_score = 3
        senti_diff = int(np.max(senti_scores) - np.min(senti_scores))
        return num_score + media_score + senti_diff
def readData(self):
    # Open a connection to the database
    conn = MySQLConnector()
    conn.openConnector()

    # ++++++++++++++ Query boiler state +++++++++++++++++++++++++++++++++++
    # Extract the column names to fetch from the SQL statement
    columnsPattern = re.compile("SELECT DISTINCT (.*?) FROM")
    columnsStr = re.findall(columnsPattern, self._QueryBoilerState)[0]
    columns = columnsStr.strip().split(',')
    columns = [(c.strip())[2:] for c in columns]
    # Normalize the column names
    if len(self._BoilerColumnsMap()) != 0:
        columns = [self._BoilerColumnsMap()[c] for c in columns]
    # Execute the boiler-state query
    conn.cursor.execute(self._QueryBoilerState)
    tempBoilerState = np.array(conn.cursor.fetchall())
    # Normalize the datetimes in this dataset
    modifiedTime = [tools_DateTimeTrans(i) for i in tempBoilerState[:, 0]]
    BoilerData = pd.DataFrame(np.array(tempBoilerState[:, 1:], np.float32),
                              index=modifiedTime,
                              columns=columns[1:])
    # ++++++++++++++ Query boiler state +++++++++++++++++++++++++++++++++++

    # ++++++++++++++ Query weather state ++++++++++++++++++++++++++++++++++
    # Extract the column names to fetch from the SQL statement
    columnsStr = re.findall(columnsPattern, self._QueryWeatherState)[0]
    columns = columnsStr.strip().split(',')
    columns = [c.strip()[2:] for c in columns]
    # Normalize the column names
    if len(self._WeatherColumnsMap()) != 0:
        columns = [self._WeatherColumnsMap()[c] for c in columns]
    # Execute the weather-state query
    conn.cursor.execute(self._QueryWeatherState)
    tempWeatherState = np.array(conn.cursor.fetchall())
    modifiedTime = [tools_DateTimeTrans(i) for i in tempWeatherState[:, 0]]
    WeatherData = pd.DataFrame(np.array(tempWeatherState[:, 1:], np.float32),
                               index=modifiedTime,
                               columns=columns[1:])
    # ++++++++++++++ Query weather state ++++++++++++++++++++++++++++++++++

    # ++++++++++++++ Query indoor state +++++++++++++++++++++++++++++++++++
    dev_ids = self._CollectInHomeDeviceID()
    dev_DataFrames = []
    for k, dev_id in enumerate(dev_ids):
        # Build the SQL statement for this device
        individualDevSQL = self._QueryInHomeState % (dev_id)
        # Extract the column names to fetch from the SQL statement
        columnsStr = re.findall(columnsPattern, individualDevSQL)[0]
        # Normalize the column names
        columns = columnsStr.strip().split(',')
        columns = [c.strip()[2:] for c in columns]
        if len(self._InHomeColumnsMap()) != 0:
            columns = [self._InHomeColumnsMap()[c] for c in columns]
        columns = [u'%d#传感器%s' % (k, c) for c in columns]
        # Execute the indoor-state query
        conn.cursor.execute(individualDevSQL)
        tempInHomeState = np.array(conn.cursor.fetchall())
        try:
            modifiedTime = [
                tools_DateTimeTrans(i) for i in tempInHomeState[:, 0]
            ]
            InHomeData = pd.DataFrame(np.array(tempInHomeState[:, 1:], np.float32),
                                      index=modifiedTime,
                                      columns=columns[1:])
            dev_DataFrames.append(InHomeData)
        except IndexError:
            print u'Indoor sensor %d# has no data' % k
            continue
    # ++++++++++++++ Query indoor state +++++++++++++++++++++++++++++++++++

    # ++++++++++++++ Query heat-exchange station state ++++++++++++++++++++
    if self._QueryHRZState != False:
        # Extract the column names to fetch from the SQL statement
        columnsStr = re.findall(columnsPattern, self._QueryHRZState)[0]
        columns = columnsStr.strip().split(',')
        columns = [c.strip()[2:] for c in columns]
        # Normalize the column names
        if len(self._HRZColumnsMap()) != 0:
            columns = [self._HRZColumnsMap()[c] for c in columns]
        # Execute the heat-exchange station state query
        conn.cursor.execute(self._QueryHRZState)
        tempHRZState = np.array(conn.cursor.fetchall())
        modifiedTime = [tools_DateTimeTrans(i) for i in tempHRZState[:, 0]]
        HRZData = pd.DataFrame(np.array(tempHRZState[:, 1:], np.float32),
                               index=modifiedTime,
                               columns=columns[1:])
    else:
        HRZData = pd.DataFrame()
    # ++++++++++++++ Query heat-exchange station state ++++++++++++++++++++

    totalData = self.concatData(BoilerData, WeatherData, dev_DataFrames, HRZData)
    return totalData
import argparse

from MySQLConnector import MySQLConnector

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-host', type=str, help='host ip/domain')
    parser.add_argument('-id', type=str, help='id')
    parser.add_argument('-db', type=str, help='Database name')
    parser.add_argument('-verbose', action='store_true', default=False, help='verbose option')
    args = parser.parse_args()

    conn = MySQLConnector(args.host, args.id, args.db, verbose=args.verbose)
    while True:
        print("====================================================================")
        print("q : query, qs : query with string, gs : get settings, p : print history, c : clear history")
        print("gh : get history, ghd : get history dict, ec : export csv, ep : export pickle, ri : remove invalid")
        print("quit : quit")
        a = input("opcode > ")
        print("--------------------------------------------------------------------")
def open_dbconnection(self):
    """ Create and return a MySQL connection object """
    conn = MySQLConnector()
    conn.open(self.mysql_connection['host'],
              self.mysql_connection['name'],
              self.mysql_connection['user'],
              self.mysql_connection['pass'])
    return conn
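# Usage sketch (hypothetical owning class; it assumes only what the snippet above
# uses: a mysql_connection dict with 'host', 'name', 'user' and 'pass' keys and the
# MySQLConnector.open() call shown there).
class DBClient:
    def __init__(self, host, name, user, password):
        self.mysql_connection = {'host': host, 'name': name,
                                 'user': user, 'pass': password}

    # same body as open_dbconnection above
    def open_dbconnection(self):
        conn = MySQLConnector()
        conn.open(self.mysql_connection['host'], self.mysql_connection['name'],
                  self.mysql_connection['user'], self.mysql_connection['pass'])
        return conn

client = DBClient('localhost', 'taxi', 'taxi_user', 'secret')
db = client.open_dbconnection()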
def __init__(self, date):
    self.date = date
    self.articles = []
    self.connector = MySQLConnector()
    leancloud.init("U83hlMObhFRFRS4kX3lOxSlq-gzGzoHsz", "Jw2Y6KFFsjI5kEz1qYqQ62da")
    logging.basicConfig(level=logging.DEBUG)
from MySQLConnector import MySQLConnector

if __name__ == "__main__":
    print("Hello world!")
    print(MySQLConnector().connect())
class Cluster():
    def __init__(self, date):
        self.date = date
        self.connector = MySQLConnector()
        self.logger = logging.getLogger('cluster')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)  # log-level threshold for console output
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
        self.logger.info("Finished initialization.")

    def remove_punctuation(self, text):
        text = re.sub(r'[{}]+'.format(punctuation), '', text)
        return text.strip().lower()

    def load_articles(self):
        '''Load articles that have not yet been grouped from the database.'''
        cursor = self.connector.connect()
        sql = "SELECT articleIndex, title, text, mediaIndex FROM news.article WHERE downloadDate='%s'" % (
            self.date)
        # sql = '''SELECT articleIndex, title, text, mediaIndex FROM news.article where downloadDate in ('20200728', '20200729', '20200731', '20200801')'''
        try:
            cursor.execute(sql)
            self.articles = cursor.fetchall()
            self.logger.info("Successfully loaded %d articles." % len(self.articles))
        except:
            self.logger.info("Error: unable to fetch data")
        self.connector.disconnect()

    def remove_useless_articles(self):
        self.load_articles()
        useless_articles = []
        media_articles = {}
        for article in self.articles:
            media = article[3]
            if media not in media_articles:
                media_articles[media] = set()
            title = article[1]
            if title in media_articles[media]:
                # deduplicate: same title already seen for this media outlet
                useless_articles.append(article[0])
                self.logger.info("Found repeated article %s" % (article[0]))
                continue
            else:
                media_articles[media].add(title)
            media_articles[article[3]].add(article[1])
            if detect(article[1]) != 'en':
                # print(article[1])
                useless_articles.append(article[0])
                self.logger.info("Found article %s not in English" % (article[0]))
            else:
                list_words = word_tokenize(self.remove_punctuation(article[2]))
                if len(list_words) < 30:
                    useless_articles.append(article[0])
                    self.logger.info("Found article %s too short" % (article[0]))
        # print(useless_articles)
        cursor = self.connector.connect()
        for articleID in useless_articles:
            sql = "DELETE FROM `news`.`article` WHERE(`articleIndex` = %s)" % (
                articleID)
            try:
                cursor.execute(sql)
                self.connector.db.commit()
            except:
                self.connector.db.rollback()
                self.logger.info("Unable to delete useless articles")
        self.connector.disconnect()
        self.logger.info("Successfully removed %d articles" % len(useless_articles))

    def encode_text(self, encode_obj='title', vector='tf-idf'):
        '''Encode the text with the chosen vectorizer.'''
        if vector == 'count':
            vectorizer = CountVectorizer(stop_words='english',
                                         lowercase=True,
                                         binary=True,
                                         tokenizer=myTokenizer)
        elif vector == 'tf-idf':
            vectorizer = TfidfVectorizer(stop_words='english',
                                         lowercase=True,
                                         tokenizer=myTokenizer)
        corpus = []
        if encode_obj == 'title':  # title
            idx = 1
        # elif encode_obj == 'summary':  # summary
        #     idx = 4
        elif encode_obj == 'text':  # full text
            idx = 2
        for article in self.articles:
            corpus.append(article[idx])
        X = vectorizer.fit_transform(corpus)
        # print(len(vectorizer.get_feature_names()))
        # print(vectorizer.get_feature_names())
        return X

    def cluster(self):
        X = self.encode_text(encode_obj='text', vector='tf-idf')
        db_model = DBSCAN(eps=0.4, min_samples=3, metric='cosine').fit(X)
        joblib.dump(db_model, './models/model_%s.pkl' % self.date)
        self.group_list = db_model.labels_
        # Log the clustering result
        self.logger.info("Cluster results: %d groups " % np.max(self.group_list))
        # for i in range(len(self.group_list)):
        #     if self.group_list[i] != -1:
        #         print(str(self.group_list[i]) + '\t' + str(self.articles[i][3]) + '\t' + self.articles[i][1])
        group_result = {}
        for i, label in enumerate(self.group_list):
            if label not in group_result:
                group_result[label] = []
            group_result[label].append(self.articles[i][1])
        for i in range(0, np.max(self.group_list) + 1):
            print(i)
            print(group_result[i])

    def upload_groups_to_DB(self):
        cursor = self.connector.connect()
        for i, label in enumerate(self.group_list):
            if label == -1:
                continue
            sql = "UPDATE news.article SET groupIndex='%s%03d' WHERE articleIndex = '%s'" % (
                self.date, label, self.articles[i][0])
            # print(sql)
            try:
                cursor.execute(sql)
                self.connector.db.commit()
            except:
                # roll back on error
                self.logger.info("Unable to update group index!")
                self.connector.db.rollback()
        self.connector.disconnect()
        self.logger.info("Successfully updated group indices.")