def __init__(self, pred_config: Pred_config, keyword=None, contents_id=None):
    self.pred_config = pred_config
    self.engine = create_engine(
        "mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(
            'root', 'robot369', '1.221.75.76', 3306, 'datacast2'))
    self.args = self.pred_config.get_args()
    ## device to run on (CUDA or CPU)
    self.device = self.pred_config.get_device()
    ## batch size (number of inputs the model processes at once)
    self.batch_size = self.pred_config.batch_size
    ## load the model
    self.model = self.pred_config.load_model(self.args, self.device)
    ## load the tokenizer
    self.tokenizer = self.pred_config.load_tokenizer()
    self.nlp = Mecab()
    self.keyword = keyword
    self.contents_id = contents_id
    self.db = Sql("datacast2")
class Predict:
    def __init__(self, pred_config: Pred_config, keyword=None, contents_id=None):
        self.pred_config = pred_config
        self.engine = create_engine(
            "mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(
                'root', 'robot369', '10.96.5.179', 3306, 'datacast2'))
        self.args = self.pred_config.get_args()
        ## device to run on (CUDA or CPU)
        self.device = self.pred_config.get_device()
        ## batch size (number of inputs the model processes at once)
        self.batch_size = self.pred_config.batch_size
        ## load the model
        self.model = self.pred_config.load_model(self.args, self.device)
        ## load the tokenizer
        self.tokenizer = self.pred_config.load_tokenizer()
        self.nlp = Mecab()
        self.keyword = keyword
        self.contents_id = contents_id
        self.db = Sql("datacast2")

    def verbs(self, phrase):
        """Verb extractor (Mecab VV tags)."""
        verbs = ['VV']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in verbs]

    def adjs(self, phrase):
        """Adjective/interjection extractor (Mecab VA, IC tags)."""
        adjs = ['VA', 'IC']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in adjs]

    def read(self):
        ## read the sentence rows for this contents_id/keyword as a pandas DataFrame
        print('sql:',
              "SELECT ct.channel,cc.contents_id,cs.text from crawl_task as ct "
              "join crawl_contents as cc on ct.task_id=cc.task_id "
              "JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
              "WHERE cc.contents_id='%s' and ct.keyword='%s'" % (self.contents_id, self.keyword))
        df_sentence_rows = pd.read_sql(
            "SELECT ct.keyword, ct.channel, cc.contents_id as contents_id, "
            "cs.sentence_id as sentence_id, cs.text as sentence "
            "from crawl_task as ct "
            "join crawl_contents as cc on ct.task_id=cc.task_id "
            "JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
            "WHERE cc.contents_id='%s' and ct.keyword='%s'" % (self.contents_id, self.keyword),
            self.engine)
        return df_sentence_rows

    def convert_input_sentence_to_tensor_dataset(self, df_sentence_rows,
                                                 cls_token_segment_id=0,
                                                 pad_token_segment_id=0,
                                                 sequence_a_segment_id=0,
                                                 mask_padding_with_zero=True):
        tokenizer = self.tokenizer
        args = self.args
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        pad_token_id = tokenizer.pad_token_id

        all_input_ids = []
        all_attention_mask = []
        all_token_type_ids = []

        ### read each input sentence and convert it to tensor-ready features
        for index in df_sentence_rows.index:
            sentence = df_sentence_rows.at[index, 'sentence']
            tokens = tokenizer.tokenize(sentence)
            # Account for [CLS] and [SEP]
            special_tokens_count = 2
            # Truncate sentences longer than the maximum sequence length
            if len(tokens) > args.max_seq_len - special_tokens_count:
                tokens = tokens[:(args.max_seq_len - special_tokens_count)]
            # Add [SEP] token
            tokens += [sep_token]
            token_type_ids = [sequence_a_segment_id] * len(tokens)
            # Add [CLS] token
            tokens = [cls_token] + tokens
            token_type_ids = [cls_token_segment_id] + token_type_ids
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
            # Zero-pad up to the sequence length.
            padding_length = args.max_seq_len - len(input_ids)
            input_ids = input_ids + ([pad_token_id] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            all_input_ids.append(input_ids)
            all_attention_mask.append(attention_mask)
            all_token_type_ids.append(token_type_ids)

        # Change to Tensor
        all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
        all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
        all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
        return dataset

    def predict(self):
        ## arguments saved at fine-tuning time (training_args.bin)
        args = self.args
        ## device to run on (CUDA or CPU)
        device = self.device
        ## batch size (number of inputs the model processes at once)
        batch_size = self.batch_size
        ## the fine-tuned model
        model = self.model
        logger.info(args)

        ## fetch the data to run sentiment analysis on
        df_sentence_data_rows = self.read()
        dataset = self.convert_input_sentence_to_tensor_dataset(df_sentence_data_rows)

        # Feed the dataset to the model to get outputs
        # Predict
        sampler = SequentialSampler(dataset)
        data_loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

        preds = None
        probs = None
        print(type(data_loader), len(data_loader))
        for index, batch in enumerate(tqdm(data_loader, desc="Prediction")):
            batch = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                inputs = {"input_ids": batch[0],
                          "attention_mask": batch[1],
                          "labels": None}
                if args.model_type != "distilkobert":
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                logits = outputs[0]
                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    probs = np.exp(logits.detach().cpu().numpy()) / (1 + np.exp(logits.detach().cpu().numpy()))
                else:
                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                    probs = np.append(probs,
                                      np.exp(logits.detach().cpu().numpy()) / (1 + np.exp(logits.detach().cpu().numpy())),
                                      axis=0)

        preds = np.argmax(preds, axis=1).tolist()
        prob_max_index = np.argmax(probs, axis=-1)
        maximum_probs = probs[np.arange(probs.shape[0]), prob_max_index]
        df_sentence_data_rows['positiveness'] = preds

        # Update each sentence row with extracted morphemes and the predicted polarity
        for idx in tqdm(df_sentence_data_rows.index, desc="sentence_analysis&db_update"):
            try:
                sentence_id = df_sentence_data_rows.at[idx, 'sentence_id']
                sentence = df_sentence_data_rows.at[idx, 'sentence']
                positiveness = df_sentence_data_rows.at[idx, 'positiveness']
                nouns = list(set(self.nlp.nouns(sentence)))
                nouns = json.dumps(nouns, ensure_ascii=False)
                verbs = list(set(self.verbs(sentence)))
                verbs = json.dumps(verbs, ensure_ascii=False)
                adjs = list(set(self.adjs(sentence)))
                adjs = json.dumps(adjs, ensure_ascii=False)
                self.db.update_multi_column(
                    "crawl_sentence",
                    update_dict={"nouns": nouns, "verbs": verbs, "adjs": adjs,
                                 "positiveness": float(positiveness)},
                    where_dict={"sentence_id": float(sentence_id)})
            except Exception as e:
                print(e)
                continue
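# Usage sketch: a minimal example of driving the Pred_config-based Predict class above.
# Assumptions: Pred_config() can be constructed with no arguments (as in the execution
# scripts below); the keyword and contents_id values here are hypothetical placeholders.
if __name__ == '__main__':
    obj_pred_config = Pred_config()
    obj_predict = Predict(obj_pred_config, keyword='sample keyword', contents_id='1')
    obj_predict.predict()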
from predict_execution_en_review import *
from db.almaden import Sql

db = Sql("datacast2")
contents_row = db.select('crawling_status_youtube_view', '*', 'contents_status="GF"')

## predict
for row in contents_row:
    keyword = row['keyword']
    contents_id = row['contents_id']
    n_reply_crawled = row['n_reply_crawled']
    if n_reply_crawled is not None and n_reply_crawled > 0:
        db.update_one('crawl_contents', 'crawl_status', 'SI', 'contents_id', contents_id)
        obj_predict = Predict(keyword=row['keyword'], channel='youtube', contents_id=contents_id)
        obj_predict.predict()
    else:
        task_ids = db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
        pass
    db = Sql("datacast2")
    db.update_one('crawl_contents', 'crawl_status', 'SF', 'contents_id', contents_id)
from predict_execution_blog import *
from db.almaden import Sql

db = Sql("datacast2")
blog_channel_list = str(('naverblog', 'instagram', 'GooglePlay'))
review_channel_list = str(('navershopping', 'youtube'))
task_row = db.select(
    '''
    crawl_request AS cr
    JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
    JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
    JOIN crawl_task AS ct ON crt.task_id=ct.task_id
    ''',
    'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
    'ct.crawl_status="GF" and ct.channel in %s' % (blog_channel_list))

## pred_config
obj_pred_config = Pred_config()

## predict
for row in task_row:
    task_id = row['task_id']
    channel = row['channel']
    real_task = db.select('crawl_task', '*', 'task_id=%s' % (task_id))
    real_crawl_status = real_task[0]['crawl_status']
    if real_crawl_status == 'GF':
        db.update_one('crawl_task', 'crawl_status', 'SI', 'task_id', task_id)
        obj_predict = Predict(obj_pred_config, task_id=task_id,
from predict_execution_en_blog import *
from db.almaden import Sql

db = Sql("datacast2")
key = 'sonnen home battery'
cha = 'twitter'
request_row = db.select('crawl_request', '*', f'crawl_status="GF" and keyword="{key}" and channel="{cha}"')

## predict
for row in request_row:
    obj_predict = Predict(keyword=row['keyword'], channel=row['channel'])
    obj_predict.predict()
    task_ids = db.select('crawl_task', '*', f'keyword="{row["keyword"]}" and channel="youtube"')
    for task in task_ids:
        db.update_one('crawl_task', 'crawl_status', 'SF', 'task_id', task['task_id'])
import multiprocessing
import time

from predict_execution_blog import *
from db.almaden import Sql

# start time
start_time = time.time()

# multiprocessing case (200k count)
# parallelize the prediction jobs with a Pool
if __name__ == '__main__':
    process_list = []
    task_list = []
    db = Sql("datacast2")
    task_row = db.select(
        '''
        crawl_request AS cr
        JOIN crawl_request_task AS crt ON cr.request_id=crt.request_id
        JOIN request_batch AS rb ON rb.batch_id = cr.batch_id
        JOIN crawl_task AS ct ON crt.task_id=ct.task_id
        ''',
        'rb.batch_id as batch_id,cr.request_id as request_id,ct.task_id as task_id,cr.keyword as keyword,ct.n_crawl,ct.crawl_status as crawl_status,ct.channel as channel',
        'ct.crawl_status="GF" and ct.channel !="navershopping" and rb.batch_id=57 limit 6')

    ## pred_config
    obj_pred_config = Pred_config()

    ## predict
    # use a multiprocessing Pool
    for row in task_row:
        task_id = row['task_id']
        channel = row['channel']
class Data:
    def __init__(self, dbName='dalmaden'):
        self.db = Sql(dbName)
        self.df = pd.DataFrame()

    def _load_(self, channel, keyword, fromDate, toDate, tablename='cdata'):
        where_str = f"keyword='{keyword}' and channel='{channel}' and post_date between '{fromDate}' and '{toDate}'"
        ldf = self.db.select(tablename, "*", where_str, asDataFrame=True)
        return ldf

    def addData(self, channel, keyword, fromDate, toDate, tablename='cdata',
                unique=True, drop_by=['keyword', 'url']):
        nrows0 = self.df.shape[0]
        ldf = self._load_(channel, keyword, fromDate, toDate, tablename)
        print(ldf)
        nrowsldf = ldf.shape[0]
        self.df = self.df.append(ldf)
        addednRows = nrowsldf
        droppednRows = 0
        if unique:
            self.drop_duplicates(subset=drop_by)
            addednRows = self.df.shape[0] - nrows0
            droppednRows = nrowsldf - addednRows
        print(f'addData : added {addednRows} rows (dropped {droppednRows} rows)')

    def drop_duplicates(self, subset=None):
        self.df = self.df.drop_duplicates(subset=subset)

    def shape(self):
        return self.df.shape

    def get_df(self, *colnames, by_sentence=''):
        '''
        :param colnames: column names (str)
        :param by_sentence: name of the text column to split into sentences
        :return: DataFrame
        '''
        df_documents = self.df.loc[:, list(colnames)]
        if len(by_sentence) > 0:
            df_sentences = pd.DataFrame()
            nrows = df_documents.shape[0]
            for i in range(nrows):
                if i % 100 == 0:
                    print(f"loader : Getting Sentences {i}/{nrows}")
                row = df_documents.iloc[i]
                text = row[by_sentence]
                if len(text) > 0:
                    text = cleanse_text(text)
                    sentences = kss.split_sentences(text)  # texts over 300 characters come up often... needs checking
                    for s in sentences:
                        s = cleanse_sentence(s)
                        if len(s) > 0:
                            row_temp = row.copy()
                            row_temp[by_sentence] = s
                            df_sentences = df_sentences.append(row_temp)
                else:
                    continue
            print(f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences")
            return df_sentences
        else:
            return df_documents
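# Usage sketch: a minimal example of the Data loader above. Assumptions: the
# 'datacast2' database is reachable through db.almaden.Sql, the 'cdata' table
# has a 'text' column, and the keyword and date range are hypothetical placeholders.
if __name__ == '__main__':
    data = Data('datacast2')
    data.addData('naverblog', 'sample keyword', '2021-01-01', '2021-12-31')
    print(data.shape())
    df_sentences = data.get_df('keyword', 'channel', 'text', by_sentence='text')
    print(df_sentences.shape)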
class Predict:
    def __init__(self, keyword, channel, contents_id):
        self.engine = create_engine(
            "mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(
                'root', 'robot369', '1.221.75.76', 3306, 'datacast2'))
        self.db = Sql("datacast2")
        self.keyword = keyword
        self.channel = channel
        self.contents_id = contents_id

    def read(self):
        ## read the sentence rows for this contents_id/keyword as a pandas DataFrame
        print('sql:',
              "SELECT ct.channel,cc.contents_id,cs.text from crawl_task as ct "
              "join crawl_contents as cc on ct.task_id=cc.task_id "
              "JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
              "WHERE cc.contents_id='%s' and ct.keyword='%s'" % (self.contents_id, self.keyword))
        df_sentence_rows = pd.read_sql(
            "SELECT ct.keyword, ct.channel, cc.contents_id as contents_id, "
            "cs.sentence_id as sentence_id, cs.text as sentence "
            "from crawl_task as ct "
            "join crawl_contents as cc on ct.task_id=cc.task_id "
            "JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
            "WHERE cc.contents_id='%s' and ct.keyword='%s'" % (self.contents_id, self.keyword),
            self.engine)
        return df_sentence_rows

    def predict(self):
        df_sentence_data_rows = self.read()
        sid = SentimentIntensityAnalyzer()
        for idx in tqdm(df_sentence_data_rows.index, desc="sentence_analysis&db_update"):
            try:
                sentence_id = df_sentence_data_rows.at[idx, 'sentence_id']
                sentence = df_sentence_data_rows.at[idx, 'sentence']
                # strip Korean characters and lowercase before English POS tagging
                korean = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
                sentence = re.sub(korean, "", sentence)
                sentence = sentence.lower()
                nouns = [p[0] for p in pos_tag(word_tokenize(sentence), tagset='universal') if p[1] in ['NOUN']]
                nouns = list(filter(lambda x: (x not in stopwords) and all(stop not in x for stop in stop_list), nouns))
                nouns = json.dumps(nouns, ensure_ascii=False)
                verbs = [p[0] for p in pos_tag(word_tokenize(sentence), tagset='universal') if p[1] in ['VERB']]
                verbs = list(filter(lambda x: (x not in stopwords) and all(stop not in x for stop in stop_list), verbs))
                verbs = json.dumps(verbs, ensure_ascii=False)
                adjs = [p[0] for p in pos_tag(word_tokenize(sentence), tagset='universal') if p[1] in ['ADJ']]
                adjs = list(filter(lambda x: (x not in stopwords) and all(stop not in x for stop in stop_list), adjs))
                adjs = json.dumps(adjs, ensure_ascii=False)
                # VADER compound score >= 0 is stored as positive (1), otherwise negative (0)
                pos = sid.polarity_scores(sentence)
                pos = 1 if pos['compound'] >= 0 else 0
                self.db.update_multi_column(
                    "crawl_sentence",
                    update_dict={"nouns": nouns, "verbs": verbs, "adjs": adjs, "positiveness": float(pos)},
                    where_dict={"sentence_id": float(sentence_id)})
            except Exception as e:
                print(e)
                continue
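# Sketch of the polarity mapping used in Predict.predict() above: VADER's compound
# score lies in [-1, 1], and a score >= 0 is stored as positiveness=1, otherwise 0.
# Assumes the NLTK 'vader_lexicon' resource has been downloaded via
# nltk.download('vader_lexicon'); the example sentences are hypothetical.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
for s in ["the battery works great", "the battery died after a week"]:
    compound = sid.polarity_scores(s)['compound']
    positiveness = 1 if compound >= 0 else 0
    print(s, compound, positiveness)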