def get_sentimental(model_name_input):
    """Aggregate sentiment scores for one model and print a summary.

    Sums positive SENTIMENTAL values into ``good`` and negative ones into
    ``bad`` across every reply table referenced by MEDIA_INFO for the
    model, then prints: model name, good total, bad total, video count,
    reply count.

    NOTE(review): queries are built with str.format — acceptable for
    trusted model names from the DB, unsafe for untrusted input.
    """
    model_name = model_name_input
    dbc = DBcontroller()
    good = 0
    bad = 0
    reply_ref_tables = dbc.execQuery(
        "select REPLY_REF_TABLE from MEDIA_INFO where MODEL_NAME='{}'".format(
            model_name))
    for table_row in reply_ref_tables:
        reply_table = table_row[0]
        sentimental_array = dbc.execQuery(
            "select SENTIMENTAL from {}".format(reply_table))
        for score_row in sentimental_array:
            sentimental = score_row[0]
            if sentimental is None:  # comment not scored yet: skip
                continue
            elif sentimental < 0:
                bad += sentimental
            elif sentimental > 0:
                good += sentimental
    model_infos = dbc.execQuery(
        "select MODEL_NAME, COUNT(MODEL_NAME), SUM(NUM_OF_REPLY) from MEDIA_INFO where MODEL_NAME='{}' GROUP BY MODEL_NAME"
        .format(model_name))
    num_of_video = model_infos[0][1]
    num_of_reply = model_infos[0][2]
    print(model_name, good, bad, num_of_video, num_of_reply)
def comment_proto():
    """Dump every comment of the hard-coded model '아이폰 XS' to a.txt.

    Writes each video's comments followed by a separator line, then
    prints how many comments from how many videos were saved.
    """
    modelname = '아이폰 XS'
    output = 'a.txt'
    dbc = DBcontroller()
    result = dbc.execQuery(
        "select REPLY_REF_TABLE from MEDIA_INFO where MODEL_NAME='{}'".format(
            modelname))
    video_count = len(result)
    commend_count = 0
    # `with` guarantees the file is closed even if a query below raises
    # (the original leaked the handle on any mid-loop exception).
    with open("./" + output, "w", encoding='utf-8') as f:
        for val in result:
            now = dbc.execQuery("select CONTENT from {}".format(val[0]))
            commend_count += len(now)
            for b in now:
                if b:  # skip empty rows
                    f.write(b[0])
            # separator between videos
            f.write(
                "\n============================================================================================\n"
            )
    print("모델 : ", modelname)
    print(video_count, "개의 동영상의", commend_count, "개의 댓글을 ", output, "에 저장하였음")
def whole_comment_to_txt_file():
    """Write every comment from every reply table into result.txt.

    One comment per line (embedded newlines flattened to spaces); prints
    the total number of comments written.
    """
    dbc = DBcontroller()
    result = dbc.execQuery("select REPLY_REF_TABLE from MEDIA_INFO")
    comment_table_list = [row[0] for row in result]
    count_ = 0
    # `with` closes the file even if a query raises mid-run.
    with open("result.txt", 'w', encoding='utf-8') as f:
        for table in comment_table_list:
            rows = dbc.execQuery("select CONTENT from {}".format(table))
            for row in rows:
                # flatten newlines so the output stays one comment per line
                target_comment = row[0].replace("\n", " ")
                f.write(str(target_comment))
                f.write("\n")
                # was `count_ = len(a) + count_`: that counted the row's
                # column width (always 1), not the rows themselves.
                count_ += 1
    print(count_)
def insert_sentimental_summery():
    """Create per-model GOOD/BAD sentiment tables and fill them.

    For each row of MODEL_INFO: record the derived table names back into
    MODEL_INFO, create both tables, then copy strongly-scored replies
    (SENTIMENTAL > 3 into GOOD, < -3 into BAD) out of every reply table
    belonging to that model.
    """
    dbc = DBcontroller()
    for model_name, model_code in dbc.execQuery(
            "select MODEL_NAME, MODEL_CODE from MODEL_INFO"):
        good_table_name = model_code + "_SENTIMENTAL_GOOD"
        bad_table_name = model_code + "_SENTIMENTAL_BAD"
        # register the summary-table names on the model row
        dbc.execQuery(
            "update MODEL_INFO set GOOD='{}' where MODEL_NAME='{}'".format(
                good_table_name, model_name))
        dbc.execQuery(
            "update MODEL_INFO set BAD='{}' where MODEL_NAME='{}'".format(
                bad_table_name, model_name))
        reply_ref_tables = dbc.execQuery(
            "select REPLY_REF_TABLE from MEDIA_INFO where MODEL_NAME='{}'".
            format(model_name))
        make_sentimental_table(good_table_name, dbc)
        make_sentimental_table(bad_table_name, dbc)
        for source_row in reply_ref_tables:
            source_table = source_row[0]
            dbc.execQuery(
                "insert into {} (select * from {} where SENTIMENTAL > 3)".
                format(good_table_name, source_table))
            dbc.execQuery(
                "insert into {} (select * from {} where SENTIMENTAL < -3)".
                format(bad_table_name, source_table))
def get_comment_by_model(file=False):
    """Collect all comments grouped by model.

    Returns a list of ``(model_name, [comment, ...])`` pairs.  When
    ``file`` is True, also writes each model's comments to
    ``<model_name>.txt`` with a separator between comments.

    Fixes vs. the original:
    - the result is a list instead of a ``zip`` iterator; the original
      consumed the iterator while writing files, so with ``file=True``
      the caller received an already-exhausted iterator,
    - output files are opened with ``with`` (they were never closed).
    """
    dbc = DBcontroller()
    models = dbc.execQuery("select distinct MODEL_NAME from MEDIA_INFO")
    model_list = [row[0] for row in models]
    model_comment_array = [[] for _ in model_list]
    for idx, model_name in enumerate(model_list):
        table_names = dbc.execQuery(
            "select REPLY_REF_TABLE from MEDIA_INFO where MODEL_NAME='{}'".
            format(model_name))
        for table_row in table_names:
            comments = dbc.execQuery(
                "select CONTENT from {}".format(table_row[0]))
            for comment in comments:
                model_comment_array[idx].append(comment[0])
    model_n_comment = list(zip(model_list, model_comment_array))
    if file:
        for model_name, comments in model_n_comment:
            print(model_name)
            with open(model_name + ".txt", 'w', encoding='utf-8') as f:
                for comment in comments:
                    f.write(comment)
                    f.write("\n==============\n")
    return model_n_comment
def __init__(self):
    """Set up the DB connection, the morphological analyzer, and the
    index-aligned display-name / DB-code product lists."""
    self.dbc = DBcontroller()
    self.mecab = analysis()
    # (display name, DB table-prefix code) pairs, split into the two
    # parallel lists the rest of the class indexes by position
    pairs = [
        ("구글홈", "go_s_home"),
        ("아이폰 XS", "ap_p_ipxs"),
        ("갤럭시 S9", "ss_p_s9"),
        ("엘지 G7", "lg_p_g7"),
        ("엘지 그램 15 2018", "lg_n_gram15"),
        ("삼성 노트북 9 always", "ss_n_alwy9"),
        ("갤럭시탭 S4", "ss_t_galtap4"),
        ("아이패드 6세대", "ap_t_ipd6"),
        ("아이패드 프로 3세대", "ap_t_pro3"),
    ]
    self.product_name = [name for name, _ in pairs]
    self.product_name_DB_version = [code for _, code in pairs]
class log:
    """Daily row-count logger.

    Records the total number of rows across the ``kbrs_db`` schema into
    the LOG_INFO table, one row per crawl date.
    """

    def __init__(self):
        self.CRAWLED_DATE = datetime.datetime.today().strftime("%Y-%m-%d")
        self.dbc = DBcontroller()
        self.log_table_name = "LOG_INFO"
        # sums TABLE_ROWS over every table in the kbrs_db schema
        self.WHOLE_DATA_COUNT_QUERY = "SELECT SUM(TABLE_ROWS) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA='kbrs_db'"

    def count_log(self):
        """Upsert today's total row count into LOG_INFO.

        Inserts a new (count, date) row when today has no entry yet,
        otherwise updates the existing row's count.
        """
        result = self.dbc.execQuery(self.WHOLE_DATA_COUNT_QUERY)
        WHOLE_DATA_COUNT = result[0][0]
        check_select_Query = "select * from {} where CRAWLED_DATE='{}'".format(
            self.log_table_name, self.CRAWLED_DATE)
        check_result = self.dbc.execQuery(check_select_Query)
        if not check_result:  # no row for today yet -> insert
            insert_Query = "INSERT INTO {} values({}, '{}')".format(
                self.log_table_name, WHOLE_DATA_COUNT, self.CRAWLED_DATE)
            self.dbc.execQuery(insert_Query)
        else:  # today already logged -> overwrite the count
            update_Query = "update {} set WHOLE_DATA_COUNT={} where CRAWLED_DATE='{}'".format(
                self.log_table_name, WHOLE_DATA_COUNT, self.CRAWLED_DATE)
            self.dbc.execQuery(update_Query)
from crawling import crawling
from DBcontroller import DBcontroller
import datetime

if __name__ == '__main__':
    # Crawl videos/comments for every model registered in MODEL_INFO.
    CRAWLED_DATE = datetime.datetime.today().strftime("%Y-%m-%d")
    dbc = DBcontroller()
    model_name = dbc.execQuery("select MODEL_NAME from MODEL_INFO")
    filter_word = dbc.execQuery("select FILTER_WORD from MODEL_INFO")
    #filter_word = [["XS"],["S9"],["G7"],["그램"],["ALWAYS","올웨이즈"],["S4"],["6세대"],["프로","3세대"]]
    MODEL = zip(model_name,filter_word)
    for MODEL_NAME, FILT in MODEL:
        # FILTER_WORD may hold several comma-separated keywords in one cell
        if ',' in FILT[0]:
            FILT = FILT[0].split(',')
        else:
            FILT = [FILT[0]]
        # start crawling with the product name as the search word
        cl = crawling(word=MODEL_NAME[0],filt=FILT)
        url = cl.geturl()
        size = len(url)
        count_num = 1
        for VIDEO_URL in url:
            print(VIDEO_URL)
            print("%d/%d 번째 동영상 크롤링 중" % (count_num,size))
            count_num+=1
            confirm = cl.getvideo(VIDEO_URL)
            # negative return from getvideo: video unusable, skip it
            # NOTE(review): this chunk may be truncated here — any
            # post-crawl handling would lie beyond this view.
            if confirm < 0:
                continue
from DBcontroller import DBcontroller
from KnuSentiLex.knusl import KnuSL
from hanspell import spell_checker
from morpheme import analysis
import re

if __name__ == '__main__':
    # Sentiment-scoring pass over every comment of every model.
    # The pattern matches runs of characters OUTSIDE this ASCII
    # alnum/symbol set; findall() below therefore extracts those runs
    # (presumably keeping Korean text and whitespace — TODO confirm).
    nscharacter = re.compile('[^a-zA-Z0-9`~@#$%^&*()-=_+{}\[\],/<>;\'":|\\\]+')
    corpos = analysis()
    ksl = KnuSL
    dbc = DBcontroller()
    product_name = dbc.execQuery("select DISTINCT MODEL_NAME from MEDIA_INFO")
    for product in product_name:
        COMMENT_TABLE = dbc.execQuery(
            "select REPLY_REF_TABLE from MEDIA_INFO where MODEL_NAME = '{}'".
            format(product[0]))
        for TABLE in COMMENT_TABLE:
            COMMENT = dbc.execQuery("select CONTENT from {}".format(TABLE[0]))
            #COMMENT = dbc.execQuery("select CONTENT from {} where SENTIMENTAL='NULL'".format(TABLE[0]))
            for TEXT in COMMENT:
                score = 0
                neg = 0
                pos = 0
                # escape backslashes/quotes so the text survives later SQL quoting
                REPLACE_TEXT = TEXT[0].replace('\\', '\\\\').replace('"', '\\"')
                try:
                    # spell-correct first, then strip the ASCII symbol noise
                    check = spell_checker.check(REPLACE_TEXT)
                    reduction = "".join(nscharacter.findall(check.checked))
                    #words = corpos.mecabpos(reduction)
                    # NOTE(review): chunk ends inside this try block —
                    # the matching except clause lies beyond this view.
from DBcontroller import DBcontroller
from analysis import analysis
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time

dbc = DBcontroller()


class demo:
    """Headless-Chrome scraping prototype for a single video page URL."""

    def __init__(self, url):
        # target URL and headless Chrome options (Korean locale, fixed window)
        self.url = url
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')
        self.options.add_argument('lang=ko_KR')
        self.options.add_argument('--window-size=1920,1080')

    def test(self):
        # NOTE(review): __button is defined beyond this view — presumably
        # clicks the element at the given XPath; confirm in the full file.
        driver = webdriver.Chrome('./chromedriver', chrome_options=self.options)
        driver.get(self.url)
        time.sleep(2)  # crude wait for the page to load
        driver.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        time.sleep(2)
        # expand the collapsed description ("more" button)
        self.__button(driver, "//*[@id='more']/yt-formatted-string")
        # parse the page after the scroll updated the DOM
        video_html = BeautifulSoup(driver.page_source, 'html.parser')
''' self.dispatchid = dispatchid self.typename = str(typename) self.stage = stage self.name = str(name) self.char1id = str(char1id) self.char2id = str(char2id) self.char3id = str(char3id) self.char4id = str(char4id) #connection = mysql.connector.connect( #host="localhost", #user="******", #password="******", port=3306, database="danmemo") db = DBcontroller("localhost", "root", "danmemo", "3306", "danmemo") dispatch_dict = dict() with open('dispatchQuest/dispatch.txt', 'r') as f: line = f.readline() while (line): stage = None split_list = line.split(" - ") split_list2 = split_list[1].split(":") #print(split_list) temp = split_list[0] if ("(" in temp): typename = temp[:temp.find('(')] stage = temp[temp.find('('):]
def __init__(self):
    """Create the DB controller and the analysis helper this class uses."""
    self.dbc = DBcontroller()       # database access handle
    self.analysis = analysis()      # text-analysis helper
from DBcontroller import DBcontroller
from analysis import analysis
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time

dbc = DBcontroller()


class demo:
    """Headless-Chrome scraping prototype for a single video page URL."""

    def __init__(self, url):
        # target URL and headless Chrome options (Korean locale, fixed window)
        self.url = url
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')
        self.options.add_argument('lang=ko_KR')
        self.options.add_argument('--window-size=1920,1080')

    def test(self):
        # NOTE(review): __button and __immutabilityvideoinformation are
        # defined beyond this view — behavior inferred from names only.
        driver = webdriver.Chrome('./chromedriver', chrome_options=self.options)
        driver.get(self.url)
        time.sleep(2)  # crude wait for the page to load
        driver.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        time.sleep(2)
        # expand the collapsed description ("more" button)
        self.__button(driver, "//*[@id='more']/yt-formatted-string")
        # parse the page after the scroll updated the DOM
        self.video_html = BeautifulSoup(driver.page_source, 'html.parser')
        video_category = self.__immutabilityvideoinformation()
        driver.quit()
import requests
from DBcontroller import DBcontroller
from bs4 import BeautifulSoup
import datetime

if __name__ == '__main__':
    # Query Danawa's product-search AJAX endpoint for every known model.
    dbc = DBcontroller()
    CRAWLED_DATE = datetime.datetime.today().strftime("%Y-%m-%d")
    model_name = dbc.execQuery("select MODEL_NAME from MODEL_INFO")
    for idx in range(len(model_name)):
        headers = {
            "Referer":
            "http://search.danawa.com/dsearch.php?query=" +
            str(model_name[idx][0].encode()),
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
        }
        data = {"query": model_name[idx][0]}
        check_name_value = 0
        check_cost_value = False
        res = requests.post(
            "http://search.danawa.com/ajax/getProductList.ajax.php",
            headers=headers,
            data=data)
        soup = BeautifulSoup(res.text, "html.parser")
        products = soup.select('ul.product_list > li.prod_item ')
        for pidx in range(len(products)):
            # NOTE(review): chunk ends here — the per-product parsing
            # body lies beyond this view.
class DBnlp_saver:
    """Extracts nouns from crawled comments and maintains per-model
    word-frequency tables in the database."""

    def __init__(self):
        self.dbc = DBcontroller()   # database access handle
        self.mecab = analysis()     # mecab-based morphological analyzer
        # self.mecab.getnouns(comment)
        # display names and their DB table-name prefixes, index-aligned
        self.product_name = [
            "구글홈", "아이폰 XS", "갤럭시 S9", "엘지 G7", "엘지 그램 15 2018",
            "삼성 노트북 9 always", "갤럭시탭 S4", "아이패드 6세대", "아이패드 프로 3세대"
        ]
        self.product_name_DB_version = [
            "go_s_home", "ap_p_ipxs", "ss_p_s9", "lg_p_g7", "lg_n_gram15",
            "ss_n_alwy9", "ss_t_galtap4", "ap_t_ipd6", "ap_t_pro3"
        ]

    def comment_to_result_of_nlp(self):
        """For every model: ensure its frequency table exists, then count
        every noun of every comment into that table."""
        model_names = self.get_model_names()
        for model_name in model_names:
            bool_exist_table = self.boolean_exist_table(model_name)
            if (bool_exist_table):
                pass
            else:
                self.make_frequency_table_by_model(model_name)
            reply_table_names = self.get_table_name_by_model(model_name)
            for reply_table_name in reply_table_names:
                replies = self.dbc.execQuery(
                    "select CONTENT, ID from {}".format(str(reply_table_name)))
                for idx in range(len(replies)):
                    #merge_nouns_by_comma = ""
                    reply = replies[idx][0]
                    ID = replies[idx][1]
                    # noun extraction via the mecab analyzer
                    nouns = self.mecab.mecabnouns(str(reply))
                    if (len(nouns) == 0):
                        pass
                    else:
                        for noun in nouns:
                            self.register_frequency_word_in_table(
                                model_name=model_name, word=noun)
                            #merge_nouns_by_comma = merge_nouns_by_comma + noun + ", "
                        #merge_nouns_by_comma = merge_nouns_by_comma[:-2]
                        #update_reply_table = "update {0} set NOUNS_FROM_COMMENT='{1}' where ID={2}"
                        #update_query = update_reply_table.format(reply_table_name, merge_nouns_by_comma, ID)
                        #self.dbc.execQuery(update_query)

    def add_colmn_nlp_result_REPLY_REF_TABLE(self):
        """Add a NOUNS_FROM_COMMENT column to every reply table."""
        col_info = "NOUNS_FROM_COMMENT varchar(1023) NOT NULL"
        model_names = self.get_model_names()
        idx = 0
        for model_name in model_names:
            reply_table_names = self.get_table_name_by_model(model_name)
            for table_name in reply_table_names:
                print("alter table {0} add {1}".format(table_name, col_info))
                self.dbc.execQuery("alter table {0} add {1}".format(
                    table_name, col_info))
                print("alter table {0} add {1}".format(table_name, col_info))
                idx = idx + 1
        # NOTE(review): indentation lost in source; assumed this summary
        # print sits after both loops — confirm against the original file.
        print("add_colmn_complete : ", idx)

    ## Below are the internal helper methods used by the methods above.
def get_model_names(self):
    """Return the list of distinct model names found in MEDIA_INFO."""
    model_names = []
    model_name_array = self.dbc.execQuery(
        "select distinct MODEL_NAME from MEDIA_INFO")
    for idx in range(len(model_name_array)):
        model_names.append(model_name_array[idx][0])
    return model_names

def get_table_name_by_model(self, model_name=None):
    """Return all REPLY_REF_TABLE names, optionally restricted to one
    model (None means every model)."""
    reply_tables = []
    if model_name == None:
        reply_table_array = self.dbc.execQuery(
            "select REPLY_REF_TABLE from MEDIA_INFO")
    else:
        reply_table_array = self.dbc.execQuery(
            "select REPLY_REF_TABLE from MEDIA_INFO where MODEL_NAME='{}'".
            format(model_name))
    for idx in range(len(reply_table_array)):
        reply_tables.append(reply_table_array[idx][0])
    return reply_tables

def make_frequency_table_by_model(self, model_name):
    """Create the (WORD, FREQUENCY) table for one model."""
    table_name = self.get_table_name(model_name)
    schema_create_query = """ CREATE TABLE {}( WORD varchar(63) NOT NULL, FREQUENCY int NOT NULL, PRIMARY KEY (WORD) ) """
    schema = schema_create_query.format(table_name)
    self.dbc.execQuery(schema)

def boolean_exist_table(self, model_name):
    """Report whether the model's frequency table 'exists', probed by
    selecting from it.
    NOTE(review): an existing-but-empty table is also reported False —
    callers then re-issue CREATE TABLE; confirm this is intended."""
    table_name = self.get_table_name(model_name)
    result = self.dbc.execQuery("select * from {}".format(table_name))
    if (len(result) == 0):
        return False
    else:
        return True

def get_table_name(self, model_name):
    """Map a display model name to its '<code>_FREQUENCY' table name."""
    model_code = ""
    for idx in range(len(self.product_name)):
        if (model_name == self.product_name[idx]):
            model_code = self.product_name_DB_version[idx]
    table_name = str(model_code) + "_FREQUENCY"
    return table_name

def register_frequency_word_in_table(self, model_name, word):
    """Insert the word with count 1 on first sight, else increment it."""
    table_name = self.get_table_name(model_name)
    result = self.dbc.execQuery(
        "select FREQUENCY from {} where WORD='{}'".format(
            table_name, word))
    if (len(result) == 0):
        self.insert_noun_frequency_table(model_name, word)
    else:
        FREQUENCY = result[0][0]
        self.update_noun_frequency_table(model_name, word, FREQUENCY)

def insert_noun_frequency_table(self, model_name, word):
    """First occurrence of the word: insert (word, 1)."""
    table_name = self.get_table_name(model_name)
    self.dbc.execQuery("insert into {0} values('{1}', {2})".format(
        table_name, word, 1))

def update_noun_frequency_table(self, model_name, word, FREQUENCY):
    """Word already present: bump its stored frequency by one."""
    table_name = self.get_table_name(model_name)
    new_FREQUENCY = FREQUENCY + 1
    self.dbc.execQuery(
        "update {0} set FREQUENCY={1} where WORD='{2}'".format(
            table_name, new_FREQUENCY, word))
return_reply_table_name, content, CRAWLED_DATE, author) print("complete:insert case") elif len(result_list) == 7: print("update table") VIEWS, LIKES, HATES, NUM_OF_SUBSCIBER, VIDEO_TITLE, NUM_OF_REPLY, COMMENT = result_list before_comment_count = dbc.get_num_of_reply_by_url(VIDEO_URL) after_comment_count = NUM_OF_REPLY - before_comment_count Reply_ref_table_name = dbc.Update_video_in_MEDIA_INFO_return_reply_table( VIDEO_URL, VIDEO_TITLE, NUM_OF_REPLY, VIEWS, CRAWLED_DATE, LIKES, HATES, NUM_OF_SUBSCIBER) for author, content in reversed(COMMENT[:after_comment_count]): dbc.Insert_reply_info_by_table_name( Reply_ref_table_name, content, CRAWLED_DATE, author) print("complete2:update case") except: print("동영상 또는 댓글이 없습니다.") continue if __name__ == '__main__': dbc = DBcontroller() CRAWLED_DATE, MODEL = ready_date() for MODEL_NAME, FILT in MODEL: if ',' in FILT[0]: FILT = FILT[0].split(',') else: FILT = [FILT[0]] start_crawling(MODEL_NAME[0], FILT)
def __init__(self):
    """Prepare the logger: DB handle, target table name, today's date
    stamp, and the schema-wide row-count query."""
    self.dbc = DBcontroller()
    self.log_table_name = "LOG_INFO"
    self.CRAWLED_DATE = datetime.datetime.today().strftime("%Y-%m-%d")
    # sums TABLE_ROWS over every table in the kbrs_db schema
    self.WHOLE_DATA_COUNT_QUERY = "SELECT SUM(TABLE_ROWS) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA='kbrs_db'"
from DBcontroller import DBcontroller
from hanspell import spell_checker
from morpheme import analysis
import re

if __name__ == '__main__':
    # One-off spell-check / morpheme test run over one reply table.
    dbc = DBcontroller()
    # matches runs of characters OUTSIDE this ASCII alnum/symbol set;
    # findall() extracts those runs (presumably keeping Korean text and
    # whitespace — TODO confirm)
    nscharacter = re.compile(
        '[^a-zA-Z0-9`~!@#$%^&*()-=_+{}\[\],./<>?;\'":|\\\]+')
    corpos = analysis()
    Querys = dbc.execQuery("select CONTENT from ap_p_ipxs_870")
    Result_list = []
    for tmp in Querys:
        Result_list.append(tmp[0])
    for result in Result_list:
        # escape backslashes and double quotes for later SQL quoting
        REPLACE_TEXT = result.replace('\\', '\\\\').replace('"', '\\"')
        try:
            # spell-correct first, then strip the ASCII symbol noise
            check = spell_checker.check(REPLACE_TEXT)
            reduction = "".join(nscharacter.findall(check.checked))
            words = corpos.mecabpos(reduction)
        except:
            # spell checker failed: fall back to the raw (escaped) text
            reduction = "".join(nscharacter.findall(REPLACE_TEXT))
            words = corpos.mecabpos(reduction)
        print(words)
    ''' CONTENT = dbc.execQuery("select CONTENT FROM {} where CONTENT like '%보겸%'".format(result)) for text in CONTENT: print(text)