def train_insert_history_words():
    """Read training words from train_words.csv and, for each word not yet
    recorded, classify it and insert it into the history table.

    Each CSV row is assumed to be [word, company_id] — TODO confirm against
    the actual train_words.csv layout.
    """
    with open('train_words.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            check = Mysql.check_history_word(row[0])
            # BUG FIX: original printed undefined name `x` (NameError);
            # print the word being checked instead.
            print(row[0] + ':' + str(check))
            # NOTE: the original condition was `check == 'nothing' and
            # check != ''` — the second clause is always true when the
            # first is, so it was dropped.
            if check == 'nothing':
                print('Classify the ' + row[0])
                cat = classify.word_classify(row[0])
                print('Classify finished')
                print('Insert "' + row[0] + '"" to hisotory_table')
                Mysql.insert_word_to_hisotory_table(row[0], row[1], cat)
                print('Insert "' + row[0] + '"" to hisotory_table finished')
            # check values 1 and 2 apparently mean "already recorded";
            # nothing further to do for this row.
            if check == 2:
                continue
            if check == 1:
                continue
def auto_Firefox():
    """Search every keyword from keywords.csv on Google with Firefox, walk
    the related-word elements, record/classify unseen ones, and click the
    ones whose history check returns 1. Re-schedules itself via Timer.
    """
    driver = webdriver.Firefox()
    driver.get("https://www.google.co.jp/")
    with open('keywords.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            driver.find_element_by_id("lst-ib").clear()
            driver.find_element_by_id("lst-ib").send_keys(row[0])
            driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            related_words = driver.find_elements_by_class_name("nVcaUb")
            # Only the index is needed; the elements go stale after each
            # JS refresh, so they are re-fetched inside the loop.
            # Ref: https://www.cnblogs.com/fengpingfan/p/4583325.html
            for i, _ in enumerate(related_words):
                related_words02 = driver.find_elements_by_class_name("nVcaUb")
                rel_word = related_words02[i].text
                check = Mysql.check_history_word(rel_word, row[1])
                print(rel_word + ':' + str(check))
                if check == 'nothing':
                    cat = classify.word_classify(rel_word, row[1])
                    # BUG FIX: the original call had a stray double comma
                    # (`rel_word,,row[1]`) which is a SyntaxError.
                    Mysql.insert_word_to_hisotory_table(rel_word, row[1], cat)
                if check == 2:
                    continue
                if check == 1:
                    related_words02[i].click()
                    print(rel_word + 'clicked')
                    time.sleep(2)
                    # Must navigate back, otherwise we would keep clicking
                    # related words inside the page we just opened.
                    driver.back()
                    time.sleep(1)
    driver.quit()
    t = threading.Timer(1, auto_Firefox)
    t.start()
def auto_chrome():
    """Same flow as auto_Firefox but driven through Chrome: search each
    keyword from keywords.csv, walk the suggestion elements ("sbqs_c"),
    record/classify unseen words and click the ones marked 1.
    Re-schedules itself via Timer.
    """
    driver = webdriver.Chrome(r"C:\Users\kou-k\AppData\Local\driver\chromedriver.exe", desired_capabilities=d)
    driver.get("https://www.google.co.jp/")
    with open('keywords.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            driver.find_element_by_id("lst-ib").clear()
            driver.find_element_by_id("lst-ib").send_keys(row[0])
            driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
            time.sleep(2)
            driver.find_element_by_id("lst-ib").click()
            related_words = driver.find_elements_by_class_name("sbqs_c")
            # Only the index is needed; the elements go stale after each
            # JS refresh, so they are re-fetched inside the loop.
            # Ref: https://www.cnblogs.com/fengpingfan/p/4583325.html
            for i, _ in enumerate(related_words):
                related_words02 = driver.find_elements_by_class_name("sbqs_c")
                rel_word = related_words02[i].text
                check = Mysql.check_history_word(rel_word, row[1])
                print(rel_word + ':' + str(check))
                if check == 'nothing':
                    cat = classify.word_classify(rel_word, row[1])
                    # BUG FIX: the original call had a stray double comma
                    # (`rel_word,,row[1]`) which is a SyntaxError.
                    Mysql.insert_word_to_hisotory_table(rel_word, row[1], cat)
                if check == 2:
                    continue
                if check == 1:
                    related_words02[i].click()
                    print(rel_word + 'clicked')
                    time.sleep(2)
                    # Must navigate back, otherwise we would keep clicking
                    # related words inside the page we just opened.
                    driver.back()
                    time.sleep(1)
    driver.quit()
    t = threading.Timer(1, auto_chrome)
    t.start()
def auto_chrome():
    """Search each keyword from keywords.csv on Google with Chrome; classify
    and record unseen words, and actually run the search (printing the page
    title) for words whose history check returns 1. Re-schedules itself via
    Timer.

    NOTE(review): this redefines the auto_chrome from earlier in the file —
    only this later definition survives at import time; confirm intent.
    """
    driver = webdriver.Chrome("C:/Users/kou-k/AppData/Local/driver/chromedriver.exe", desired_capabilities=d)
    driver.get("https://www.google.co.jp/")
    with open('keywords.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            driver.find_element_by_id("lst-ib").clear()
            check = Mysql.check_history_word(row[0], row[1])
            # BUG FIX: original printed undefined name `x` (NameError);
            # print the keyword being checked instead.
            print(row[0] + ':' + str(check))
            if check == 'nothing':
                cat = classify.word_classify(row[0], row[1])
                Mysql.insert_word_to_hisotory_table(row[0], row[1], cat)
            if check == 2:
                continue
            if check == 1:
                driver.find_element_by_id("lst-ib").send_keys(row[0])
                driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
                print(driver.title)
                time.sleep(1)
    driver.quit()
    t = threading.Timer(1, auto_chrome)
    t.start()
def auto_Firefox():
    """For each row of positive-keywords.csv, open a fresh Firefox session,
    classify/record the word if unseen, run the search when the history
    check returns 1, then clear cookies and close the browser.
    Re-schedules itself via Timer.

    NOTE(review): this redefines the auto_Firefox from earlier in the file —
    only this later definition survives at import time; confirm intent.
    """
    with open('positive-keywords.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            # A new driver per keyword so each search starts cookie-clean.
            driver = webdriver.Firefox()
            driver.get("https://www.google.co.jp/")
            driver.find_element_by_id("lst-ib").clear()
            check = Mysql.check_history_word(row[0], row[1])
            # BUG FIX: original printed undefined name `x` (NameError);
            # print the keyword being checked instead.
            print(row[0] + ':' + str(check))
            if check == 'nothing':
                cat = classify.word_classify(row[0], row[1])
                Mysql.insert_word_to_hisotory_table(row[0], row[1], cat)
            if check == 2:
                continue
            if check == 1:
                driver.find_element_by_id("lst-ib").send_keys(row[0])
                driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
                print(driver.title)
                time.sleep(1)
            driver.delete_all_cookies()
            driver.close()
    t = threading.Timer(1, auto_Firefox)
    t.start()
def __init__(self):
    """Set up the database gateway and the two category codes."""
    # One Mysql gateway instance shared by all methods.
    self.Mysql = Mysql()
    # Category codes (spelling kept for compatibility with existing callers).
    self.POSSITIVE, self.NEGATIVE = 1, 2
class NaiveBayes:
    """Naive-Bayes word/document sentiment classifier backed by a Mysql
    gateway, scoped per company_id.

    Categories are the integer codes POSSITIVE (1) and NEGATIVE (2); all
    counts and vocabularies come from the database via self.Mysql.
    """

    def __init__(self):
        # Database gateway and the two category codes used throughout.
        self.Mysql = Mysql()
        self.POSSITIVE = 1
        self.NEGATIVE = 2

    def get_words(self, company_id):
        """Return the vocabulary for this company from the database."""
        vocabularies = self.Mysql.get_vocabularise(company_id)
        return vocabularies

    def words_in_cat(self, category, company_id):
        """Return the words recorded under `category` for this company."""
        words_in_cat = self.Mysql.get_words_in_cat(category, company_id)
        return words_in_cat

    def word_count_in_cat(self, word, category, company_id):
        """Return how often `word` occurs within `category`."""
        word_count_in_cat = self.Mysql.get_word_in_cat(word, category, company_id)
        return word_count_in_cat

    def all_cats_count(self, company_id):
        """Return the total document/observation count over all categories."""
        all_cat_count = self.Mysql.get_all_cats_count(company_id)
        return all_cat_count

    def cat_count(self, category, company_id):
        """Return the document/observation count for one category."""
        cat_count = self.Mysql.get_cat_count(category, company_id)
        return cat_count

    def word_count(self, word, company_id):
        """Return the overall occurrence count of `word`."""
        word_count = self.Mysql.get_word_count(word, company_id)
        return word_count

    def word_in_cat_probability(self, word, category, company_id) -> float:
        # Compute p(word|cat).
        # Laplace smoothing applied (the +1 below and the vocabulary term
        # in the denominator).
        vocabularies = self.get_words(company_id)
        words_in_cat = self.words_in_cat(category, company_id)
        word_count = self.word_count_in_cat(word, category, company_id) + 1
        vocabulary_count = len(words_in_cat) + len(
            vocabularies)  # NOTE (translated): the fetched array ends with an
        # '' element, which was meant to be removed — the code does not
        # actually remove it; TODO confirm.
        return float(word_count) / float(vocabulary_count)

    def cat_probability(self, category, company_id) -> float:
        # Compute p(cat) as category count over total count.
        all_cats_count = self.all_cats_count(company_id)
        cat_count = self.cat_count(category, company_id)
        return float(cat_count) / float(all_cats_count)

    def word_probability(self, word, company_id) -> float:
        # Compute p(word).
        # Laplace smoothing applied.
        vocabularies = self.get_words(company_id)
        word_count = self.word_count(word, company_id) + 1
        return float(word_count) / float(
            len(vocabularies) * 2)  # For Laplace smoothing,
        # len(vocabularies) + len(vocabularies) was written as * 2.

    def cat_if_document(self, document, category, company_id) -> float:
        # Compute p(cat|document) in log space:
        # p(cat|document) = p(document|cat)*p(cat)/p(document)
        #   = p(word1|cat)*...*p(wordn|cat)*p(cat)
        #     / (p(word1)*p(word2)*...*p(wordn))
        cat_probability = math.log(self.cat_probability(category, company_id))
        words = self.Mysql.ja_tokenize(document)
        word_in_cat_probability = 0
        word_probability = 0
        for word in words:
            word_in_cat_probability += math.log(
                self.word_in_cat_probability(word, category, company_id))
            word_probability += math.log(
                self.word_probability(word, company_id))
        # Log-domain equivalent of the product/quotient above.
        cat_if_document_probability = word_in_cat_probability + cat_probability - word_probability
        return cat_if_document_probability

    def document_classify(self, document, company_id) -> str:
        """Classify `document` as 'pos' or 'neg', persist the result, and
        return the label.

        NOTE(review): on an exact probability tie both `if` branches fire,
        yielding label 'neg' with document_cat_num == 3 — confirm whether
        that is intended.
        """
        document_possitive_p = self.cat_if_document(document, self.POSSITIVE,
                                                    company_id)
        document_negative_p = self.cat_if_document(document, self.NEGATIVE,
                                                   company_id)
        p_list = [document_possitive_p, document_negative_p]
        document_cat_num = 0
        if max(p_list) == document_possitive_p:
            document_cat = 'pos'
            document_cat_num += 1
        if max(p_list) == document_negative_p:
            document_cat = 'neg'
            document_cat_num += 2
        print(document)
        # Persist the classification alongside the document.
        self.Mysql.insert_into_db(document, company_id, document_cat_num)
        return document_cat

    def word_classify(self, word, company_id) -> str:
        """Classify a search keyword as 'possitive'/'negative' (or '' on a
        tie) by scraping Google results for it and majority-voting the
        per-result document classifications.
        """
        html = requests.get('https://www.google.co.jp/search?q=' +
                            word).content
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        dicriptions = soup.find_all("div", class_="g")
        z = []
        for dicription in dicriptions:
            z.append(dicription.get_text())
        document_classify = []
        for o in z:
            document_classify.append(self.document_classify(o, company_id))
            #print('この記事のカテゴリー:'+ self.document_classify(o))
        possitive_num = document_classify.count('pos')
        negative_num = document_classify.count('neg')
        word_cat = ''
        if possitive_num > negative_num:
            word_cat += 'possitive'
        if possitive_num < negative_num:
            word_cat += 'negative'
        return word_cat
with open('train_words.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        # BUG FIX: original appended undefined name `x` (NameError).
        # Append the CSV row ([keyword, company_id]) — get_sentence()
        # below indexes its elements as a[0]/a[1].
        # NOTE(review): `search` itself is assumed to be defined earlier
        # in the file, outside this view — confirm.
        search.append(row)
print(search)


# Scrape the Google search results for each keyword.
def get_sentence(search):
    """For each [keyword, company_id] pair, fetch the Google result page
    and return [[description_text, company_id], ...] scraped from the
    div.g result blocks.
    """
    x = []
    for a in search:
        x.append(['https://www.google.co.jp/search?q=' + a[0], a[1]])
    z = []
    for y in x:
        html = requests.get(y[0]).content
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        dicriptions = soup.find_all("div", class_="g")
        for dicription in dicriptions:
            z.append([dicription.get_text(), y[1]])
    return z


print('Insert the words into database')
sentences = get_sentence(search)
db = Mysql()
db.fuhyou_insert_sentence(sentences)
print('Insert finished')
import csv import time import threading from selenium import webdriver from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from selenium.webdriver.common.keys import Keys from threading import Timer from classify import NaiveBayes from dicmysql import Mysql d = DesiredCapabilities.CHROME d['loggingPrefs'] = {'browser': 'ALL'} classify = NaiveBayes() Mysql = Mysql() def auto_Firefox(): with open('positive-keywords.csv', 'r') as f: reader = csv.reader(f) for row in reader: driver = webdriver.Firefox() driver.get("https://www.google.co.jp/") driver.find_element_by_id("lst-ib").clear() check = Mysql.check_history_word(row[0], row[1]) print(x + ':' + str(check)) if check == 'nothing': cat = classify.word_classify(row[0], row[1]) Mysql.insert_word_to_hisotory_table(row[0], row[1], cat) if check == 2: continue if check == 1: driver.find_element_by_id("lst-ib").send_keys(row[0]) driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)