def train_insert_history_words():
	"""Classify each training word and record unseen ones in the history table.

	Reads rows of (word, company_id?) from train_words.csv.  For every word
	not yet known to the history table (check == 'nothing') it runs the
	classifier and inserts the result; already-known words (check 1 or 2)
	are simply skipped.
	"""
	with open('train_words.csv', 'r') as f:
		reader = csv.reader(f)
		for row in reader:
			check = Mysql.check_history_word(row[0])
			# BUG FIX: the original printed the undefined name `x`;
			# print the word being checked instead.
			print(row[0] + ':' + str(check))
			# NOTE: the original also tested `check != ''`, which is
			# always true when check == 'nothing'; dropped as redundant.
			if check == 'nothing':
				print('Classify the ' + row[0])
				cat = classify.word_classify(row[0])
				print('Classify finished')
				print('Insert "' + row[0] + '"" to hisotory_table')
				Mysql.insert_word_to_hisotory_table(row[0], row[1], cat)
				print('Insert "' + row[0] + '"" to hisotory_table finished')
			# check == 1 / check == 2 fell through to bare `continue`s at the
			# end of the loop body — dead code, removed.
Exemple #2
0
def auto_Firefox():
	"""Search each keyword from keywords.csv on Google (Firefox) and click
	related words classified as positive (check == 1).

	Unknown related words (check == 'nothing') are classified and inserted
	into the history table first.  Re-schedules itself via threading.Timer
	so the crawl repeats roughly every second after finishing.
	"""
	driver = webdriver.Firefox()
	driver.get("https://www.google.co.jp/")
	with open('keywords.csv', 'r') as f:
		reader = csv.reader(f)
		for row in reader:
			driver.find_element_by_id("lst-ib").clear() 
			driver.find_element_by_id("lst-ib").send_keys(row[0])
			driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
			driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			related_words = driver.find_elements_by_class_name("nVcaUb")
			# The page is refreshed by JS, so the element list must be
			# re-fetched on every iteration; only the count from the first
			# fetch is reused.  Ref: https://www.cnblogs.com/fengpingfan/p/4583325.html
			for i in range(len(related_words)):
				related_words02 = driver.find_elements_by_class_name("nVcaUb")
				rel_word = related_words02[i].text
				check = Mysql.check_history_word(rel_word, row[1])
				print(rel_word + ':' + str(check))
				if check == 'nothing':
					cat = classify.word_classify(rel_word, row[1])
					# BUG FIX: the original call had a stray double comma
					# (`rel_word,,row[1]`) — a syntax error.
					Mysql.insert_word_to_hisotory_table(rel_word, row[1], cat)
				if check == 2:
					continue
				if check == 1:
					related_words02[i].click()
					print(rel_word + 'clicked')
					time.sleep(2)
					# Must navigate back, otherwise we keep clicking related
					# words on the result page instead of the search page.
					driver.back()
			time.sleep(1)
	driver.quit()
	t = threading.Timer(1, auto_Firefox)
	t.start()
Exemple #3
0
def auto_chrome():
	"""Search each keyword from keywords.csv on Google (Chrome) and click
	autocomplete suggestions classified as positive (check == 1).

	Unknown suggestions (check == 'nothing') are classified and inserted
	into the history table first.  Re-schedules itself via threading.Timer.
	"""
	driver = webdriver.Chrome(r"C:\Users\kou-k\AppData\Local\driver\chromedriver.exe",desired_capabilities=d)
	driver.get("https://www.google.co.jp/")
	with open('keywords.csv', 'r') as f:
		reader = csv.reader(f)
		for row in reader:
			driver.find_element_by_id("lst-ib").clear() 
			driver.find_element_by_id("lst-ib").send_keys(row[0])
			driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
			time.sleep(2)
			driver.find_element_by_id("lst-ib").click()	
			related_words = driver.find_elements_by_class_name("sbqs_c")
			# The page is refreshed by JS, so the element list must be
			# re-fetched on every iteration; only the count from the first
			# fetch is reused.  Ref: https://www.cnblogs.com/fengpingfan/p/4583325.html
			for i in range(len(related_words)):
				related_words02 = driver.find_elements_by_class_name("sbqs_c")
				rel_word = related_words02[i].text
				check = Mysql.check_history_word(rel_word, row[1])
				print(rel_word + ':' + str(check))
				if check == 'nothing':
					cat = classify.word_classify(rel_word, row[1])
					# BUG FIX: the original call had a stray double comma
					# (`rel_word,,row[1]`) — a syntax error.
					Mysql.insert_word_to_hisotory_table(rel_word, row[1], cat)
				if check == 2:
					continue
				if check == 1:
					related_words02[i].click()
					print(rel_word + 'clicked')
					time.sleep(2)
					# Must navigate back, otherwise we keep clicking related
					# words on the result page instead of the search page.
					driver.back()
			time.sleep(1)
	driver.quit()
	t = threading.Timer(1, auto_chrome)
	t.start()
Exemple #4
0
def auto_chrome():
	"""Check each keyword from keywords.csv against the history table and
	search positive ones (check == 1) on Google via Chrome.

	Unknown keywords (check == 'nothing') are classified and inserted into
	the history table.  Re-schedules itself via threading.Timer.
	"""
	driver = webdriver.Chrome("C:/Users/kou-k/AppData/Local/driver/chromedriver.exe",desired_capabilities=d)
	driver.get("https://www.google.co.jp/")
	with open('keywords.csv', 'r') as f:
		reader = csv.reader(f)
		for row in reader:
			driver.find_element_by_id("lst-ib").clear()
			check = Mysql.check_history_word(row[0], row[1])
			# BUG FIX: the original printed the undefined name `x`;
			# print the keyword being checked instead.
			print(row[0] + ':' + str(check))
			if check == 'nothing':
				cat = classify.word_classify(row[0], row[1])
				Mysql.insert_word_to_hisotory_table(row[0], row[1], cat)
			if check == 2:
				continue
			if check == 1:
				driver.find_element_by_id("lst-ib").send_keys(row[0])
				driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
				print(driver.title)
			time.sleep(1)
	driver.quit()
	t = threading.Timer(1, auto_chrome)
	t.start()
Exemple #5
0
def auto_Firefox():
    """Check each keyword from positive-keywords.csv against the history table
    and search positive ones (check == 1) on Google via Firefox.

    A fresh Firefox instance is opened (and closed) per keyword; cookies are
    cleared before closing.  Re-schedules itself via threading.Timer.
    """
    with open('positive-keywords.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            driver = webdriver.Firefox()
            driver.get("https://www.google.co.jp/")
            driver.find_element_by_id("lst-ib").clear()
            check = Mysql.check_history_word(row[0], row[1])
            # BUG FIX: the original printed the undefined name `x`;
            # print the keyword being checked instead.
            print(row[0] + ':' + str(check))
            if check == 'nothing':
                cat = classify.word_classify(row[0], row[1])
                Mysql.insert_word_to_hisotory_table(row[0], row[1], cat)
            if check == 2: continue
            if check == 1:
                driver.find_element_by_id("lst-ib").send_keys(row[0])
                driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)
                print(driver.title)
            time.sleep(1)
            driver.delete_all_cookies()
            driver.close()
    t = threading.Timer(1, auto_Firefox)
    t.start()
Exemple #6
0
 def __init__(self):
     # DB access handle; note the module binds `Mysql` to an instance
     # elsewhere, so this presumably calls the Mysql class — TODO confirm.
     self.Mysql = Mysql()
     # Category codes used as classifier labels ("POSSITIVE" is the
     # original spelling; kept since other code may reference it).
     self.POSSITIVE = 1
     self.NEGATIVE = 2
Exemple #7
0
class NaiveBayes:
    """Naive Bayes sentiment classifier backed by word counts stored in MySQL.

    Labels are self.POSSITIVE (1, 'pos') and self.NEGATIVE (2, 'neg');
    all counts are fetched per company_id through the Mysql helper.
    """

    def __init__(self):
        # DB access handle used by every count/query method below.
        self.Mysql = Mysql()
        self.POSSITIVE = 1
        self.NEGATIVE = 2

    def get_words(self, company_id):
        # All known vocabulary words for this company.
        vocabularies = self.Mysql.get_vocabularise(company_id)
        return vocabularies

    def words_in_cat(self, category, company_id):
        # Words recorded under the given category.
        words_in_cat = self.Mysql.get_words_in_cat(category, company_id)
        return words_in_cat

    def word_count_in_cat(self, word, category, company_id):
        # Number of occurrences of `word` within the given category.
        word_count_in_cat = self.Mysql.get_word_in_cat(word, category,
                                                       company_id)
        return word_count_in_cat

    def all_cats_count(self, company_id):
        # Total count across all categories (denominator of p(cat)).
        all_cat_count = self.Mysql.get_all_cats_count(company_id)
        return all_cat_count

    def cat_count(self, category, company_id):
        # Count for one category (numerator of p(cat)).
        cat_count = self.Mysql.get_cat_count(category, company_id)
        return cat_count

    def word_count(self, word, company_id):
        # Company-wide occurrence count of `word`.
        word_count = self.Mysql.get_word_count(word, company_id)
        return word_count

    def word_in_cat_probability(self, word, category, company_id):
        # Compute p(word|cat) with Laplace smoothing (the +1 below).
        vocabularies = self.get_words(company_id)
        words_in_cat = self.words_in_cat(category, company_id)
        word_count = self.word_count_in_cat(word, category, company_id) + 1
        vocabulary_count = len(words_in_cat) + len(
            vocabularies)  # original note says the fetched array ends with an '' element to be removed — NOTE(review): no removal happens here; confirm
        return float(word_count) / float(vocabulary_count)

    def cat_probability(self, category, company_id):
        # Compute p(cat) = count(cat) / count(all categories).
        all_cats_count = self.all_cats_count(company_id)
        cat_count = self.cat_count(category, company_id)
        return float(cat_count) / float(all_cats_count)

    def word_probability(self, word, company_id):
        # Compute p(word) with Laplace smoothing applied.
        vocabularies = self.get_words(company_id)
        word_count = self.word_count(word, company_id) + 1
        return float(word_count) / float(
            len(vocabularies) *
            2)  # *2 stands in for len(vocabularies) + len(vocabularies) for the Laplace smoothing denominator

    def cat_if_document(self, document, category, company_id):
        # Compute log p(cat|document):
        # p(cat|document) = p(document|cat)*p(cat)/p(document)
        # = p(word1|cat)*p(word2|cat)*...p(wordn|cat)*p(cat)/(p(word1)*p(word2)*p(word2)*...*p(wordn))
        # Log-space sums avoid underflow from multiplying many small terms.
        cat_probability = math.log(self.cat_probability(category, company_id))

        words = self.Mysql.ja_tokenize(document)
        word_in_cat_probability = 0
        word_probability = 0
        for word in words:
            word_in_cat_probability += math.log(
                self.word_in_cat_probability(word, category, company_id))
            word_probability += math.log(
                self.word_probability(word, company_id))
        cat_if_document_probability = word_in_cat_probability + cat_probability - word_probability
        return cat_if_document_probability

    def document_classify(self, document, company_id):
        """Classify a document as 'pos' or 'neg', persist the result, return the label.

        NOTE(review): on an exact probability tie BOTH branches fire, so the
        label ends up 'neg' and document_cat_num becomes 3 — confirm intended.
        """
        document_possitive_p = self.cat_if_document(document, self.POSSITIVE,
                                                    company_id)
        document_negative_p = self.cat_if_document(document, self.NEGATIVE,
                                                   company_id)

        p_list = [document_possitive_p, document_negative_p]

        document_cat_num = 0
        if max(p_list) == document_possitive_p:
            document_cat = 'pos'
            document_cat_num += 1

        if max(p_list) == document_negative_p:
            document_cat = 'neg'
            document_cat_num += 2
        print(document)
        self.Mysql.insert_into_db(document, company_id, document_cat_num)
        return document_cat

    def word_classify(self, word, company_id):
        """Classify a keyword by scraping its Google results and majority-voting
        the per-snippet document classifications.

        Returns 'possitive', 'negative', or '' on a tie / no results.
        """
        html = requests.get('https://www.google.co.jp/search?q=' +
                            word).content
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
        dicriptions = soup.find_all("div", class_="g")
        z = []
        for dicription in dicriptions:
            z.append(dicription.get_text())
        document_classify = []
        for o in z:
            document_classify.append(self.document_classify(o, company_id))
            #print('category of this article: '+ self.document_classify(o))
        possitive_num = document_classify.count('pos')
        negative_num = document_classify.count('neg')
        word_cat = ''
        if possitive_num > negative_num:
            word_cat += 'possitive'
        if possitive_num < negative_num:
            word_cat += 'negative'
        return word_cat
# Load (keyword, company_id) rows from the training CSV.
# BUG FIX: the original appended the undefined name `x` and never
# initialized `search`; append the CSV row itself instead.
search = []
with open('train_words.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        search.append(row)
print(search)


#キーワードの検索結果をスクレイピング
def get_sentence(search):
    """Scrape Google result snippets for every (keyword, company_id) entry.

    Returns a list of [snippet_text, company_id] pairs, one per result
    block found on each keyword's search page.
    """
    # Build the list of [search_url, company_id] targets up front.
    targets = [['https://www.google.co.jp/search?q=' + entry[0], entry[1]]
               for entry in search]

    sentences = []
    for target in targets:
        page = requests.get(target[0]).content
        parsed = BeautifulSoup(page, 'html.parser', from_encoding='utf-8')
        # Each "div.g" block is one organic search result.
        for result in parsed.find_all("div", class_="g"):
            sentences.append([result.get_text(), target[1]])
    return sentences


# Scrape the search results for every loaded keyword and persist them.
print('Insert the words into database')
scraped_sentences = get_sentence(search)
database = Mysql()
database.fuhyou_insert_sentence(scraped_sentences)
print('Insert finished')
Exemple #9
0
import csv
import time
import threading
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from threading import Timer
from classify import NaiveBayes
from dicmysql import Mysql
# Chrome capabilities shared by the webdriver.Chrome calls below.
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = {'browser': 'ALL'}  # capture all browser console log levels
# Module-wide singletons used by the crawler functions.
classify = NaiveBayes()
# NOTE(review): this rebinding shadows the imported Mysql class with an
# instance — no further Mysql() construction is possible in this module.
Mysql = Mysql()


def auto_Firefox():
    with open('positive-keywords.csv', 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            driver = webdriver.Firefox()
            driver.get("https://www.google.co.jp/")
            driver.find_element_by_id("lst-ib").clear()
            check = Mysql.check_history_word(row[0], row[1])
            print(x + ':' + str(check))
            if check == 'nothing':
                cat = classify.word_classify(row[0], row[1])
                Mysql.insert_word_to_hisotory_table(row[0], row[1], cat)
            if check == 2: continue
            if check == 1:
                driver.find_element_by_id("lst-ib").send_keys(row[0])
                driver.find_element_by_id("lst-ib").send_keys(Keys.ENTER)