コード例 #1
0
    def setUp(self):
        """Load the fixture HTML files and build the WebPage objects used by the tests."""
        # FIX: use context managers so the fixture files are closed even if
        # read() raises (the originals leaked the handles on error).
        with open('test_support/nanapi.html', encoding='utf-8') as f:
            nanapi_html = f.read()
        self.nanapi_article_page = WebPage()
        self.nanapi_article_page.html_body = nanapi_html
        self.nanapi_article_page.url = 'http://nanapi.jp'

        with open('test_support/nanapi_hay_fever.html', encoding='utf-8') as f:
            nanapi_hay_fever_html = f.read()
        self.nanapi_hay_fever_page = WebPage('http://nanapi.jp')
        self.nanapi_hay_fever_page.html_body = nanapi_hay_fever_html

        with open('test_support/gow.html', encoding='utf-8') as f:
            gow_html = f.read()
        self.gow_marriage_page = WebPage()
        self.gow_marriage_page.html_body = gow_html
        self.gow_marriage_page.url = 'http://magazine.gow.asia/love/column_details.php?column_uid=00000082'

        with open('test_support/kanemotilevel.html', encoding='utf-8') as f:
            kanemoti_html = f.read()
        self.kanemoti_page = WebPage()
        self.kanemoti_page.html_body = kanemoti_html
コード例 #2
0
    def test_find_pages(self):
        """SearchEngine keeps every material page and tallies/sorts related action words."""
        specs = [
            ('http://tradein.nissan.co.jp/', '自動車の下取りと売却',
             '自動車には下取りをする方法がけっこうある。'),
            ('http://www.link-nexus.com/', '自動車の下取りと販売',
             'あばばばばば'),
            ('http://toyota.jp/service/tradein/dc/top', '下取り参考価格情報',
             '下取りと販売ですよプロデューサーさん'),
        ]
        pages = []
        for url, title, snippet in specs:
            page = WebPage(url)
            page.title = title
            page.snippet = snippet
            pages.append(page)

        search_engine = SearchEngine()
        search_engine.material_pages = pages
        search_engine.hint_word = '自動車'
        search_engine.action_word = '下取り'
        search_engine.find_pages_including_related_words()
        self.assertEqual(search_engine.result_pages[0], pages[0])
        self.assertEqual(search_engine.result_pages[1], pages[1])
        self.assertEqual(search_engine.result_pages[2], pages[2])

        search_engine.count_action_words()
        self.assertEqual(search_engine.action_words_count, {'販売': 2, '売却': 1})

        search_engine.sort_action_words_count()
        self.assertEqual(search_engine.sorted_action_words,
                         [{'word': '販売', 'count': 2},
                          {'word': '売却', 'count': 1}])
コード例 #3
0
 def setUp(self):
     """Build a small task graph from three pages sharing the query '職業 質問する'."""
     specs = [
         ('http://aaa.com', '医師に質問してください。'),
         ('http://bbb.com', '看護師に質問してください。'),
         ('http://ccc.com', '理学療法士に質問してください。'),
     ]
     pages = []
     for url, text in specs:
         page = WebPage(url=url, query='職業 質問する')
         page.text = text
         pages.append(page)
     self.graph = self.build_graph(pages)
コード例 #4
0
 def test_instance_of_task_clusters_exclude_part_of(self):
     """Tasks already grouped as part-of must not appear in instance-of clusters."""
     repeated = ('ヨドバシカメラに行く必要があります。お金を払ってください。'
                 'ヨドバシカメラに行く必要があります。お金を払ってください。')
     specs = [
         ('somewhere', repeated),
         ('elsewhere', '神社にお参りしてください。'),
         ('anywhere', '神社にお参りしてください。'),
         ('where', 'お金を払いましょう'),
     ]
     pages = []
     for url, text in specs:
         page = WebPage(url=url, query='チョコレート 食べる')
         page.text = text
         pages.append(page)
     graph = self.build_graph(pages)
     answerer = TaskGraphFirstAnswerer(graph=graph, query_task='チョコレート_食べる')
     answerer.set_result_tasks()
     self.assertEqual(answerer.instance_of_task_clusters, [{'神社_お参りする'}])
コード例 #5
0
def start_check():
    """Log in to the UK visa site, navigate to the appointment-scheduling
    form, and e-mail the first available appointment slots (if any).

    All failures are reported to stdout; the function never raises.
    """
    try:
        # 1. Open login page
        login_page = WebPage(url="%s%s" % (UKVISA_BASE_URL, LOGIN_PATH),
                             handler=s)

        login_form_data = get_login_form_data(login_page.content())

        # 2. Post login form and login :)
        login_form = create_login_form(login_form_data)
        res = login_form.submit().response()

        # 3. Open book appointment page
        book_appointment_page = WebPage(url=get_book_appointment_page_url(),
                                        handler=s)

        # 4. Submit the location form to reach the scheduling page.
        get_appointment_location_form = WebForm(
            action_url="%s%s" % (UKVISA_BASE_URL, APPOINTMENT_LOCATION_PATH),
            form_data=get_appointment_loc_form_data(
                book_appointment_page.content()),
            handler=s)

        res = get_appointment_location_form.submit().response()

        schedule_appointment_form_data = get_appointment_schedule_form_data(
            res.text)

        # Station id of the visa centre; required to query slots per date.
        post_id = schedule_appointment_form_data['EnrolmentStationId']

        available_dates = get_available_dates()

        print("\n" + datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
        if len(available_dates) > 0:
            # Only the first 7 dates are checked to keep the e-mail short.
            appointment_data = {
                date: get_available_time_slots(post_id=post_id,
                                               appointment_date=date)
                for date in available_dates[:7]
            }
            print('Available dates found, generating message to be emailed')
            send_email(create_message(appointment_data))
        else:
            print('No available dates for %s...' % (VISA_CENTRE))

        #TODO: Need to figure out the captcha after submitting the form
        # submit_schedule_appointment_form(schedule_appointment_form_data);

    except Exception as e:
        # Broad catch by design: this runs unattended and must not crash.
        print('Error checking visa appointment', e)
コード例 #6
0
    def google_search(self, query, num):
        """Fetch up to ``num`` result pages from the Google Custom Search API.

        Returns a list of WebPage objects with ``title`` and ``snippet`` set.
        """
        url = 'https://www.googleapis.com/customsearch/v1?'
        params = {
            'key': self.google_api_key,
            'q': query,
            'cx': '013036536707430787589:_pqjad5hr1a',
            'alt': 'json',
            'lr': 'lang_ja',
        }
        start = 1
        items = []

        for _ in range(num):
            params['start'] = start
            request_url = url + urllib.parse.urlencode(params)
            try:
                # FIX: close the HTTP response via a context manager.
                with urllib.request.urlopen(request_url) as response:
                    json_body = json.loads(response.read().decode('utf-8'))
                items.extend(json_body['items'])
                if 'nextPage' not in json_body['queries']:
                    break
                start = json_body['queries']['nextPage'][0]['startIndex']
            except Exception:
                # BUG FIX: items.extend({...}) added the dict's *keys* ('link',
                # 'title') as bare strings, which later crashed item['link'].
                # Append the placeholder item instead, with a 'snippet' key so
                # the page-building loop below cannot raise KeyError.
                items.append({'link': '#', 'title': '検索できませんでした',
                              'snippet': ''})
        pages = []
        for item in items:
            page = WebPage(item['link'])
            page.title = item['title']
            page.snippet = item['snippet']
            pages.append(page)
        return pages  # => [{'link': 'http://...', 'title': 'ページは'}, {...}...]
コード例 #7
0
 def bing_search(self, query, num):
     """Fetch up to ``num`` result pages from the Bing search API for ``query``.

     Returns a list of WebPage objects with ``title`` and ``snippet`` set.
     """
     key = self.microsoft_api_key
     url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
     json_param = '&$format=json'
     param = {'Query': query}
     request_url = url + urllib.parse.urlencode(param) + json_param
     items = []
     for _ in range(num):
         try:
             json_body = requests.get(request_url,
                                      auth=(key, key),
                                      headers={
                                          'User-Agent': 'My API Robot'
                                      }).json()
             items.extend(json_body['d']['results'])
             request_url = json_body['d']['__next']
         except Exception:
             # BUG FIX: items.extend({...}) added the dict's *keys* ('Url',
             # 'Title') as bare strings instead of the placeholder item.
             # Append the dict, with a 'Description' key so the loop below
             # cannot raise KeyError.
             items.append({'Url': '#', 'Title': '検索できませんでした',
                           'Description': ''})
     pages = []
     for item in items:
         if isinstance(item, str):
             # Defensive guard kept from the original implementation.
             continue
         page = WebPage(item['Url'])
         # Attribute names normalized to the Google result naming.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
コード例 #8
0
 def yahoo_key_phrase(self, text):
     """Extract key phrases from ``text`` via the Yahoo! Japan keyphrase API."""
     import urllib.parse
     # BUG FIX: the sentence is now URL-encoded; raw Japanese text (spaces,
     # '&', '%', ...) would otherwise produce an invalid request URL.
     url = ('http://jlp.yahooapis.jp/KeyphraseService/V1/extract?'
            + urllib.parse.urlencode({'appid': self.yahoo_japan_app_id,
                                      'sentence': text}))
     result_page = WebPage(url)
     result_page.fetch_xml()
     key_phrases = result_page.pick_key_phrases()
     return key_phrases
コード例 #9
0
def get_book_appointment_page_url():
    """Return the absolute URL of the 'book appointment' page linked from the
    user's applications page."""
    applications_url = "%s%s" % (UKVISA_BASE_URL, USER_APPLICATIONS_PATH)
    applications_page = WebPage(url=applications_url, handler=s)

    document = pq(applications_page.content())
    relative_path = document('a').attr('href').replace('..', '')
    return "%s%s" % (UKVISA_BASE_URL, relative_path)
コード例 #10
0
def search_in_clueweb_with_expanded_query():
    """Expand the submitted action/hint words with Google-derived related
    words and search each expanded query against the ClueWeb09ja Solr index,
    rendering line clusters around the action word for every hit."""
    import urllib.parse
    search_engine = SearchEngine()
    search_engine.action_word = request.form['action_word']
    search_engine.hint_word = request.form['hint_word']
    search_engine.find_related_action_words_with_google()
    search_engine.count_action_words()
    search_engine.sort_action_words_count()
    search_engine.pick_sorted_action_words_more_than_1_count()
    results = []
    for elem in search_engine.sorted_action_words_more_than_1_count:
        elem['expanded_query'] = ' '.join([search_engine.action_word,
                                           search_engine.hint_word,
                                           elem['word']])
        # BUG FIX: URL-encode the query -- expanded queries contain spaces and
        # Japanese text, which are not valid raw URL characters.
        url = ('http://karen.dl.local:8983/solr/ClueWeb09ja/select?q='
               + urllib.parse.quote(elem['expanded_query']) + '&wt=xml')
        web_page = WebPage(url)
        web_page.fetch_xml()
        web_page.pick_texts_to_result_pages()
        # Each query yields one result-XML page; every hit inside it becomes a
        # WebPage in web_page.result_pages.
        for result_page in web_page.result_pages:
            result_page.set_lines_from_texts()
            result_page.set_line_nums_with_word(search_engine.action_word)
            result_page.set_line_nums_around_action_word()
            result_page.set_line_clusters_around_action_word()
        results.append({'pages': web_page.result_pages,
                        'expanded_query': elem['expanded_query']})
    return render_template('search_in_clueweb_with_expanded_query.tmpl',
                           results=results)
コード例 #11
0
def find_matched_words_from_yahoo_ads():
    """Scrape Yahoo! JAPAN sponsored-search ads for the submitted query and
    extract characteristic / bracketed words, filtering out generic shopping
    terms."""
    import urllib.parse
    query = request.form['query']
    # Sponsored search shows ads more readily for a whole sentence than for
    # individual words, so the query is sent as one string.
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    # BUG FIX: URL-encode the user-supplied query before embedding it.
    url = head + urllib.parse.quote(query) + tail
    y_ad_page = WebPage(url)
    y_ad_page.fetch_html()
    y_ad_page.fetch_ads()
    naradeha_results = []
    bracket_words = []
    for ad in y_ad_page.ads:
        ad.fetch_link_title()
        naradeha_results.extend(ad.pick_characteristic_words())
        bracket_words.extend(ad.pick_bracket_words())
    # naradeha_results => [{'なら': {'before': ['。', 'あの', '今石洋之']}}]
    # bracket_words => ['アスコルビン酸', 'メルトダウン']

    # Generic shopping/navigation words that carry no signal.
    # FIX: removed the duplicated '公式' entry and the '0'-'3' literals that
    # the digit loop below re-added (duplicates only slowed the scan).
    stop_words = ['公式', '楽天', '当日', 'お急ぎ便', 'ココ', 'ここ', 'これ', 'コレ',
                  'こちら', '購入', '人気', '詳細', '送料無料', '配送無料', '価格',
                  '激安', '無料', 'アマゾン', 'ヤフオク']
    stop_words.extend(str(num) for num in range(10))
    results = naradeha_words_to_results(naradeha_results, stop_words)

    for bracket_word in bracket_words:
        # Skip bracketed words that contain any stop word.
        if any(stop_word in bracket_word for stop_word in stop_words):
            continue
        results.append(bracket_word)

    return render_template('words.tmpl', words=results)
コード例 #12
0
 def bing_search(self, query, num):
     """Fetch up to ``num`` result pages from the Bing search API for
     ``query`` (sent quoted, as the API expects) and return WebPage objects."""
     key = self.microsoft_api_key
     url = 'https://api.datamarket.azure.com/Bing/Search/Web?'
     json_param = '&$format=json'
     param = {
         'Query': "'" + query + "'",
     }
     req_url = url + urllib.parse.urlencode(param)
     items = []
     for _ in range(num):
         try:
             json_body = requests.get(req_url + json_param,
                                      auth=(key, key)).json()
             items.extend(json_body['d']['results'])
             req_url = json_body['d']['__next']
         except Exception:
             # BUG FIX: items.extend({...}) added the dict's *keys* ('Url',
             # 'Title') as bare strings instead of the placeholder item.
             # Append the dict, with a 'Description' key so the loop below
             # cannot raise KeyError.
             items.append({'Url': '#', 'Title': '検索できませんでした',
                           'Description': ''})
     pages = []
     for item in items:
         if isinstance(item, str):
             # Defensive guard kept from the original implementation.
             continue
         page = WebPage(item['Url'])
         # Attribute names normalized to the Google result naming.
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
コード例 #13
0
 def find_related_action_words_from_clueweb(self):
     """Search ClueWeb with the Solr query and collect the texts whose key
     phrases match, each attached to a placeholder WebPage."""
     self.set_solr_query()
     # clue_web_search returns e.g. ['大学入学', 'aaaa', ...] (up to 20 texts).
     for text in self.clue_web_search(self.solr_query):
         placeholder_page = WebPage('unknown')
         self.add_to_results_if_key_phrase_present(text, placeholder_page)
コード例 #14
0
 def test_set_line_nums_around_action_word(self):
     """Lines adjacent to a line containing the action word are included."""
     result_page = WebPage()
     result_page.lines = ['aa', 'bbbb', 'ccccc', 'ddddd', 'aaaaa', 'eeeee']
     result_page.set_line_nums_with_word('a')
     result_page.set_line_nums_around_action_word()
     expected = {0, 1, 3, 4, 5}
     self.assertEqual(result_page.line_nums_around_action_word, expected)
コード例 #15
0
 def test_mashou_sentence(self):
     """A '〜ましょう' sentence yields a task with the normalized predicate '拭く'."""
     page = WebPage('http://home.e05.itscom.net/mizuki/masako/bedmake.htm')
     page.text = '1.トイレの便座も一度拭きましょう!'
     page.set_sentences_from_text()
     page.set_tasks_from_sentences()
     first_task = page.tasks[0]
     self.assertEqual(first_task.object_term.name, 'トイレの便座')
     self.assertEqual(first_task.predicate_term, '拭く')
コード例 #16
0
def load_html_files_with_query(query):
    """Load the cached HTML files saved for ``query`` ('<query>_<i>.html')
    and return them wrapped in WebPage objects.

    Returns a list of constants.NUM_OF_FETCHED_PAGES WebPage objects.
    """
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        # FIX: explicit encoding so reads of Japanese HTML do not depend on
        # the platform's default locale encoding (fixtures elsewhere in the
        # project are UTF-8).
        with open('%s_%s.html' % (query, i), 'r', encoding='utf-8') as f:
            page = WebPage()
            page.html_body = f.read()
        pages.append(page)
    return pages
コード例 #17
0
    def test_combine_nouns(self):
        """Consecutive noun morphemes are merged into a single compound noun."""
        page = WebPage()

        combined = page.combine_nouns(page.to_m_words('親子決戦試合'))
        self.assertEqual(combined[0].name, '親子決戦試合')

        combined = page.combine_nouns(page.to_m_words('そして勝敗決定戦に'))
        self.assertEqual(combined[1].name, '勝敗決定戦')
コード例 #18
0
 def test_set_descendants_with_simple_html(self):
     """An <h2> embedded in an <h1> body becomes the h1 node's only child."""
     h1_node = Node('bbbbb<h2>cccc</h2>dddd', 'h1')
     h1_node.set_descendants()
     child = h1_node.children[0]
     self.assertEqual(child.html_body, 'dddd')
     self.assertEqual(child.heading_type, 'h2')
     self.assertEqual(child.heading_title, 'cccc')
     self.assertEqual(len(h1_node.children), 1)
     # NOTE(review): this trailing assignment looks like leftover setup from
     # another test; kept to preserve behavior.
     self.naver_hay_fever_page = WebPage(
         'http://matome.naver.jp/topic/1LzuV')
コード例 #19
0
 def test_set_clusters_around_action_word(self):
     """Contiguous runs of marked line numbers become separate line clusters."""
     result_page = WebPage()
     result_page.lines = list('abcdefghijkl')
     result_page.line_nums_around_action_word = {0, 1, 3, 4, 5, 9, 10}
     result_page.set_line_clusters_around_action_word()
     expected = [['a', 'b'], ['d', 'e', 'f'], ['j', 'k']]
     self.assertEqual(result_page.line_clusters_around_action_word, expected)
コード例 #20
0
ファイル: utils.py プロジェクト: NIGAYIM/jikken4
def load_html_files_with_query(query):
    """Load the single cached file saved for ``query`` ('<query>.txt'),
    strip its HTML tags, and return it as a one-element WebPage list."""
    # FIX: dropped the pointless `for i in range(1)` loop and added an
    # explicit encoding so reads do not depend on the platform locale.
    with open('%s.txt' % query, 'r', encoding='utf-8') as f:  # file format changed
        page = WebPage()
        page.html_body = f.read()
    page.remove_html_tags()
    return [page]
コード例 #21
0
def scrape_from_nanapi_and_build_heading_tree():
    """Search nanapi for the submitted query and build a heading tree for
    every article found, rendering title, heading nodes and URL."""
    import urllib.parse
    query = request.form['query']
    head = 'http://nanapi.jp/search/q:'
    # BUG FIX: URL-encode the user-supplied query before embedding it in the
    # search URL (raw spaces/Japanese would make an invalid URL).
    query_url = head + urllib.parse.quote(query)
    nanapi_search_result_page = WebPage(query_url)
    nanapi_search_result_page.fetch_html()
    urls = nanapi_search_result_page.find_urls_from_nanapi_search_result()
    results = []
    for url in urls:
        # Each URL is one nanapi article.
        result_page = WebPage(url)
        result_page.fetch_html()
        result_page.set_title()
        result_page.build_heading_tree()
        results.append({'title': result_page.title,
                        'nodes': result_page.top_nodes,
                        'url': result_page.url})
    return render_template('headings_and_li_texts.tmpl', results=results)
コード例 #22
0
def scrape_from_nanapi():
    """Search nanapi for the submitted query and extract a heading-based task
    from each article in the search results."""
    import urllib.parse
    query = request.form['query']
    head = 'http://nanapi.jp/search/q:'
    # BUG FIX: URL-encode the user-supplied query before embedding it in the
    # search URL (raw spaces/Japanese would make an invalid URL).
    query_url = head + urllib.parse.quote(query)
    nanapi_search_result_page = WebPage(query_url)
    nanapi_search_result_page.fetch_html()
    urls = nanapi_search_result_page.find_urls_from_nanapi_search_result()
    tasks = []
    for url in urls:
        # Each URL is one nanapi article.
        result_page = WebPage(url)
        result_page.fetch_html()
        # find_task_from_nanapi_with_headings returns the article's task
        # steps, e.g. task_steps[0].h2 => 'はじめに'.
        task = result_page.find_task_from_nanapi_with_headings()
        tasks.append(task)
    # tasks => [task, task, ...]
    return render_template('nanapi_tasks.tmpl', tasks=tasks)
コード例 #23
0
 def test_05_web(self):
     """Baidu.com"""
     # UI-automation flow: open the Baidu entry from the native context,
     # capture a screenshot, then search inside the webview. Step order and
     # the sleeps are load-bearing; do not reorder.
     switch_to_native(self.driver)
     web_po = WebPage(self.driver)
     web_po.baidu_button()
     sleep(3)  # wait for the page to render before the screenshot
     insert_img(self.driver, 'baidu')
     switch_to_webview(self.driver)
     web_po.search_input('macaca')
     sleep(3)  # wait for input/suggestions before clicking search
     web_po.search_button()
コード例 #24
0
def load_html_files():
    """Load the cached files for constants.QUERY from the current directory,
    strip their HTML tags, and return them as WebPage objects.

    Must be run from the directory that contains the files.
    """
    pages = []
    for i in range(constants.NUM_OF_FETCHED_PAGES):
        # BUG FIX: the original '%s.txt' % (constants.QUERY, str(i)) raises
        # TypeError (one placeholder, two arguments). The files are assumed to
        # be named '<query>_<i>.txt', matching load_html_files_with_query --
        # TODO confirm the naming against the fetcher that writes them.
        with open('%s_%s.txt' % (constants.QUERY, i), 'r',
                  encoding='utf-8') as f:
            page = WebPage()
            page.html_body = f.read()
        page.remove_html_tags()
        pages.append(page)
    return pages
コード例 #25
0
 def clue_web_search(self, query):
     """Run ``query`` against the ClueWeb Solr endpoint and return the result
     texts as a list.

     Adjust the rows=50 option to change how many results are fetched.
     """
     request_url = '%s%s&rows=50%s' % (constants.CLUE_WEB_URL_HEAD, query,
                                       constants.CLUE_WEB_URL_TAIL)
     result_page = WebPage(request_url)
     result_page.fetch_xml()
     result_page.pick_texts()
     return result_page.texts
コード例 #26
0
    def test_part_of_task_clusters(self):
        """Task pairs repeated on one page form a part-of cluster; nothing
        here should be classified as instance-of."""
        repeated = ('ヨドバシカメラに行く必要があります。お金を払ってください。'
                    'ヨドバシカメラに行く必要があります。お金を払ってください。')
        page = WebPage(url='somewhere', query='カメラ 買う')
        page.text = repeated
        answerer = TaskGraphFirstAnswerer(graph=self.build_graph([page]),
                                          query_task='カメラ_買う')
        answerer.set_result_tasks()

        self.assertEqual(answerer.part_of_task_clusters,
                         [{'お金_払う', 'ヨドバシカメラ_行く'}])
        self.assertEqual(answerer.instance_of_task_clusters, [])
コード例 #27
0
 def result_pages(self, page_num=50):
     """Run the search and convert the raw result items into WebPage objects
     (attribute names normalized to the Google result naming)."""
     pages = []
     for item in self._search(page_num):
         if isinstance(item, str):
             # Error placeholders come back as bare strings; skip them.
             continue
         page = WebPage(item['Url'])
         page.query = self.query
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
コード例 #28
0
 def bing_search(self):
     """Search Bing for self.query and return the hits as WebPage objects
     (attribute names normalized to the Google result naming)."""
     bing = Bing(my_keys.MICROSOFT_API_KEY)
     items = bing.web_search(self.query, 50, ['Title', 'Url', 'Description'])
     pages = []
     for item in items:
         if isinstance(item, str):
             # Error placeholders come back as bare strings; skip them.
             continue
         page = WebPage(item['Url'])
         page.query = self.query
         page.title = item['Title']
         page.snippet = item['Description']
         pages.append(page)
     return pages
コード例 #29
0
def yahoo_sponsored_results():
    """Fetch Yahoo! JAPAN sponsored ads for the posted query and rank the
    verbs / sahen nouns found in the ad titles and snippets."""
    query = request.forms.decode().get('query')
    # Sponsored search shows ads more readily for a whole sentence than for
    # individual words, so the query is sent as one string.
    url = ('http://search.yahoo.co.jp/search/ss?p=' + query
           + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt')
    y_ad_page = WebPage(url)
    verbs_and_sahens = []
    for ad in y_ad_page.fetch_ads():
        for text in (ad.title, ad.snippet):
            verbs_and_sahens.extend(ad.pick_verbs(text))
            verbs_and_sahens.extend(ad.pick_sahens(text))
    results = to_ranked_items(verbs_and_sahens)
    return ad_template.render(items=results)
コード例 #30
0
    def test_slice_after_dots(self):
        """slice_after_dots keeps only the text after the last '、' or '。'."""
        page = WebPage()
        cases = [
            ('あいうえお、かきくけこさしすせそ', 'かきくけこさしすせそ'),
            ('あいうえお、かきくけこ、さしすせそ', 'さしすせそ'),
            ('あいうえお、かきくけこ。さしすせそ', 'さしすせそ'),
            ('あいうえお。かきくけこ、さしすせそ', 'さしすせそ'),
        ]
        for sentence, expected in cases:
            self.assertEqual(page.slice_after_dots(sentence), expected)