def find_matched_words_from_yahoo_ads():
    """Render words extracted from Yahoo! sponsored-search ads for the query.

    Reads ``query`` from the submitted form, loads the sponsored-search
    result page through ``WebPage``, and gathers two kinds of words from
    every ad: the "characteristic" words and the bracketed words.  The
    characteristic words are turned into the initial result list by
    ``naradeha_words_to_results``; bracketed words are appended to it
    unless they contain one of the stop words.  The final list is rendered
    with the ``words.tmpl`` template.
    """
    # Yahoo sponsored search shows ads more readily when the query is sent
    # as one sentence rather than split into separate words.
    search_text = request.form['query']
    # NOTE(review): the query is concatenated into the URL as-is; confirm
    # whether WebPage percent-encodes it before fetching.
    ad_page = WebPage(
        'http://search.yahoo.co.jp/search/ss?p='
        + search_text
        + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    )
    ad_page.fetch_html()
    ad_page.fetch_ads()

    # Example shapes (taken from the original author's notes):
    #   characteristic => [{'なら': {'before': ['。', 'あの', '今石洋之']}}]
    #   bracketed      => ['アスコルビン酸', 'メルトダウン']
    characteristic = []
    bracketed = []
    for ad in ad_page.ads:
        ad.fetch_link_title()
        characteristic.extend(ad.pick_characteristic_words())
        bracketed.extend(ad.pick_bracket_words())

    # Fixed stop words followed by the digits '0'..'9'.  Duplicates are kept
    # on purpose so the list handed to the helper is unchanged.
    stop_words = [
        '公式', '楽天', '当日', 'お急ぎ便', 'ココ', 'ここ', 'これ', 'コレ',
        'こちら', '公式', '購入', '人気', '詳細', '送料無料', '配送無料',
        '価格', '激安', '無料', 'アマゾン', 'ヤフオク', '0', '1', '2', '3',
    ] + [str(digit) for digit in range(0, 10)]

    results = naradeha_words_to_results(characteristic, stop_words)

    # Keep only the bracketed words that contain none of the stop words.
    for word in bracketed:
        if not any(stop_word in word for stop_word in stop_words):
            results.append(word)

    return render_template('words.tmpl', words=results)
def yahoo_sponsored_results():
    """Render ranked nouns and verbs found in Yahoo! sponsored-search ads.

    Reads ``query`` from the submitted form, loads the sponsored-search
    result page through ``WebPage``, collects the output of
    ``pick_nouns_and_verbs`` for every ad's title and snippet, ranks the
    collected words with ``to_ranked_items`` and renders them with the
    ``find_words_with_yahoo_ads.tmpl`` template.
    """
    # Yahoo sponsored search shows ads more readily when the query is sent
    # as one sentence rather than split into separate words.
    search_text = request.form['query']
    ad_page = WebPage(
        'http://search.yahoo.co.jp/search/ss?p='
        + search_text
        + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    )
    ad_page.fetch_html()
    ad_page.fetch_ads()

    # The engine is still constructed as before, although nothing uses it
    # now that the Yahoo key-phrase extraction step is disabled.
    engine = SearchEngine()

    collected = []
    for ad in ad_page.ads:
        title_words = ad.pick_nouns_and_verbs(ad.title)
        collected.extend(title_words)
        snippet_words = ad.pick_nouns_and_verbs(ad.snippet)
        collected.extend(snippet_words)

    ranked = to_ranked_items(collected)
    return render_template('find_words_with_yahoo_ads.tmpl', items=ranked)
def yahoo_sponsored_results():
    """Render ranked terms picked from Yahoo! sponsored-search ads.

    Reads ``query`` from the decoded request form, fetches the ads of the
    sponsored-search result page through ``WebPage.fetch_ads``, gathers
    what ``pick_verbs`` and ``pick_sahens`` return for every ad's title and
    snippet, ranks the gathered terms with ``to_ranked_items`` and renders
    them with ``ad_template``.
    """
    # Yahoo sponsored search shows ads more readily when the query is sent
    # as one sentence rather than split into separate words.
    search_text = request.forms.decode().get('query')
    ad_page = WebPage(
        'http://search.yahoo.co.jp/search/ss?p='
        + search_text
        + '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    )

    terms = []
    for ad in ad_page.fetch_ads():
        # Title first, then snippet; pick_verbs before pick_sahens each time.
        title_verbs = ad.pick_verbs(ad.title)
        terms.extend(title_verbs)
        title_sahens = ad.pick_sahens(ad.title)
        terms.extend(title_sahens)
        snippet_verbs = ad.pick_verbs(ad.snippet)
        terms.extend(snippet_verbs)
        snippet_sahens = ad.pick_sahens(ad.snippet)
        terms.extend(snippet_sahens)

    ranked = to_ranked_items(terms)
    return ad_template.render(items=ranked)
def query_expansion():
    """Expand the posted query with ad-derived terms and render found pages.

    Reads ``query`` from the decoded request form and fetches the ads of
    the Yahoo! sponsored-search result page through ``WebPage.fetch_ads``.
    The terms returned by ``pick_verbs`` and ``pick_sahens`` for every ad's
    title and snippet are ranked with ``to_ranked_items``.  Each ranked
    term that is not already a word of the normalized query is appended to
    it to form an expanded query, which is passed to ``search``; the first
    five pages of each result are collected into a set until it holds 20
    pages.

    Returns:
        The output of ``expand_template.render`` given the collected page
        set (``pages``) and the list of expanded queries that were issued
        (``queries``).
    """
    pages_per_query = 5  # how many leading results to keep per expansion
    max_pages = 20       # stop expanding once this many pages are collected

    # Yahoo sponsored search shows ads more readily when the query is sent
    # as one sentence rather than split into separate words.
    query = request.forms.decode().get('query')
    head = 'http://search.yahoo.co.jp/search/ss?p='
    tail = '&ei=UTF-8&fr=top_ga1_sa&type=websearch&x=drt'
    y_ad_page = WebPage(head + query + tail)

    v_and_s = []
    for ad in y_ad_page.fetch_ads():
        v_and_s.extend(ad.pick_verbs(ad.title))
        v_and_s.extend(ad.pick_sahens(ad.title))
        v_and_s.extend(ad.pick_verbs(ad.snippet))
        v_and_s.extend(ad.pick_sahens(ad.snippet))

    ranked_items = to_ranked_items(v_and_s)
    # Put 'まとめ' at the front so it is always the first expansion tried.
    ranked_items.insert(0, {'name': 'まとめ', 'count': 100})

    normalized_query = normalize_query(query)
    query_words = normalized_query.split(' ')

    page_set = set()  # a set, so pages found by several queries count once
    expanded_queries = []
    for item in ranked_items:
        # Skip terms already in the query: searching for '花粉症 対策' can
        # yield '対策' itself as a ranked term.
        if item['name'] in query_words:
            continue

        expanded_query = normalized_query + ' ' + item['name']
        expanded_queries.append(expanded_query)

        # Fix: the original sliced [0:4], which keeps only four pages even
        # though the intent (variable name ``top_5``) was the top five.
        for page in search(expanded_query, 1)[:pages_per_query]:
            page_set.add(page)
            if len(page_set) >= max_pages:
                break

        if len(page_set) >= max_pages:
            break  # enough pages collected; stop issuing further queries

    return expand_template.render(pages=page_set, queries=expanded_queries)