# -*- coding: utf-8 -*-
"""Rebuild the pickled page list of every query from the stored page data.

For each query in ``constants.QUERIES_4`` the script asks ``PageDataLoader``
for the ids of the pages belonging to that query, wraps each stored record in
a ``WebPage`` and hands the resulting list to ``PickleFileSaverForOriginal``.
"""
import constants
import pdb
from pickle_file_saver_for_original import PickleFileSaverForOriginal
from page_data_loader import PageDataLoader
from web_page import WebPage


def _build_page(loader, page_id):
    """Fetch the stored record for ``page_id`` and wrap it in a ``WebPage``."""
    record = loader.pagedata_with_id(page_id)
    # NOTE(review): the original inline comment described the record as
    # (id, url, snippet, body, rank), which does not match the indices used
    # here (0 -> url, 1 -> query, 2 -> snippet, 3 -> rank). Confirm against
    # PageDataLoader.pagedata_with_id which of the two is correct.
    return WebPage(id=page_id,
                   url=record[0],
                   query=record[1],
                   snippet=record[2],
                   rank=record[3])


if __name__ == '__main__':
    target_queries = constants.QUERIES_4
    page_saver = PickleFileSaverForOriginal()
    # A single loader is shared by every query of the run.
    with PageDataLoader() as loader:
        for query in target_queries:
            ids_for_query = loader.page_ids_with_query(query)
            pages = [_build_page(loader, page_id)
                     for page_id in ids_for_query]
            page_saver.save_pages_with_query(pages=pages, query=query)
# -*- coding: utf-8 -*-
"""Fetch Bing result pages for every query that has no saved pages yet.

Queries for which the saver can already find pages are reported and left
untouched; all others are searched and the result pages are saved.
"""
import constants
import pdb
from bing_searcher import BingSearcher
from pickle_file_saver_for_original import PickleFileSaverForOriginal

if __name__ == '__main__':
    target_queries = constants.QUERIES_4
    page_saver = PickleFileSaverForOriginal()
    for query in target_queries:
        if not page_saver.can_find_page_with_query(query):
            searcher = BingSearcher(query)
            # 1000 pages are requested, but len(pages) may be smaller.
            pages = searcher.result_pages(page_num=1000)
            page_saver.save_pages_with_query(pages=pages, query=query)
        else:
            # Pages for this query already exist: report it and move on.
            print('%sはもうあります' % query)
# -*- coding: utf-8 -*-
"""Attach sentences and tasks to the fetched pages of every query.

For each query in ``constants.QUERIES_4`` the script loads the fetched pages,
replaces ``page.sentences`` with ``Sentence`` objects built from the sentences
returned by ``PageDataLoader.sentences_with_id``, calls
``page.set_tasks_from_sentences()`` and saves the updated pages again.
"""
from pickle_file_loader_for_original import PickleFileLoaderForOriginal
from pickle_file_saver_for_original import PickleFileSaverForOriginal
from page_data_loader import PageDataLoader
import constants
from sentence import Sentence
import pdb

if __name__ == '__main__':
    queries = constants.QUERIES_4
    for query in queries:
        pfl = PickleFileLoaderForOriginal()
        pages = pfl.load_fetched_pages_with_query(query)
        # Enter the loader once per query instead of once per page: the
        # context manager does not depend on the page, so re-entering it on
        # every iteration only repeats its setup/teardown.
        with PageDataLoader() as page_loader:
            for i, page in enumerate(pages):
                sentences = page_loader.sentences_with_id(page.id)
                page.sentences = [Sentence(sentence, page.query)
                                  for sentence in sentences]
                page.set_tasks_from_sentences()
                print('%s の %i 番目のページにtasksをセットしました!' % (page.query, i))
        # The loader is closed again before the pages are written back.
        pfs = PickleFileSaverForOriginal()
        pfs.save_pages_with_query(pages=pages, query=query)