def register_query_pages(query_id, query_str, cursor, ntcir_urls_folder, ntcir_htmls_folder, cache_folder):
    """Register an NTCIR query and link its listed pages to it, in file order.

    The list of pages is read from ``<ntcir_urls_folder>/<query_id>.MAND.tsv``.
    Each line, stripped of surrounding whitespace, is taken as a file name
    relative to ``ntcir_htmls_folder``.  For every line the page is handed to
    ``web_search.copy_existing_page`` and then linked to the query through
    ``web_search.ensure_page_query_link`` with a 0-based rank equal to the
    line's position in the file.

    Args:
        query_id: Identifier used to build the ``.MAND.tsv`` file name.
        query_str: Query text passed to ``web_search.add_query``.
        cursor: Database cursor forwarded to every ``web_search`` call.
        ntcir_urls_folder: Folder containing the ``<query_id>.MAND.tsv`` list.
        ntcir_htmls_folder: Folder the listed file names are resolved against.
        cache_folder: Cache root; pages go under ``<cache_folder>/pages``.

    Returns:
        None.  Nothing is committed here; that is left to the caller.
    """
    cache_files_folder = "%s/pages" % (cache_folder,)
    qid = web_search.add_query(cursor, query_str, 'NTCIR')
    listing_path = "%s/%s.MAND.tsv" % (ntcir_urls_folder, query_id,)
    # Open through a context manager so the listing is closed even when one
    # of the web_search calls raises part-way through the file.
    with open(listing_path) as listing:
        for rank, file_name in enumerate(listing):
            full_name = "%s/%s" % (ntcir_htmls_folder, file_name.strip())
            url = "file://%s" % (full_name,)
            # The return value is not needed here; the call is kept for its
            # effect on the page cache.
            web_search.copy_existing_page(cursor, cache_files_folder, full_name, url)
            web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url)
def register_query_pages(query_id, query_str, cursor, ntcir_urls_folder, ntcir_htmls_folder, cache_folder):
    """Register an NTCIR query and link its listed pages to it, in file order.

    The list of pages is read from ``<ntcir_urls_folder>/<query_id>.MAND.tsv``.
    Each line, stripped of surrounding whitespace, is taken as a file name
    relative to ``ntcir_htmls_folder``.  For every line the page is handed to
    ``web_search.copy_existing_page`` and then linked to the query through
    ``web_search.ensure_page_query_link`` with a 0-based rank equal to the
    line's position in the file.

    Args:
        query_id: Identifier used to build the ``.MAND.tsv`` file name.
        query_str: Query text passed to ``web_search.add_query``.
        cursor: Database cursor forwarded to every ``web_search`` call.
        ntcir_urls_folder: Folder containing the ``<query_id>.MAND.tsv`` list.
        ntcir_htmls_folder: Folder the listed file names are resolved against.
        cache_folder: Cache root; pages go under ``<cache_folder>/pages``.

    Returns:
        None.  Nothing is committed here; that is left to the caller.
    """
    cache_files_folder = "%s/pages" % (cache_folder, )
    qid = web_search.add_query(cursor, query_str, 'NTCIR')
    listing_path = "%s/%s.MAND.tsv" % (ntcir_urls_folder, query_id, )
    # Open through a context manager so the listing is closed even when one
    # of the web_search calls raises part-way through the file.
    with open(listing_path) as listing:
        for rank, file_name in enumerate(listing):
            full_name = "%s/%s" % (ntcir_htmls_folder, file_name.strip())
            url = "file://%s" % (full_name, )
            # The return value is not needed here; the call is kept for its
            # effect on the page cache.
            web_search.copy_existing_page(cursor, cache_files_folder, full_name, url)
            web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url)
# Register a new query in the cache index and link every distinct URL listed
# in html_file to it, ranked by order of first appearance.
(conn, cursor) = web_search.open_db(cache_folder)
if web_search.find_query(cursor, query_str, search_engine) is not None:
    sys.exit("Query already in index")

cache_files_folder = "%s/pages" % (cache_folder, )
# Each print takes one pre-formatted argument so the output is the same
# whether print is the Python 2 statement or the Python 3 function.
print("page cache folder %s" % (cache_files_folder, ))
try:
    os.mkdir(cache_files_folder)
    print("creating page cache folder %s" % (cache_files_folder, ))
except OSError as exc:
    # A folder that already exists is the expected case on re-runs and stays
    # silent.  Any other failure is reported; the former `except object`
    # clause could never match an exception, so the warning was dead code.
    if not os.path.isdir(cache_files_folder):
        print("(warning) problem creating %s %s" % (cache_files_folder, exc))

qid = web_search.add_query(cursor, query_str, search_engine)
rank = 0
found = set()
# The context manager closes the URL list even if a web_search call raises.
with open(html_file) as url_list:
    for url in url_list:
        url = url.replace('\n', '')
        if url in found:
            # Duplicates keep the rank of their first occurrence.
            continue
        found.add(url)
        web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url)
        # Commit per URL so links stored so far survive a later failure.
        conn.commit()
        rank += 1
conn.close()
# Register a new query in the cache index and link every distinct URL listed
# in html_file to it, ranked by order of first appearance.
(conn, cursor) = web_search.open_db(cache_folder)
if web_search.find_query(cursor, query_str, search_engine) is not None:
    sys.exit("Query already in index")

cache_files_folder = "%s/pages" % (cache_folder,)
# Each print takes one pre-formatted argument so the output is the same
# whether print is the Python 2 statement or the Python 3 function.
print("page cache folder %s" % (cache_files_folder,))
try:
    os.mkdir(cache_files_folder)
    print("creating page cache folder %s" % (cache_files_folder,))
except OSError as exc:
    # A folder that already exists is the expected case on re-runs and stays
    # silent.  Any other failure is reported; the former `except object`
    # clause could never match an exception, so the warning was dead code.
    if not os.path.isdir(cache_files_folder):
        print("(warning) problem creating %s %s" % (cache_files_folder, exc))

qid = web_search.add_query(cursor, query_str, search_engine)
rank = 0
found = set()
# The context manager closes the URL list even if a web_search call raises.
with open(html_file) as url_list:
    for url in url_list:
        url = url.replace("\n", "")
        if url in found:
            # Duplicates keep the rank of their first occurrence.
            continue
        found.add(url)
        web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url)
        # Commit per URL so links stored so far survive a later failure.
        conn.commit()
        rank += 1
conn.close()