コード例 #1
0
def do_gen_nugget_train(ini_path):
    """Generate a nugget training file from ground-truth queries.

    Loads the ini file at ``ini_path``, sets the module-level feature
    switches, then for every (query, good text) record read from the
    ``ground_truth`` entry runs a web search, builds training instances
    with ``gen_nugget_train`` and writes one line per instance to the file
    named by the ``train_path`` entry::

        <is_good>,<feature>,<feature>,...#<query>:<candidate>

    After each query the ``tmp_folder`` directory is removed and recreated;
    failures of that cleanup are printed and otherwise ignored.
    """
    from nugget_finder import load_ini, do_search, identify_candidates
    from one_click_search import query_web_search

    ini = load_ini(ini_path)
    nugget_finder.USE_PATTERNS = True

    # The baseline condition switches off both the CCL parser and the
    # boilerplate handling, so read it once and reuse it below.
    baseline = bool(ini.get('condition_baseline', ''))
    if bool(ini.get('condition_no_cclparser', '')) or baseline:
        parser.USE_CCLPARSER = False

    if bool(ini.get('condition_no_boilerplate', '')) or baseline:
        html_to_trec.USE_BOILERPLATE = False

    records = read_groundtruth(ini.get('ground_truth'))

    # The context manager closes the training file even when a search or
    # feature-extraction step raises part-way through the run.
    with open(ini.get('train_path'), 'w') as writer:
        for query_str, good_text in records:
            # NOTE(review): when 'tmp_folder' is not configured this falls
            # back to '/tmp', which the cleanup below removes wholesale --
            # confirm that this default is intended.
            tmp_folder = ini.get('tmp_folder', '/tmp')
            print('query: %s' % (query_str,))
            (htmls, html_urls) = query_web_search(query_str, ini)
            print('found %d pages' % (len(htmls),))
            instances = gen_nugget_train(ini, htmls, query_str, good_text)
            for candidate, is_good, features in instances:
                feature_csv = ','.join(map(str, features))
                label = '%s:%s' % (query_str, candidate)
                writer.write('%d,%s#%s\n' % (is_good, feature_csv, label))
            # flush per query so finished queries survive a later crash
            writer.flush()
            try:
                rmtree(tmp_folder)
                os.mkdir(tmp_folder)
            except Exception as e:
                # best-effort cleanup: report and carry on with the next query
                print(e)
コード例 #2
0
def do_gen_nugget_train(ini_path):
    """Build the nugget training set described by an ini file.

    Steps, in order:

    1. load the ini at ``ini_path`` and set the module-level switches
       (``USE_PATTERNS``, ``USE_CCLPARSER``, ``USE_BOILERPLATE``);
    2. read (query, good text) records from the ``ground_truth`` entry;
    3. for each record, run ``query_web_search``, pass the pages to
       ``gen_nugget_train`` and write every returned instance to the file
       named by ``train_path`` as
       ``<is_good>,<comma-separated features>#<query>:<candidate>``;
    4. remove and recreate ``tmp_folder`` after each query (errors from
       this step are printed, not raised).
    """
    from nugget_finder import load_ini, do_search, identify_candidates
    from one_click_search import query_web_search

    ini = load_ini(ini_path)
    nugget_finder.USE_PATTERNS = True

    # 'condition_baseline' disables both optional components, so it takes
    # part in each of the two checks below; look it up only once.
    baseline = bool(ini.get('condition_baseline', ''))
    if bool(ini.get('condition_no_cclparser', '')) or baseline:
        parser.USE_CCLPARSER = False

    if bool(ini.get('condition_no_boilerplate', '')) or baseline:
        html_to_trec.USE_BOILERPLATE = False

    records = read_groundtruth(ini.get('ground_truth'))

    # 'with' guarantees the output file is closed if any query fails.
    with open(ini.get('train_path'), 'w') as writer:
        for query_str, good_text in records:
            # NOTE(review): the fallback here is '/tmp' and the folder is
            # passed to rmtree() below -- verify the default is safe.
            tmp_folder = ini.get('tmp_folder', '/tmp')
            print('query: %s' % (query_str,))
            (htmls, html_urls) = query_web_search(query_str, ini)
            print('found %d pages' % (len(htmls),))
            instances = gen_nugget_train(ini, htmls, query_str, good_text)
            for candidate, is_good, features in instances:
                feature_csv = ','.join(map(str, features))
                label = '%s:%s' % (query_str, candidate)
                writer.write('%d,%s#%s\n' % (is_good, feature_csv, label))
            # push this query's lines to disk before starting the next one
            writer.flush()
            try:
                rmtree(tmp_folder)
                os.mkdir(tmp_folder)
            except Exception as e:
                # cleanup is best-effort; keep processing remaining queries
                print(e)
コード例 #3
0
ファイル: ntcir.py プロジェクト: Big-Data/hunter-gatherer
    for file_name in file("%s/%s.MAND.tsv" % (ntcir_urls_folder, query_id,)):
        full_name = "%s/%s" % (ntcir_htmls_folder, file_name.strip())
        url = "file://%s" % (full_name,)
        rid = web_search.copy_existing_page(cursor, cache_files_folder, full_name, url)
        web_search.ensure_page_query_link(cursor, cache_files_folder, qid, rank, url)
        rank += 1

if __name__ == '__main__':
    ####
    # Command-line arguments:
    #   1: path to the ini file
    #   2: path to the queries file
    #   3: run number (parsed as an integer)
    #
    ini_file = sys.argv[1]
    query_file = sys.argv[2]
    run_number = int(sys.argv[3])

    ini = load_ini(ini_file)
    queries = load_queries(query_file)

    # override whatever search engine the ini file specified
    ini['search_engine'] = 'NTCIR'

    # these entries must be defined in the ini file; they are read with
    # plain indexing rather than .get(), so there is no fallback value
    # (presumably a missing entry raises KeyError -- depends on the type
    # load_ini returns)
    system_description = ini['ntcir_system_description'] 
    team_name = ini['ntcir_team_name']
    ntcir_urls_folder = ini['ntcir_urls_folder']
    ntcir_htmls_folder = ini['ntcir_htmls_folder']

    # optional entry: working folder, './tmp' when not configured
    base_tmp = ini.get('tmp_folder', './tmp')

    # optional entry: cache location, "<tmp_folder>/cache" when not configured
    cache_folder = ini.get('cache_folder',  "%s/cache" % (base_tmp,))
    # open_db returns a (connection, cursor) pair for the cache
    (conn, cursor) = web_search.open_db(cache_folder)
コード例 #4
0
            results[output_type] = assemble_output(final_passages_scored,
                                                   final_length)

    return (results, html_urls)


if __name__ == '__main__':
    ####
    # Command-line arguments:
    #   1:   path to the ini file
    #   2..: the query; all remaining arguments are joined with spaces
    #
    # Writes one "<TYPE>text</TYPE>" line per output type to
    # "<tmp_folder>/out" and prints the elapsed time.
    #
    import time
    t0 = time.time()
    ini_file = sys.argv[1]
    query_str = " ".join(sys.argv[2:])

    ini = load_ini(ini_file)

    # presumably (length limit, output type) pairs -- confirm against
    # one_click_search; the output type is the key used in `results`
    (results, html_urls) = one_click_search(ini, query_str, [(1000, 'DESKTOP'),
                                                             (140, 'TWITTER'),
                                                             (280, 'MOBILE')])

    tmp_folder = ini.get('tmp_folder', './tmp')
    output_file = "%s/out" % (tmp_folder, )

    # open() instead of the Python 2-only file() builtin, and a context
    # manager so the file is closed even if a write fails
    with open(output_file, 'w') as output:
        for output_type in results:
            # only the first element of each result is written out
            output_text = results[output_type][0]
            output.write("<%s>%s</%s>\n" % (output_type, output_text, output_type))

    print('time: %s' % (time.time() - t0,))