Example #1
def exe_extract_text(judge_path, index_path, text_db_path):
    # Collect the ClueWeb-B docnos that appear in the judgment (qrel) file.
    judge_file = QRelFile(judge_path)
    docnos = judge_file.key2s()
    docnos = filter(is_cluewebB, docnos)
    #docnos = docnos[:1000]
    print 'doc number:', len(docnos)

    # Extract the document texts in parallel (30 workers) and store them in a
    # Berkeley DB hash file keyed by docno.
    db = bsddb.hashopen(text_db_path, 'w')
    texts = fastmap.fastmap(lambda docno: extract_text(docno, index_path), 30, docnos)
    assert len(docnos) == len(texts)
    for i in xrange(len(docnos)):
        db[docnos[i]] = texts[i]
    db.close()
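
A minimal driver sketch for the function above, assuming it lives in an importable module; the module name and all paths below are placeholders, not part of the original code.

# Hypothetical usage; the module name and every path here is an assumption.
from extract_tool import exe_extract_text    # assumed module holding the function

if __name__ == '__main__':
    exe_extract_text('judgments.qrel',        # qrel file listing judged docnos
                     '/data/clueweb/index',   # path to the document index
                     'doc_text.db')           # output Berkeley DB of extracted text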
Example #2
def find_nuggets(ini, htmls, query_str):
    tmp_folder = ini.get("tmp_folder", "./tmp")

    ####
    # extract text from the HTML documents
    #
    sys.stderr.write("Extracting text...\n")
    path_to_corpus = "%s/to_index" % (tmp_folder,)
    if not os.path.exists(path_to_corpus):
        os.makedirs(path_to_corpus)

    html_count = 0
    for html in htmls:
        outfile = "%s/%s.txt" % (path_to_corpus, html_count)
        cached_detag = "%s.txt" % (html,)
        if os.path.exists(cached_detag):
            copyfile(cached_detag, outfile)
        else:
            detag_html_file(infile=html, outfile=outfile, id=html_count)
            copyfile(outfile, cached_detag)
        html_count += 1

    ####
    # build index
    #
    sys.stderr.write("Indexing...\n")
    path_to_index = "%s/index" % (tmp_folder,)
    if not os.path.exists(path_to_index):
        os.makedirs(path_to_index)

    config_template = open(ini.get("index_config_template", "./indexing.template")).read()
    config_filename = "%s/indexing.param" % (tmp_folder,)
    config_file = open(config_filename, "w")
    config_file.write(config_template.format(path_to_corpus=path_to_corpus, path_to_index=path_to_index))
    config_file.close()
    index_command = ini.get("index_command", "IndriBuildIndex")

    retcode = subprocess.call([index_command, config_filename], stdout=sys.stderr, stderr=sys.stderr)
    assert retcode == 0

    ####
    # generate query
    #
    parsed_query = parse_into_chunks(query_str)

    if bool(ini.get("condition_baseline", "")):
        print "baseline run."
        return ([], parsed_query, path_to_index)

    ####
    # main search
    #
    sys.stderr.write("Main search...\n")
    search_command = ini.get("search_command", "./cpp/Search")
    main_passages = do_search(parsed_query, search_command, path_to_index, 2000)

    ####
    # identify candidates
    #
    sys.stderr.write("Identifying candidates...\n")
    top_documents = int(ini.get("top_documents_for_candidate", "20"))
    candidates, main_evidence = identify_candidates(
        main_passages, int(ini.get("main_search_passage_count", 3)), top_documents
    )

    ###
    # evidence search
    #
    sys.stderr.write("Evidence searching...\n")
    evidence = dict()
    # Previous serial version, superseded by the fastmap-based search below:
    # for candidate in candidates:
    #     extended_query = list(parsed_query)
    #     extended_query.append(('NE', candidate[1]))
    #     evidence_passages = do_search(extended_query, search_command, path_to_index,
    #                                   int(ini.get('evidence_search_passage_count', 10)))
    #     evidence[candidate[0]] = filter(lambda passage:
    #         all(map(lambda token: token.lower() in passage[1].lower(), candidate[1])),
    #         evidence_passages)
    #     sys.stderr.write('Found %d passages\n' % (len(evidence[candidate[0]]),))

    searcher = Searcher(search_command, path_to_index, int(ini.get("evidence_search_passage_count", 10)))
    print "candidate num:%d" % len(candidates)
    queries = map(lambda candidate: list(parsed_query) + [("NE", candidate[1])], candidates)
    evidence_passages_list = fastmap.fastmap(searcher, 10, queries)
    for i in xrange(len(candidates)):
        candidate = candidates[i]
        evidence[candidate[0]] = filter(
            lambda passage: all(map(lambda token: token.lower() in passage[1].lower(), candidate[1])),
            evidence_passages_list[i],
        )

    ####
    # evaluate evidence
    #
    sys.stderr.write("Evaluating evidence...\n")
    scored_candidates = list()

    if USE_CANDIDATE_SCORER:
        pool_scorer = PoolScorer(ini)
        # Score candidates in parallel; fastmap.fastmap(func, n_workers, items)
        # as in the evidence search above. The worker count of 8 is assumed.
        scored_candidates = fastmap.fastmap(
            pool_scorer,
            8,
            map(
                lambda candidate: (candidate, evidence[candidate], main_evidence[candidate], parsed_query),
                evidence.keys(),
            ),
        )
    else:
        for candidate in evidence:
            scored_candidates.append(
                (
                    candidate,
                    score_candidate(candidate, evidence[candidate], main_evidence[candidate], parsed_query),
                    evidence[candidate],
                )
            )
    ####
    # clean up
    #
    if False:
        for i in xrange(0, html_count):
            try:
                os.unlink("%s/to_index/%s.txt" % (tmp_folder, i))
            except:
                pass

    ####
    # show candidates
    #
    if False:
        scored_candidates.sort(key=itemgetter(1), reverse=True)
        rank = 0
        for candidate_score in scored_candidates:
            candidate, score, evidence = candidate_score
            print candidate
            print "\t", rank, score
            # printed = set()
            # for entry in evidence:
            #    if not entry[0]['document'] in printed:
            #        print entry[0]['document'], entry[0]['score']
            #        printed.add(entry[0]['document'])
            #        print ""
            rank += 1

    return (scored_candidates, parsed_query, path_to_index)
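
A rough sketch of how find_nuggets might be invoked, assuming an ini object with a get(key, default) method (a plain dict suffices, matching how the code above reads it) and a list of locally saved HTML files; the configuration values, file names, and query string are placeholders.

# Hypothetical invocation; the ini values, file names, and query are assumptions.
ini = {
    "tmp_folder": "./tmp",
    "index_config_template": "./indexing.template",
    "index_command": "IndriBuildIndex",
    "search_command": "./cpp/Search",
    "top_documents_for_candidate": "20",
    "evidence_search_passage_count": "10",
}
htmls = ["docs/page1.html", "docs/page2.html"]

scored_candidates, parsed_query, path_to_index = find_nuggets(
    ini, htmls, "who invented the telephone")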
Example #3
def generate_for_day(df: pd.DataFrame,
                     dt=None,
                     tickers=None,
                     rounds=None) -> ListOfDicts:
    """
    Given the input_csv, and the number of round, train <rounds> times and bag the results.
    """

    if dt:
        logging.info(
            f"Training {rounds} rounds with {len(df)} stocks in dataset for {dt}..."
        )
    else:
        logging.info(
            f"Training {rounds} rounds with {len(df)} stocks in dataset...")

    if len(df) < MIN_TRAINING_SIZE:
        logging.warning(
            f"Skipping because the number of stocks < {MIN_TRAINING_SIZE}...")
        return []

    dt_str = dt and dt.strftime("%Y-%m-%d")
    if dt_str:
        # Restrict to the rows for the requested day.
        df = df[df.date == dt_str]

    calculate_overall_r2(df, rounds)

    raw_estimate_by_ticker = {}

    if tickers:
        unavailable_tickers = set(tickers) - set(list(df['ticker']))
        for ticker in unavailable_tickers:
            logging.warning(
                f"Ticker '{ticker}' not found in dataset. Continuing...")
            tickers.remove(ticker)
    else:
        tickers = set(list(df['ticker']))
    logging.info(f"Building models for {len(tickers)} stocks.")

    def _gfds(ticker):
        return ticker, generate_ffe_for_ticker(ticker, df=df, rounds=rounds)

    for ticker, estimate in fastmap.fastmap(_gfds, list(tickers)):
        market_cap = float(df.loc[df.ticker == ticker, 'marketCap'].iloc[0])
        logging.info("FFER for %r: %.4f", ticker, market_cap / estimate)
        raw_estimate_by_ticker[ticker] = estimate

    estimates = []
    for ticker, pred in raw_estimate_by_ticker.items():
        df_row = df[(df.ticker == ticker)].to_dict(orient='records')[0]
        row = {}
        if dt_str:
            row["date"] = dt_str
        row["ticker"] = ticker
        row["num_shares"] = df_row["shares"]
        row["actual_market_cap"] = df_row['marketCap']
        row["estimate_market_cap"] = pred
        row["actual_price"] = df_row["price"]
        row["estimate_price"] = row["estimate_market_cap"] / row["num_shares"]
        row["ffer"] = row["actual_price"] / row["estimate_price"]
        for col in X_DIMENSIONS:
            row[col] = df_row[col]
        estimates.append(row)

    return sorted(estimates, key=lambda p: p["ffer"])
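
A minimal sketch of driving generate_for_day from a CSV, assuming the DataFrame carries the columns the function reads above (date, ticker, marketCap, shares, price, and the X_DIMENSIONS columns); the file name, date, tickers, and round count are placeholders.

# Hypothetical usage; the CSV path, date, tickers, and rounds are assumptions.
import datetime
import pandas as pd

df = pd.read_csv("fundamentals.csv")   # must contain the columns used above
rows = generate_for_day(df,
                        dt=datetime.date(2021, 6, 30),
                        tickers=["AAPL", "MSFT"],
                        rounds=5)
for row in rows:
    print(row["ticker"], round(row["ffer"], 3))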