# Requires module-level: import bsddb, fastmap, plus QRelFile, is_cluewebB and
# extract_text from this project (Python 2).
def exe_extract_text(judge_path, index_path, text_db_path):
    """Extract text for every judged ClueWeb-B document and store it in a Berkeley DB."""
    judge_file = QRelFile(judge_path)
    docnos = filter(is_cluewebB, judge_file.key2s())
    # docnos = docnos[:1000]  # uncomment to debug on a small sample
    print 'doc number:', len(docnos)
    db = bsddb.hashopen(text_db_path, 'w')
    # extract document text from the index in parallel (30 workers)
    texts = fastmap.fastmap(lambda docno: extract_text(docno, index_path), 30, docnos)
    assert len(docnos) == len(texts)
    for i in xrange(len(docnos)):
        db[docnos[i]] = texts[i]
    db.close()
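# Usage sketch (hypothetical paths; the qrels file, index location and output
# DB name below are placeholders, not values from this project):
#
#   exe_extract_text('qrels.adhoc.txt',        # TREC-style judgment file
#                    '/data/clueweb09/index',  # index that extract_text reads from
#                    'clueweb_text.db')        # output Berkeley DB file
#
# The stored text can be read back later with bsddb.hashopen('clueweb_text.db', 'r').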
# Requires module-level: import os, sys, subprocess, fastmap,
# copyfile (e.g. shutil.copyfile) and itemgetter (operator.itemgetter), plus this
# project's detag_html_file, parse_into_chunks, do_search, identify_candidates,
# Searcher, PoolScorer, score_candidate and the USE_CANDIDATE_SCORER flag (Python 2).
def find_nuggets(ini, htmls, query_str):
    tmp_folder = ini.get("tmp_folder", "./tmp")

    ####
    # extract text from the HTML documents
    #
    sys.stderr.write("Extracting text...\n")
    path_to_corpus = "%s/to_index" % (tmp_folder,)
    if not os.path.exists(path_to_corpus):
        os.makedirs(path_to_corpus)
    html_count = 0
    for html in htmls:
        outfile = "%s/%s.txt" % (path_to_corpus, html_count)
        cached_detag = "%s.txt" % (html,)
        if os.path.exists(cached_detag):
            # reuse the cached de-tagged copy if we have one
            copyfile(cached_detag, outfile)
        else:
            detag_html_file(infile=html, outfile=outfile, id=html_count)
            copyfile(outfile, cached_detag)
        html_count += 1

    ####
    # build index
    #
    sys.stderr.write("Indexing...\n")
    path_to_index = "%s/index" % (tmp_folder,)
    if not os.path.exists(path_to_index):
        os.makedirs(path_to_index)
    config_template = open(ini.get("index_config_template", "./indexing.template")).read()
    config_filename = "%s/indexing.param" % (tmp_folder,)
    config_file = open(config_filename, "w")
    config_file.write(config_template.format(path_to_corpus=path_to_corpus,
                                             path_to_index=path_to_index))
    config_file.close()
    index_command = ini.get("index_command", "IndriBuildIndex")
    retcode = subprocess.call([index_command, config_filename],
                              stdout=sys.stderr, stderr=sys.stderr)
    assert retcode == 0

    ####
    # generate query
    #
    parsed_query = parse_into_chunks(query_str)
    if bool(ini.get("condition_baseline", "")):
        print "baseline run."
        return ([], parsed_query, path_to_index)

    ####
    # main search
    #
    sys.stderr.write("Main search...\n")
    search_command = ini.get("search_command", "./cpp/Search")
    main_passages = do_search(parsed_query, search_command, path_to_index, 2000)

    ####
    # identify candidates
    #
    sys.stderr.write("Identifying candidates...\n")
    top_documents = int(ini.get("top_documents_for_candidate", "20"))
    candidates, main_evidence = identify_candidates(
        main_passages,
        int(ini.get("main_search_passage_count", 3)),
        top_documents)

    ####
    # evidence search
    #
    sys.stderr.write("Evidence searching...\n")
    evidence = dict()
    # Sequential version, kept for reference:
    # for candidate in candidates:
    #     extended_query = list(parsed_query)
    #     extended_query.append(('NE', candidate[1]))
    #     evidence_passages = do_search(extended_query, search_command, path_to_index,
    #                                   int(ini.get('evidence_search_passage_count', 10)))
    #     evidence[candidate[0]] = filter(
    #         lambda passage: all(map(lambda token: token.lower() in passage[1].lower(),
    #                                 candidate[1])),
    #         evidence_passages)
    #     sys.stderr.write('Found %d passages\n' % (len(evidence[candidate[0]]),))
    searcher = Searcher(search_command, path_to_index,
                        int(ini.get("evidence_search_passage_count", 10)))
    print "candidate num:%d" % len(candidates)
    # one extended query per candidate: the parsed query plus the candidate as an NE
    queries = map(lambda candidate: list(parsed_query) + [("NE", candidate[1])],
                  candidates)
    # run the evidence searches in parallel (10 workers)
    evidence_passages_list = fastmap.fastmap(searcher, 10, queries)
    for i in xrange(len(candidates)):
        candidate = candidates[i]
        # keep only passages containing every token of the candidate
        evidence[candidate[0]] = filter(
            lambda passage: all(map(lambda token: token.lower() in passage[1].lower(),
                                    candidate[1])),
            evidence_passages_list[i])

    ####
    # evaluate evidence
    #
    sys.stderr.write("Evaluating evidence...\n")
    scored_candidates = list()
    if USE_CANDIDATE_SCORER:
        pool_scorer = PoolScorer(ini)
        # score candidates in parallel (8 workers)
        scored_candidates = fastmap.fastmap(
            pool_scorer, 8,
            map(lambda candidate: (candidate, evidence[candidate],
                                   main_evidence[candidate], parsed_query),
                evidence.keys()))
    else:
        for candidate in evidence:
            scored_candidates.append(
                (candidate,
                 score_candidate(candidate, evidence[candidate],
                                 main_evidence[candidate], parsed_query),
                 evidence[candidate]))

    ####
    # clean up
    #
    if False:  # disabled: remove the de-tagged corpus files
        for i in xrange(0, html_count):
            try:
                os.unlink("%s/to_index/%s.txt" % (tmp_folder, i))
            except OSError:
                pass

    ####
    # show candidates
    #
    if False:  # disabled: debug dump of ranked candidates
        scored_candidates.sort(key=itemgetter(1), reverse=True)
        rank = 0
        for candidate_score in scored_candidates:
            candidate, score, cand_evidence = candidate_score
            print candidate
            print "\t", rank, score
            # printed = set()
            # for entry in cand_evidence:
            #     if not entry[0]['document'] in printed:
            #         print entry[0]['document'], entry[0]['score']
            #         printed.add(entry[0]['document'])
            # print ""
            rank += 1

    return (scored_candidates, parsed_query, path_to_index)
# Requires module-level: import logging, fastmap, pandas as pd, plus this
# project's generate_ffe_for_ticker, calculate_overall_r2, MIN_TRAINING_SIZE,
# X_DIMENSIONS and ListOfDicts (Python 3).
def generate_for_day(df: pd.DataFrame, dt=None, tickers=None, rounds=None) -> ListOfDicts:
    """
    Given the input DataFrame and the number of rounds, train <rounds> times
    and bag the results.
    """
    if dt:
        logging.info(
            f"Training {rounds} rounds with {len(df)} stocks in dataset for {dt}...")
    else:
        logging.info(
            f"Training {rounds} rounds with {len(df)} stocks in dataset...")
    if len(df) < MIN_TRAINING_SIZE:
        logging.warning(
            f"Skipping because the number of stocks < {MIN_TRAINING_SIZE}...")
        return []

    dt_str = dt and dt.strftime("%Y-%m-%d")
    if dt_str:
        # restrict to the requested trading day; without a date, use the whole frame
        df = df[df.date == dt_str]
    calculate_overall_r2(df, rounds)

    raw_estimate_by_ticker = {}
    if tickers:
        # drop requested tickers that are absent from the dataset
        unavailable_tickers = set(tickers) - set(df['ticker'])
        for ticker in unavailable_tickers:
            logging.warning(f"Ticker '{ticker}' not found in dataset. Continuing...")
            tickers.remove(ticker)
    else:
        tickers = set(df['ticker'])
    logging.info(f"Building models for {len(tickers)} stocks.")

    def _gfds(ticker):
        return ticker, generate_ffe_for_ticker(ticker, df=df, rounds=rounds)

    # fit the per-ticker models in parallel
    for ticker, estimate in fastmap.fastmap(_gfds, list(tickers)):
        market_cap = float(df[df.ticker == ticker]['marketCap'])
        logging.info("FFER for %r: %.4f", ticker, market_cap / estimate)
        raw_estimate_by_ticker[ticker] = estimate

    estimates = []
    for ticker, pred in raw_estimate_by_ticker.items():
        df_row = df[df.ticker == ticker].to_dict(orient='records')[0]
        row = {}
        if dt_str:
            row["date"] = dt_str
        row["ticker"] = ticker
        row["num_shares"] = df_row["shares"]
        row["actual_market_cap"] = df_row['marketCap']
        row["estimate_market_cap"] = pred
        row["actual_price"] = df_row["price"]
        row["estimate_price"] = row["estimate_market_cap"] / row["num_shares"]
        row["ffer"] = row["actual_price"] / row["estimate_price"]
        for col in X_DIMENSIONS:
            row[col] = df_row[col]
        estimates.append(row)

    # lowest fair-value ratio (most undervalued) first
    return sorted(estimates, key=lambda p: p["ffer"])
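# Usage sketch (hypothetical file and dates; assumes the CSV carries the
# `date`, `ticker`, `marketCap`, `shares`, `price` and X_DIMENSIONS columns
# the function reads):
#
#   import datetime
#   df = pd.read_csv("fundamentals.csv")
#   estimates = generate_for_day(df, dt=datetime.date(2020, 6, 1),
#                                tickers=["AAPL", "MSFT"], rounds=10)
#   for row in estimates[:5]:
#       print(row["ticker"], round(row["ffer"], 3))  # most undervalued first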