out = config["output"]

# --- Process queries -------------------------------------------------------
queries = extract_trec_million_queries(config["queries"])
queries_text = {}
# FIX: the fallback was `{}` (an empty dict), not an empty set — keep the
# type of `stoplist` consistent either way.
stoplist = set(stopwords.words("english")) if config["stopwords"] else set()

# Keep only the judged queries: collect query ids (column 0) from every
# qrels file in the MQ qrels directory.
qrels_MQ = set()
for file in os.listdir(config["qrels_MQ"]):
    qrels_MQ.update(read_values(os.path.join(config["qrels_MQ"], file), 0))

q_times = defaultdict(int)
print("Pre-process queries %d queries..." % len(qrels_MQ))
logging.info("Pre-process queries %d queries...", len(qrels_MQ))
for q in tqdm(qrels_MQ):
    q_text = clean(queries[q], config["stemmer"], stoplist)
    q_times[q_text] += 1  # count queries with duplicate cleaned content
    # Disambiguate duplicated query texts by appending an occurrence number.
    queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
        [q_text, str(q_times[q_text])])
    if q_times[q_text] > 1:
        logging.info("Duplicated: " + q + '\t' + ' '.join([q_text, str(q_times[q_text])]))

# --- Retrieve passages -----------------------------------------------------
logging.info("Retrieving ...")
print("Wait while retrieving passages for different queries ...")
temp_parameters = join(config["output"], "temp_Q.xml")
for q_id in qrels_MQ:
    q_txt = queries_text[q_id]
    logging.info(q_id + '\t' + q_txt)
    # Build an Indri-style <parameters> XML document for this query.
    # NOTE(review): the loop body continues beyond this excerpt.
    c_root = ET.Element("parameters")
# Read the analogy evaluation file, available at:
# https://word2vec.googlecode.com/svn/trunk/questions-words.txt
questions = 'questions-words.txt'
with open(questions, 'r') as eval_file:
    evals = eval_file.readlines()
num_sections = len([l for l in evals if l.startswith(':')])
print('total evaluation sentences: {} '.format(len(evals) - num_sections))
# total evaluation sentences: 19544

# --- Clean questions -------------------------------------------------------
print("Preprocessing questions ...")
# FIX: use context managers so the output file is flushed and closed before
# the accuracy evaluation below reads it back (it was never closed before).
with open("pre_processed_" + questions, "w") as out:
    for l in tqdm(evals):
        if l.startswith(':'):
            # Section headers (": capital-common-countries", ...) are copied
            # through unchanged.
            out.write(l)
        else:
            pre_processed = clean(l, "krovetz", [])
            out.write(pre_processed + "\n")
questions = "pre_processed_" + questions

# Load a pre-trained embedding model in word2vec text format.
# (The GoogleNews 100-billion-word model is available at
# https://code.google.com/p/word2vec/#Pre-trained_word_and_phrase_vectors)
google = Word2Vec.load_word2vec_format(
    '/home/thiziri/Documents/DOCTORAT/osirim_data/projets/iris/PROJETS/WEIR/collections/constructed/local_embeddings/Robust/Robust_skipgram_wordEmbedding_dim300_win10_minCount5.txt',
    binary=False)  # ('GoogleNews-vectors-negative300.bin', binary=True)

# Test the model accuracy.
w2v_model_accuracy(google, questions)
# Total sentences: 7614, Correct: 74.26%, Incorrect: 25.74%
# (took around 1hr45mins on a Mac Book Pro, 3.1 GHz Intel Core i7)
# Gather train/test topics; when the two files are the same, extract once.
if config["train_queries"] == config["test_queries"]:
    queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec" \
        else extract_trec_million_queries(config["train_queries"])
else:
    train_queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec" \
        else extract_trec_million_queries(config["train_queries"])
    test_queries = extractTopics(config["test_queries"]) if config["test_queries_format"] == "trec" \
        else extract_trec_million_queries(config["test_queries"])
    queries = {**train_queries, **test_queries}
print("{n} queries to process.".format(n=len(queries)))

queries_text = {}
q_times = defaultdict(int)
print("Preprocess queries ...")
for q in tqdm(queries):
    # FIX: pass an empty set (not an empty dict) as the stop-word collection.
    q_text = clean(queries[q], "krovetz", set())
    q_times[q_text] += 1
    # Disambiguate duplicated query texts by appending an occurrence number.
    queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
        [q_text, str(q_times[q_text])])

out_trec_f = join(config["output_folder"], "trec_corpus.txt")
out_t = codecs.open(out_trec_f, "w", encoding='utf8')
print("Collection2Text ...")
nl = 0
relations = []
if config["from_qrels"]:
    qrels = get_qrels(config["relevance_judgements"])  # qrels[(q, doc)] = rel  q:str, rel:int
    # All documents that appear in the judgements.
    # NOTE(review): this branch continues beyond this excerpt.
    ranked_documents = {e[1] for e in qrels}
logging.info('Config: ' + json.dumps(config, indent=2)) print("Data extraction\nConfiguration: ") print(json.dumps(config, indent=2), end='\n') print("Reading index ...") index = pyndri.Index(config["indexed_data"]) _, id2token, _ = index.get_dictionary() externelDocId = {} for doc in range(index.document_base(), index.maximum_document()): extD, _ = index.document(doc) externelDocId[extD] = doc queries = extractTopics(config["queries"]) queries_text = {} for q in queries: queries_text[q] = clean(queries[q], "krovetz", {}) out_trec_f = join(config["output_folder"], "trec_corpus.txt") out_t = codecs.open(out_trec_f, "w", encoding='utf8') qrels = get_qrels( config["relevance_judgements"] ) # dictionary: qrels[(q,doc)] = rel with q and rel are ints print("Collection2Text ...") nl = 0 relations = [] if bool(config["relevance_judgements"]) and not bool( config["run_file"]) and not bool(config["runs_folder"]): ranked_documents = set([e[1] for e in qrels]) print("totalling: %d documents" % len(ranked_documents)) nl = save_corpus(queries_text, ranked_documents, index, id2token,