out = config["output"]

    # process queries:
    queries = extract_trec_million_queries(config["queries"])
    queries_text = {}
    stoplist = set(stopwords.words("english")) if config["stopwords"] else set()
    qrels_MQ = []  # get only judged queries
    for file in os.listdir(config["qrels_MQ"]):
        qrels_MQ += list(read_values(os.path.join(config["qrels_MQ"], file),
                                     0))
    qrels_MQ = set(qrels_MQ)
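    # read_values(path, col) is assumed to yield column `col` of each
    # whitespace-separated qrels line, so qrels_MQ holds the judged query ids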
    q_times = defaultdict(int)
    print("Pre-process queries %d queries..." % len(qrels_MQ))
    logging.info("Pre-process queries %d queries..." % len(qrels_MQ))
    for q in tqdm(qrels_MQ):
        q_text = clean(queries[q], config["stemmer"], stoplist)
        q_times[q_text] += 1  # queries with duplicate content
        queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
            [q_text, str(q_times[q_text])])
        if q_times[q_text] > 1:
            logging.info("Duplicated: " + q + '\t' +
                         ' '.join([q_text, str(q_times[q_text])]))

    # retrieve passages
    logging.info("Retrieving ...")
    print("Wait while retrieving passages for different queries ...")
    temp_parameters = join(config["output"], "temp_Q.xml")
    for q_id in qrels_MQ:
        q_txt = queries_text[q_id]
        logging.info(q_id + '\t' + q_txt)
        c_root = ET.Element("parameters")
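
# The fragment above stops right after creating the <parameters> root element.
# A minimal sketch of how the per-query Indri parameter file might be
# completed, assuming IndriRunQuery's standard XML format (the tags actually
# used by the original script are not shown here):
#
#     c_query = ET.SubElement(c_root, "query")
#     ET.SubElement(c_query, "number").text = q_id
#     ET.SubElement(c_query, "text").text = q_txt
#     ET.SubElement(c_root, "count").text = "1000"
#     ET.ElementTree(c_root).write(temp_parameters)

# Hypothetical sketch of the clean() helper used above; the real
# implementation lives elsewhere in the repo. Assumed behaviour: lowercase,
# tokenize, drop stopwords, then stem (via the krovetzstemmer package when
# "krovetz" is requested):
import re
from krovetzstemmer import Stemmer  # assumed dependency

def clean(text, stemmer, stoplist):
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    tokens = [t for t in tokens if t not in stoplist]
    if stemmer == "krovetz":
        ks = Stemmer()
        tokens = [ks.stem(t) for t in tokens]
    return ' '.join(tokens)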
# read the analogy evaluation file, available at:
# https://word2vec.googlecode.com/svn/trunk/questions-words.txt
questions = 'questions-words.txt'
with open(questions, 'r') as f:
    evals = f.readlines()
num_sections = sum(1 for l in evals if l.startswith(':'))
print('total evaluation sentences: {}'.format(len(evals) - num_sections))
# total evaluation sentences: 19544

# clean questions
print("Preprocessing questions ...")
with open("pre_processed_" + questions, "w") as out:
    for l in tqdm(evals):
        if l.startswith(':'):  # keep the analogy section headers unchanged
            out.write(l)
        else:
            pre_processed = clean(l, "krovetz", set())
            out.write(pre_processed + "\n")

questions = "pre_processed_" + questions

# load pre-trained word vectors; the GoogleNews model (100 billion words) is
# available at:
# https://code.google.com/p/word2vec/#Pre-trained_word_and_phrase_vectors
# here, a locally trained Robust skip-gram model is loaded instead
google = Word2Vec.load_word2vec_format(
    '/home/thiziri/Documents/DOCTORAT/osirim_data/projets/iris/PROJETS/WEIR/collections/constructed/local_embeddings/Robust/Robust_skipgram_wordEmbedding_dim300_win10_minCount5.txt',
    binary=False)  # for GoogleNews: ('GoogleNews-vectors-negative300.bin', binary=True)
# test the model accuracy*
w2v_model_accuracy(google, questions)
#Total sentences: 7614, Correct: 74.26%, Incorrect: 25.74%

# * took around 1h45min on a MacBook Pro (3.1 GHz Intel Core i7)
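
# Hypothetical sketch of w2v_model_accuracy, assuming the pre-4.0 gensim
# analogy API (Word2Vec.accuracy); the real helper is defined elsewhere:
def w2v_model_accuracy(model, questions_file):
    sections = model.accuracy(questions_file)
    total = sections[-1]  # gensim appends an aggregate 'total' section
    correct, incorrect = len(total['correct']), len(total['incorrect'])
    n = correct + incorrect
    print('Total sentences: {}, Correct: {:.2f}%, Incorrect: {:.2f}%'.format(
        n, 100.0 * correct / n, 100.0 * incorrect / n))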
    if config["train_queries"] == config["test_queries"]:
        queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec"\
            else extract_trec_million_queries(config["train_queries"])
    else:
        train_queries = extractTopics(config["train_queries"]) if config["train_queries_format"] == "trec" \
            else extract_trec_million_queries(config["train_queries"])
        test_queries = extractTopics(config["test_queries"]) if config["test_queries_format"] == "trec" \
            else extract_trec_million_queries(config["test_queries"])
        queries = {**train_queries, **test_queries}
    print("{n} queries to process.".format(n=len(queries)))

    queries_text = {}
    q_times = defaultdict(int)
    print("Preprocess queries ...")
    for q in tqdm(queries):
        q_text = clean(queries[q], "krovetz", set())
        q_times[q_text] += 1
        queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
            [q_text, str(q_times[q_text])])

    out_trec_f = join(config["output_folder"], "trec_corpus.txt")
    out_t = codecs.open(out_trec_f, "w", encoding='utf8')

    print("Collection2Text ...")
    nl = 0
    relations = []

    if config["from_qrels"]:
        qrels = get_qrels(config["relevance_judgements"])
        # qrels[(q, doc)] = rel, where q is a str and rel an int
        ranked_documents = {e[1] for e in qrels}
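
# Hypothetical sketches of two helpers used above; both are defined elsewhere
# in the repo. get_qrels is assumed to parse standard TREC qrels lines
# ("qid iter docno rel"); extract_trec_million_queries is assumed to read a
# directory of MQ-track topic files with one "qid:query text" pair per line:
import os

def get_qrels(qrels_file):
    qrels = {}
    with open(qrels_file) as f:
        for line in f:
            q, _, doc, rel = line.split()
            qrels[(q, doc)] = int(rel)
    return qrels

def extract_trec_million_queries(path):
    queries = {}
    for fname in os.listdir(path):
        with open(os.path.join(path, fname)) as f:
            for line in f:
                q_id, _, q_text = line.strip().partition(':')
                queries[q_id] = q_text
    return queries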
    logging.info('Config: ' + json.dumps(config, indent=2))

    print("Data extraction\nConfiguration: ")
    print(json.dumps(config, indent=2), end='\n')

    print("Reading index ...")
    index = pyndri.Index(config["indexed_data"])
    _, id2token, _ = index.get_dictionary()
    externelDocId = {}
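    # map each external (collection) document id to its internal pyndri id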
    for doc in range(index.document_base(), index.maximum_document()):
        extD, _ = index.document(doc)
        externelDocId[extD] = doc
    queries = extractTopics(config["queries"])
    queries_text = {}
    for q in queries:
        queries_text[q] = clean(queries[q], "krovetz", set())

    out_trec_f = join(config["output_folder"], "trec_corpus.txt")
    out_t = codecs.open(out_trec_f, "w", encoding='utf8')
    qrels = get_qrels(config["relevance_judgements"])
    # dictionary: qrels[(q, doc)] = rel, where q and rel are ints

    print("Collection2Text ...")
    nl = 0
    relations = []
    if config["relevance_judgements"] and not config["run_file"] \
            and not config["runs_folder"]:
        ranked_documents = {e[1] for e in qrels}
        print("totalling: %d documents" % len(ranked_documents))
        nl = save_corpus(queries_text, ranked_documents, index, id2token,