Code Example #1
File: VSM.py  Project: kensk8er/QueryAsDocument
    enpickle(similarities, 'cache/similarities_' + date_time + '.pkl')
    enpickle(resume_indices, 'cache/resume_indices_' + date_time + '.pkl')
    enpickle(job_indices, 'cache/job_indices_' + date_time + '.pkl')

    # pick up n-most similar job posts and show them
    print 'pick up', n_result, 'most similar job posts for each resume...'
    results = get_n_most_similar_job_posts(similarity_matrix=similarities,
                                           n=n_result,
                                           resume_index_list=range(n_resume))  # resumes come after job posts

    print 'show recommendation results for each resume:\n'
    show_recommendation_results(result_lists=results, resume_indices=resume_indices, job_indices=job_indices)

    # calculate each metric based on relevancy judgements
    print 'load relevancy judgements...'
    relevancy_judgements = unpickle('data/relevancy/relevancy.pkl')

    print 'convert relevancy judgements into appropriate format...'
    relevancy_judgements = convert_relevancy_judgements(relevancy_judgements, job_indices, resume_indices)

    # calculate recall, precision, and f-score
    # note that this precision is same as precision@k
    print 'calculate precision, recall, and fscore...'
    recall_precision_fscores = calculate_recall_precision_fscore(results, relevancy_judgements, resume_indices)
    enpickle(recall_precision_fscores, 'result/recall_precision_fscores.pkl')

    print 'calculate average precision...'
    average_precision = calculate_average_precision(results, relevancy_judgements, resume_indices)
    enpickle(average_precision, 'result/average_precision.pkl')

    print 'calculate mean-average precision...'
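
The snippet above evaluates the recommendations with precision@k, recall, F-score, and (mean) average precision. As a minimal, self-contained sketch of those two ranking metrics for a single result list (hypothetical helpers, not the project's calculate_* functions):

# Hypothetical sketch of precision@k and average precision for one ranked list;
# the project's calculate_* functions work on the pickled result/relevancy
# structures instead.
def precision_at_k(ranked_ids, relevant_ids, k):
    hits = sum(1 for doc_id in ranked_ids[:k] if doc_id in relevant_ids)
    return float(hits) / k


def average_precision(ranked_ids, relevant_ids):
    hits, score = 0, 0.0
    for rank, doc_id in enumerate(ranked_ids, start=1):
        if doc_id in relevant_ids:
            hits += 1
            score += float(hits) / rank
    return score / len(relevant_ids) if relevant_ids else 0.0


# e.g. precision_at_k(['job3', 'job1', 'job9'], {'job1', 'job9'}, 3) -> 0.66...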
Code Example #2
    # load data
    report_dict = corpora.Dictionary.load('data/dictionary/report_(NN).dict')
    report_corpus = report_dict.corpus

    if use_wiki is True:
        # wiki_dict = corpora.Dictionary.load('data/dictionary/wiki_(NN).dict')
        # wiki_corpus = wiki_dict.corpus
        #
        # logging.info('combine report and wiki dictionary...')
        # wiki_to_report = report_dict.merge_with(wiki_dict)
        # merged_dict = report_dict
        #
        # logging.info('combine report and wiki corpus...')
        # merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus
        logging.info('generate wiki corpus...')
        wiki_txt = unpickle('data/txt/processed_wiki.pkl')
        wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

        logging.info('combine report and wiki corpus...')
        merged_corpus = wiki_corpus + report_corpus

    # compute TFIDF
    # logging.info('compute TFIDF...')
    # tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

    # perform LDA
    logging.info('perform LDA...')
    if use_wiki is True:
        lda = LdaModel(corpus=merged_corpus,
                       id2word=report_dict,
                       num_topics=num_topics,
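
The truncated LdaModel call above appears in full in Code Example #8; it fits an LDA model over the merged report/wiki bag-of-words corpus. A minimal, self-contained gensim sketch of the same dictionary -> doc2bow -> LdaModel flow, using toy documents instead of the project's pickled data:

# Toy gensim LDA example (assumed toy documents, not the project's data).
from gensim import corpora
from gensim.models import LdaModel

texts = [['report', 'topic', 'model'], ['wiki', 'topic', 'corpus']]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=2, passes=5)
print(lda.print_topics())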
Code Example #3

if __name__ == '__main__':
    # hyper-parameters
    allowed_pos = re.compile('(NN)')
    max_doc = float('inf')
    title_weight = 3

    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    # expand stopwords list
    stop_words = extended_stopwords

    logging.info('load documents...')
    documents = unpickle('data/txt/documents.pkl')

    logging.info('lemmatize...')
    count = 0
    doc_num = len(documents)
    new_documents = []
    titles = []
    froms = []
    dates = []
    for index, document in documents.items():
        count += 1
        if count > max_doc:
            break

        print '\r', count, '/', doc_num,
        text = document['text'] + (' ' + index) * title_weight  # incorporate title information
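
The last line implements the title weighting: each dictionary key doubles as the document title, so appending it title_weight times simply repeats the title tokens before lemmatization. A tiny illustration with assumed toy values:

# Illustration of the title-weighting trick above (assumed toy values).
title_weight = 3
index = 'Data Scientist'                        # document key used as the title
document = {'text': 'analyze large data sets'}
text = document['text'] + (' ' + index) * title_weight
print(text)
# -> 'analyze large data sets Data Scientist Data Scientist Data Scientist'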
Code Example #4
    # calculate norms
    print 'calculate norms...'
    norms = np.sqrt(np.multiply(document_matrix, document_matrix).sum(1))
    norm_matrix = np.dot(norms, norms.T)

    # calculate similarities
    print 'calculate similarities...'
    similarity_matrix = inner_product_matrix / norm_matrix

    return similarity_matrix


if __name__ == '__main__':
    print 'read documents...'
    documents = unpickle('data/txt/documents.pkl')
    doc_num = len(documents)

    # convert dictionary format into list format
    print 'convert dictionary into list format...'
    doc_lists, doc_indices = convert_dict_list(documents)

    # Perform an IDF normalization on the output of HashingVectorizer
    hasher = HashingVectorizer(stop_words='english', non_negative=True,
                               norm=None, binary=False)
    vectorizer = Pipeline((
        ('hasher', hasher),
        ('tf_idf', TfidfTransformer())  # TODO: you should try many different parameters here
    ))

    # reduce the number of documents for now
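
The similarity computation at the top of this example is plain cosine similarity between document row vectors: divide the inner-product matrix by the outer product of the row norms. A minimal numpy sketch with a dense toy matrix (assumed shapes: one document per row):

import numpy as np

# Cosine similarity between row vectors, mirroring the inner_product_matrix /
# norm_matrix computation above (dense toy matrix).
document_matrix = np.array([[1.0, 0.0, 2.0],
                            [0.0, 3.0, 1.0]])
inner_product_matrix = np.dot(document_matrix, document_matrix.T)
norms = np.sqrt((document_matrix ** 2).sum(axis=1))
norm_matrix = np.outer(norms, norms)
similarity_matrix = inner_product_matrix / norm_matrix
print(similarity_matrix)  # diagonal entries are 1.0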
Code Example #5
def gen_p_z_d(p_dz, p_d):
    """
    Generate P(z|d) out of P(d,z), and P(d).

    Based on Bayes' rule: P(z|d) = P(d,z) / P(d)

    :param p_dz: P(z,d)
    :param p_d: P(d)
    :return: P(z|d)
    """
    return (p_dz / p_d.reshape((-1, 1))).T


if __name__ == "__main__":
    p_w_z = unpickle("result/plsa/p_w_z.pkl")
    p_d_z = unpickle("result/plsa/p_d_z.pkl")
    p_z = unpickle("result/plsa/p_z.pkl")

    print "computing P(w)..."
    p_w = gen_p_w(p_w_z, p_z)

    print "computing P(z,w)..."
    p_wz = gen_p_wz(p_w_z, p_z)

    print "computing P(z|w)..."
    p_z_w = gen_p_z_w(p_wz, p_w)

    # print 'computing P(w|z) / P(w) = P(z,w) / {P(z) * P(w)}...'
    # p_w_z_w = gen_p_w_z_w(p_w_z, p_w)
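
gen_p_z_d and the other gen_p_* helpers are simple Bayes-rule rearrangements of the pickled PLSA distributions. A small numpy illustration of the P(z|d) = P(d,z) / P(d) step with toy values (documents on rows, topics on columns):

import numpy as np

# Toy P(d,z) with 2 documents (rows) and 3 topics (columns).
p_dz = np.array([[0.10, 0.25, 0.15],
                 [0.20, 0.10, 0.20]])
p_d = p_dz.sum(axis=1)                    # marginal P(d) = sum over z of P(d,z)
p_z_d = (p_dz / p_d.reshape((-1, 1))).T   # Bayes' rule: P(z|d) = P(d,z) / P(d)
print(p_z_d.sum(axis=0))                  # each column sums to 1 over z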
Code Example #6
def gen_p_z_d(p_dz, p_d):
    """
    Generate P(z|d) out of P(d,z), and P(d).

    Based on Bayes' rule: P(z|d) = P(d,z) / P(d)

    :param p_dz: P(z,d)
    :param p_d: P(d)
    :return: P(z|d)
    """
    return (p_dz / p_d.reshape((-1, 1))).T


if __name__ == '__main__':
    p_w_z = unpickle('result/plsa/p_w_z.pkl')
    p_d_z = unpickle('result/plsa/p_d_z.pkl')
    p_z = unpickle('result/plsa/p_z.pkl')

    print 'computing P(w)...'
    p_w = gen_p_w(p_w_z, p_z)

    print 'computing P(z,w)...'
    p_wz = gen_p_wz(p_w_z, p_z)

    print 'computing P(z|w)...'
    p_z_w = gen_p_z_w(p_wz, p_w)

    #print 'computing P(w|z) / P(w) = P(z,w) / {P(z) * P(w)}...'
    #p_w_z_w = gen_p_w_z_w(p_w_z, p_w)
Code Example #7
    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis


if __name__ == '__main__':
    # hyper-parameters
    allowed_pos = re.compile('(NN)')
    crawl = False

    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    if crawl is True:
        wikis = crawl_wiki()
    else:
        wikis = unpickle('data/txt/wiki.pkl')

    # expand stopwords list
    stop_words = extended_stopwords

    logging.info('lemmatize...')
    count = 0
    doc_num = len(wikis)
    new_wikis = []
    keywords = []
    for keyword, wiki in wikis.items():
        count += 1

        print '\r', count, '/', doc_num,
        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters
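
clean_text is a project helper whose implementation is not shown in this example; as a hedged, hypothetical stand-in for the kind of character cleanup it presumably performs (assumed behavior, not the project's actual code):

import re

# Hypothetical stand-in for the project's clean_text helper: drop tag-like
# markup and non-alphanumeric characters, then collapse whitespace.
def clean_text(text):
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

print(clean_text('Machine learning (ML) <ref>...</ref> is useful!'))
# -> 'Machine learning ML is useful'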
Code Example #8
    # load data
    report_dict = corpora.Dictionary.load('data/dictionary/report_(NN).dict')
    report_corpus = report_dict.corpus

    if use_wiki is True:
        # wiki_dict = corpora.Dictionary.load('data/dictionary/wiki_(NN).dict')
        # wiki_corpus = wiki_dict.corpus
        #
        # logging.info('combine report and wiki dictionary...')
        # wiki_to_report = report_dict.merge_with(wiki_dict)
        # merged_dict = report_dict
        #
        # logging.info('combine report and wiki corpus...')
        # merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus
        logging.info('generate wiki corpus...')
        wiki_txt = unpickle('data/txt/processed_wiki.pkl')
        wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

        logging.info('combine report and wiki corpus...')
        merged_corpus = wiki_corpus + report_corpus

    # compute TFIDF
    # logging.info('compute TFIDF...')
    # tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

    # perform LDA
    logging.info('perform LDA...')
    if use_wiki is True:
        lda = LdaModel(corpus=merged_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, alpha='auto', chunksize=chunksize)
        lda.save('result/model_wiki.lda')
Code Example #9
    return wikis


if __name__ == '__main__':
    # hyper-parameters
    allowed_pos = re.compile('(NN)')
    crawl = False

    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.DEBUG)

    if crawl is True:
        wikis = crawl_wiki()
    else:
        wikis = unpickle('data/txt/wiki.pkl')

    # expand stopwords list
    stop_words = extended_stopwords

    logging.info('lemmatize...')
    count = 0
    doc_num = len(wikis)
    new_wikis = []
    keywords = []
    for keyword, wiki in wikis.items():
        count += 1

        print '\r', count, '/', doc_num,
        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters