enpickle(similarities, 'cache/similarities_' + date_time + '.pkl')
enpickle(resume_indices, 'cache/resume_indices_' + date_time + '.pkl')
enpickle(job_indices, 'cache/job_indices_' + date_time + '.pkl')

# pick up the n most similar job posts and show them
print 'pick up', n_result, 'most similar job posts for each resume...'
results = get_n_most_similar_job_posts(similarity_matrix=similarities, n=n_result,
                                       resume_index_list=range(n_resume))  # resumes come after job posts

print 'show recommendation results for each resume:\n'
show_recommendation_results(result_lists=results, resume_indices=resume_indices, job_indices=job_indices)

# calculate each metric based on relevancy judgements
print 'load relevancy judgements...'
relevancy_judgements = unpickle('data/relevancy/relevancy.pkl')

print 'convert relevancy judgements into appropriate format...'
relevancy_judgements = convert_relevancy_judgements(relevancy_judgements, job_indices, resume_indices)

# calculate recall, precision, and F-score
# note that this precision is the same as precision@k
print 'calculate precision, recall, and fscore...'
recall_precision_fscores = calculate_recall_precision_fscore(results, relevancy_judgements, resume_indices)
enpickle(recall_precision_fscores, 'result/recall_precision_fscores.pkl')

print 'calculate average precision...'
average_precision = calculate_average_precision(results, relevancy_judgements, resume_indices)
enpickle(average_precision, 'result/average_precision.pkl')

print 'calculate mean average precision...'
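
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): how precision@k,
# recall, and average precision are conventionally computed for one ranked
# result list and a set of relevant items. The project's own
# calculate_recall_precision_fscore / calculate_average_precision helpers are
# not shown here, so this is only an assumed, minimal reference version.
# ---------------------------------------------------------------------------
def precision_recall_at_k(ranked_items, relevant_items, k):
    """Return (precision@k, recall@k) for a single ranked result list."""
    top_k = ranked_items[:k]
    hits = len(set(top_k) & set(relevant_items))
    precision = float(hits) / k
    recall = float(hits) / len(relevant_items) if relevant_items else 0.0
    return precision, recall


def average_precision_single(ranked_items, relevant_items):
    """Average precision: mean of precision@i over the ranks i that hit a relevant item."""
    hits = 0
    precisions = []
    for i, item in enumerate(ranked_items, start=1):
        if item in relevant_items:
            hits += 1
            precisions.append(float(hits) / i)
    return sum(precisions) / len(relevant_items) if relevant_items else 0.0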
if __name__ == '__main__':
    # hyper-parameters
    allowed_pos = re.compile('(NN)')
    max_doc = float('inf')
    title_weight = 3

    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    # expand stopwords list
    stop_words = extended_stopwords

    logging.info('load documents...')
    documents = unpickle('data/txt/documents.pkl')

    logging.info('lemmatize...')
    count = 0
    doc_num = len(documents)
    new_documents = []
    titles = []
    froms = []
    dates = []
    for index, document in documents.items():
        count += 1
        if count > max_doc:
            break
        print '\r', count, '/', doc_num,

        text = document['text'] + (' ' + index) * title_weight  # incorporate title information
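        # -------------------------------------------------------------------
        # Illustrative sketch (not part of the original loop): one way the
        # lemmatization step can be realised with NLTK, keeping only tokens
        # whose POS tag matches allowed_pos and that are not stop words. The
        # variable name lemmas is made up here; the lines are left commented
        # out so they add no NLTK data dependency to the original pipeline.
        # -------------------------------------------------------------------
        # from nltk import pos_tag, word_tokenize
        # from nltk.stem import WordNetLemmatizer
        # lemmatizer = WordNetLemmatizer()
        # tagged = pos_tag(word_tokenize(text.lower()))
        # lemmas = [lemmatizer.lemmatize(word) for word, pos in tagged
        #           if allowed_pos.match(pos) and word not in stop_words]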
    # calculate norms
    print 'calculate norms...'
    norms = np.sqrt(np.multiply(document_matrix, document_matrix).sum(1))
    norm_matrix = np.dot(norms, norms.T)

    # calculate similarities
    print 'calculate similarities...'
    similarity_matrix = inner_product_matrix / norm_matrix

    return similarity_matrix


if __name__ == '__main__':
    print 'read documents...'
    documents = unpickle('data/txt/documents.pkl')
    doc_num = len(documents)

    # convert dictionary format into list format
    print 'convert dictionary into list format...'
    doc_lists, doc_indices = convert_dict_list(documents)

    # Perform an IDF normalization on the output of HashingVectorizer
    hasher = HashingVectorizer(stop_words='english', non_negative=True, norm=None, binary=False)
    vectorizer = Pipeline((
        ('hasher', hasher),
        ('tf_idf', TfidfTransformer())  # TODO: you should try many different parameters here
    ))

    # reduce the number of documents for now
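
    # -----------------------------------------------------------------------
    # Illustrative check (not part of the original script): the norm and
    # inner-product arithmetic in the function above is plain cosine
    # similarity, so on a small dense example it matches scikit-learn's
    # pairwise helper. toy_matrix is made up purely for this demonstration.
    # -----------------------------------------------------------------------
    from sklearn.metrics.pairwise import cosine_similarity
    toy_matrix = np.matrix([[1.0, 0.0, 2.0],
                            [0.0, 3.0, 1.0],
                            [1.0, 1.0, 0.0]])
    toy_inner = np.dot(toy_matrix, toy_matrix.T)
    toy_norms = np.sqrt(np.multiply(toy_matrix, toy_matrix).sum(1))
    assert np.allclose(toy_inner / np.dot(toy_norms, toy_norms.T),
                       cosine_similarity(np.asarray(toy_matrix)))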
def gen_p_z_d(p_dz, p_d):
    """
    Generate P(z|d) out of P(d,z) and P(d), using the definition of
    conditional probability: P(z|d) = P(d,z) / P(d)

    :param p_dz: P(d,z)
    :param p_d: P(d)
    :return: P(z|d)
    """
    return (p_dz / p_d.reshape((-1, 1))).T


if __name__ == "__main__":
    p_w_z = unpickle("result/plsa/p_w_z.pkl")
    p_d_z = unpickle("result/plsa/p_d_z.pkl")
    p_z = unpickle("result/plsa/p_z.pkl")

    print "computing P(w)..."
    p_w = gen_p_w(p_w_z, p_z)

    print "computing P(z,w)..."
    p_wz = gen_p_wz(p_w_z, p_z)

    print "computing P(z|w)..."
    p_z_w = gen_p_z_w(p_wz, p_w)

    # print 'computing P(w|z) / P(w) = P(z,w) / {P(z) * P(w)}...'
    # p_w_z_w = gen_p_w_z_w(p_w_z, p_w)
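
    # -----------------------------------------------------------------------
    # Illustrative check (not part of the original script): on a tiny
    # synthetic joint distribution, gen_p_z_d returns one column per document,
    # each a proper conditional distribution P(z|d) that sums to one.
    # -----------------------------------------------------------------------
    import numpy as np
    p_dz_toy = np.array([[0.10, 0.20],   # toy P(d,z): 3 documents x 2 topics
                         [0.05, 0.25],
                         [0.30, 0.10]])
    p_d_toy = p_dz_toy.sum(axis=1)       # P(d) by marginalising over z
    p_z_d_toy = gen_p_z_d(p_dz_toy, p_d_toy)
    assert np.allclose(p_z_d_toy.sum(axis=0), 1.0)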
    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis


if __name__ == '__main__':
    # hyper-parameters
    allowed_pos = re.compile('(NN)')
    crawl = False

    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    if crawl is True:
        wikis = crawl_wiki()
    else:
        wikis = unpickle('data/txt/wiki.pkl')

    # expand stopwords list
    stop_words = extended_stopwords

    logging.info('lemmatize...')
    count = 0
    doc_num = len(wikis)
    new_wikis = []
    keywords = []
    for keyword, wiki in wikis.items():
        count += 1
        print '\r', count, '/', doc_num,

        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters
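        # -------------------------------------------------------------------
        # Illustrative sketch (not part of the original loop): clean_text is
        # the project's own helper and is not shown in this section; a
        # minimal, hypothetical version might simply strip non-alphabetic
        # characters and collapse whitespace, e.g.:
        #
        #     cleaned = re.sub(r'[^a-zA-Z\s]', ' ', text)
        #     cleaned = re.sub(r'\s+', ' ', cleaned).strip().lower()
        # -------------------------------------------------------------------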
# load data
report_dict = corpora.Dictionary.load('data/dictionary/report_(NN).dict')
report_corpus = report_dict.corpus

if use_wiki is True:
    # wiki_dict = corpora.Dictionary.load('data/dictionary/wiki_(NN).dict')
    # wiki_corpus = wiki_dict.corpus
    #
    # logging.info('combine report and wiki dictionary...')
    # wiki_to_report = report_dict.merge_with(wiki_dict)
    # merged_dict = report_dict
    #
    # logging.info('combine report and wiki corpus...')
    # merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus

    logging.info('generate wiki corpus...')
    wiki_txt = unpickle('data/txt/processed_wiki.pkl')
    wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

    logging.info('combine report and wiki corpus...')
    merged_corpus = wiki_corpus + report_corpus

# compute TFIDF
# logging.info('compute TFIDF...')
# tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

# perform LDA
logging.info('perform LDA...')
if use_wiki is True:
    lda = LdaModel(corpus=merged_corpus, id2word=report_dict, num_topics=num_topics,
                   passes=passes, iterations=iterations, alpha='auto', chunksize=chunksize)
    lda.save('result/model_wiki.lda')
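
    # -----------------------------------------------------------------------
    # Illustrative follow-up (not part of the original script): a quick way to
    # eyeball the trained model is to print the top words of a few topics;
    # the number of topics shown here is arbitrary.
    # -----------------------------------------------------------------------
    lda.print_topics(5)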