def update_score_inplace(abstr_list, relevant_docs, irrelevant_docs,
                         mu_corpus=None):
    """Update in place the 'score' field of JSON-like documents.

    The relevance score of a given abstract a is (math notation):
       max {cosine(a,r) | r relevant} - max {cosine(a,i) | i irrelevant}
    The difference is rounded to 3 decimals, then multiplied by 10.

    When 'relevant_docs' or 'irrelevant_docs' is empty, the matching
    maximum is taken to be 0.0 instead of letting max() raise ValueError
    on an empty sequence.

    Arguments:
      abstr_list: documents with a 'text' field; each of them gets its
         'score' field set.
      relevant_docs: documents with a 'tfidf' field.
      irrelevant_docs: documents with a 'tfidf' field.
      mu_corpus: passed as is to tfidf.compute_from_texts(). Defaults
         to a fresh empty list on every call.
    """
    # A literal [] default would be one list object shared by all calls.
    if mu_corpus is None:
        mu_corpus = []

    new_texts = [abstr['text'] for abstr in abstr_list]
    new_tfidfs = tfidf.compute_from_texts(new_texts, mu_corpus)

    for (doc, new_tfidf) in zip(abstr_list, new_tfidfs):
        cosine_relevant = [
            tfidf.cosine(new_tfidf, relevant_doc['tfidf'])
            for relevant_doc in relevant_docs
        ]
        cosine_irrelevant = [
            tfidf.cosine(new_tfidf, irrelevant_doc['tfidf'])
            for irrelevant_doc in irrelevant_docs
        ]
        # Guard the empty case: max([]) raises ValueError.
        best_relevant = max(cosine_relevant) if cosine_relevant else 0.0
        best_irrelevant = (
            max(cosine_irrelevant) if cosine_irrelevant else 0.0
        )
        doc['score'] = 10 * round(best_relevant - best_irrelevant, 3)
def post(self):
    """Record the relevance feedback posted by a user.

    The request carries a 'uid', a 'checksum' and one parameter per
    PMID whose value is 'Yes' (relevant) or 'No' (irrelevant). The
    stored relevant/irrelevant documents of the user are updated as
    follows:
      1. entries whose PMID is part of the request are discarded (the
         user may have changed their mind),
      2. the posted PMIDs are fetched with eUtils.fetch_ids(), reduced
         to the fields 'pmid', 'title' and 'tfidf', and appended to the
         matching list,
      3. both lists are saved as zlib-compressed JSON.

    Returns without writing any response when the 'checksum' parameter
    is missing or when validate_pmid() rejects the posted PMIDs.
    """
    # Who is it? Get it from the POST parameters.
    uid = self.request.get('uid')
    data = models.UserData.get_by_key_name(uid)

    # Check that POST is issued from PubCron mail.
    # NOTE(review): the value returned by validate_request() is never
    # consulted; the guard below only tests that a 'checksum' parameter
    # is present. Confirm that validate_request() rejects bad requests
    # on its own, otherwise this identity check is ineffective.
    checksum = self.validate_request(data)
    if not self.request.get('checksum'):
        # Could not check identity (hacked?!!): good-bye.
        return

    # Identity check successful. Do the update.
    new_relevant_pmids = []
    new_irrelevant_pmids = []

    # Process key/value pairs.
    for name in self.request.arguments():
        # NB: only PMID update correspond to 'name' equal to
        # "Yes" or "No". The other cases are either no answer
        # or non PMID POST parameters (like uid or checksum).
        answer = self.request.get(name)
        if answer == 'Yes':
            new_relevant_pmids.append(name)
        elif answer == 'No':
            new_irrelevant_pmids.append(name)

    # It is unlikely that a malicious request went
    # until here, but because we are about to save user-
    # submitted data, we do a validity (security) check.
    pmids_to_update = new_relevant_pmids + new_irrelevant_pmids
    if not self.validate_pmid(pmids_to_update):
        # Validation failed: good-bye.
        return

    # From here, PMIDs have been parsed and checked.
    # Now recall and parse user JSON data.
    mu_corpus = utils.decrypt(data, 'mu_corpus')
    relevant_docs = utils.decrypt(data, 'relevant_docs')
    irrelevant_docs = utils.decrypt(data, 'irrelevant_docs')

    # Clear new docs from user data (in case users are notifying
    # that they change their mind on relevance). The lists are
    # rebuilt with a slice assignment: calling remove() on a list
    # while looping over it skips the item that follows each removal.
    for doc_list in (relevant_docs, irrelevant_docs):
        doc_list[:] = [
            doc for doc in doc_list
            if doc.get('pmid') not in pmids_to_update
        ]

    # Now, get the PubMed data and compute tf-idf.
    for (new_ids, doc_list) in (
            (new_relevant_pmids, relevant_docs),
            (new_irrelevant_pmids, irrelevant_docs)):
        new_docs = eUtils.fetch_ids(new_ids)
        new_tfidf = tfidf.compute_from_texts(
            [abstr.get('text', '') for abstr in new_docs],
            mu_corpus.values()
        )
        for (doc, tfidf_dict) in zip(new_docs, new_tfidf):
            # Keep only fields 'pmid' and 'title'. Loop over a
            # snapshot of the keys because the dict shrinks meanwhile.
            for field_name in list(doc.keys()):
                if field_name not in ('pmid', 'title'):
                    doc.pop(field_name, None)
            # Add field 'tfidf'.
            doc['tfidf'] = tfidf_dict
        # Append to user data, then clean up the extended list.
        doc_list.extend(new_docs)
        and_finally_remove_junk_from(doc_list)

    # Update the documents...
    data.relevant_docs = zlib.compress(json.dumps(relevant_docs))
    data.irrelevant_docs = zlib.compress(json.dumps(irrelevant_docs))
    # ... and put.
    data.put()

    # Now reassure the user.
    self.response.out.write(utils.render('feedback.html'))