def get_TermsFrequency(query_terms):
   """
   Count the frequency of MeSH terms used in all abstracts.

   Fetches the abstracts matching `query_terms` via eUtils, then tallies:
     - how often each individual MeSH term appears across abstracts
       (used as node size), and
     - how often each unordered pair of MeSH terms co-occurs within one
       abstract (used as edge weight), keeping only pairs seen in 2+
       abstracts.

   Returns a tuple (sorted_MESH_frequency, sorted_MESH_bifrequency):
   lists of (term, count) and ((term_a, term_b), count) pairs, sorted
   most frequent first.
   """
   abst_data = eUtils.fetch_abstr(query_terms)

   # node size: freq of term in all abstracts
   MESH_frequency = defaultdict(int)

   # edge weights: freq of bi-terms in all abstracts
   MESH_bifrequency = defaultdict(int)

   for abst in abst_data:
      MESH = abst['MESH']
      # Terms are stored comma-separated; sorting makes the pair keys
      # order-independent across abstracts.
      MESH_terms = sorted(i.strip(' ') for i in str(MESH).split(','))
      for term in MESH_terms:
         MESH_frequency[term] += 1
      # all bi-term combinations within this abstract
      for biterm in itertools.combinations(MESH_terms, 2):
         MESH_bifrequency[biterm] += 1

   # NOTE: dict.iteritems() was removed in Python 3; items() behaves the
   # same here and works on both Python 2 and 3.
   sorted_MESH_frequency = sorted(MESH_frequency.items(),
                                  key=operator.itemgetter(1), reverse=True)

   # keep only the bi-terms that occur in 2 or more abstracts
   filter_MESH_bifrequency = dict((k, v) for k, v in MESH_bifrequency.items()
                                  if v > 1)
   sorted_MESH_bifrequency = sorted(filter_MESH_bifrequency.items(),
                                    key=operator.itemgetter(1), reverse=True)

   return (sorted_MESH_frequency, sorted_MESH_bifrequency)
Exemple #2
0
def try_to_update_term(data, term):
   """
   Validate `term`, try to build its micro-corpus, and persist on `data`.

   The term is normalized (newlines removed, spaces encoded as '+',
   upper-cased) and rejected if it contains tokens that would break or
   bias eUtils queries.  If a sample of abstracts can be fetched and
   preprocessed, the compressed micro-corpus is stored on `data`;
   otherwise the term is recorded as invalid.  `data` is saved in
   either case via data.put().

   Returns True on success, False on a PubMed error or when the term
   has no hit.  Raises TermException for a forbidden token.
   """
   # Spaces cause eUtils queries to fail.
   term = term.replace('\n', '').replace(' ', '+').upper()
   # Minimal check for term inconsistencies.  (The ' ' entry is purely
   # defensive: spaces were already replaced by '+' just above.)
   for forbidden in ['/', ' ', 'CRDT', 'CRDAT']:
      if forbidden in term: raise TermException(forbidden)
   try:
      # If we can create the micro-corpus with the new term,
      # then do the update. Otherwise something went wrong.
      abstr_sample = eUtils.fetch_abstr(
         term = term,
         retmax = config.RETMAX,
         email = config.ADMAIL
      )
      mu_corpus = {}
      for abstr in abstr_sample:
         mu_corpus[abstr['pmid']] = tfidf.preprocess(abstr['text'])
      data.mu_corpus = zlib.compress(json.dumps(mu_corpus))
   except (eUtils.PubMedException, eUtils.NoHitException):
      # PubMed error or no hit.
      success = False
   else:
      success = True

   data.term_valid = success
   data.term = term
   data.put()
   return success
def get_TermsFrequency(query_terms):
    """
    Count the frequency of MeSH terms used in all abstracts.

    Fetches the abstracts matching `query_terms` via eUtils, then tallies:
      - how often each individual MeSH term appears across abstracts
        (used as node size), and
      - how often each unordered pair of MeSH terms co-occurs within one
        abstract (used as edge weight), keeping only pairs seen in 2+
        abstracts.

    Returns a tuple (sorted_MESH_frequency, sorted_MESH_bifrequency):
    lists of (term, count) and ((term_a, term_b), count) pairs, sorted
    most frequent first.
    """
    abst_data = eUtils.fetch_abstr(query_terms)

    # node size: freq of term in all abstracts
    MESH_frequency = defaultdict(int)

    # edge weights: freq of bi-terms in all abstracts
    MESH_bifrequency = defaultdict(int)

    for abst in abst_data:
        MESH = abst['MESH']
        # Terms are stored comma-separated; sorting makes the pair keys
        # order-independent across abstracts.
        MESH_terms = sorted(i.strip(' ') for i in str(MESH).split(','))
        for term in MESH_terms:
            MESH_frequency[term] += 1
        # all bi-term combinations within this abstract
        for biterm in itertools.combinations(MESH_terms, 2):
            MESH_bifrequency[biterm] += 1

    # NOTE: dict.iteritems() was removed in Python 3; items() behaves the
    # same here and works on both Python 2 and 3.
    sorted_MESH_frequency = sorted(MESH_frequency.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # keep only the bi-terms that occur in 2 or more abstracts
    filter_MESH_bifrequency = dict((k, v) for k, v in MESH_bifrequency.items()
                                   if v > 1)
    sorted_MESH_bifrequency = sorted(filter_MESH_bifrequency.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)

    return (sorted_MESH_frequency, sorted_MESH_bifrequency)
Exemple #4
0
 def test_fetch_abstr(self):
    """This also tests 'SAXmed.eFetchResultHandler'."""
    # Fetch the Nature entries created on 2012/12/21 and check that
    # exactly the eight expected PMIDs come back, in order.
    search_term = u'nature[journal]+AND+2012/12/21[crdt]'
    expected_pmids = [
       u'23254940', u'23254938', u'23254936', u'23254935',
       u'23254933', u'23254931', u'23254930', u'23254929'
    ]
    abstracts = eUtils.fetch_abstr(search_term)
    self.assertEqual(len(abstracts), 8)
    returned_pmids = [abstract['pmid'] for abstract in abstracts]
    self.assertEqual(returned_pmids, expected_pmids)
Exemple #5
0
      prog = 'getPubMedAbs_NLP_summarize.py',
      description = """ Get the abstract from pubmed query and the important sentences being bolded""",
      formatter_class=RawTextHelpFormatter
    )
   parser.add_argument(
      '-q',
      '--query',
      metavar = 'q',
      type = str,
      nargs = '?',
      default = '.',
      help = 'pubmed query'
    )

    args = parser.parse_args()

    # Load in output from blogs_and_nlp__get_feed.py
    abst_data = eUtils.fetch_abstr(args.query)

    # Emit a self-contained HTML page on stdout: a header with the hit
    # count, then one section per abstract (linked title + summary).
    #f = open('output/results_summary.html', 'w')
    sys.stdout.write('<html><head><meta charset="utf-8"></head><body>\n')
    sys.stdout.write('<h1>There are %d abstracts available </h1>' % (len(abst_data)))
    # NOTE(review): the loop variable `abs` shadows the builtin abs().
    for abs in abst_data:
        # Title links back to the PubMed record for this PMID.
        sys.stdout.write('<br/><h4><a href="http://www.ncbi.nlm.nih.gov/pubmed/%s">' % abs['pmid'])
        # .encode('utf-8') on text: Python-2 idiom; under Python 3 this
        # would produce bytes and break the '+' concatenation -- confirm
        # target interpreter.
        sys.stdout.write(abs['title'].encode('utf-8') + '</a></h4><br/>')
        # `summarize` presumably returns a list of key sentences from the
        # abstract text -- TODO confirm against its definition.
        summary = summarize(abs['text'])
        summary = [i.encode('utf-8') for i in summary]
        sys.stdout.write(' '.join(summary))
        sys.stdout.write('<br/>-----------<br/>')
    sys.stdout.write('\n</body></html>')
Exemple #6
0
def get_hits_and_send_mail(data):
    """Routine to fetch user's hits and send them the results.

    `data` looks like a datastore entity holding the user's saved term,
    salt, and encrypted relevance feedback / micro-corpus (read through
    utils.decrypt) -- TODO confirm against the model definition.
    Returns None; side effects are sending the hits email and logging.
    """
    # We query PubMed for the entries created yesterday.
    # There is a bit of variability on the update time,
    # so one might miss the entries of today if they are
    # put after the cron time.
    yesterday = date.today() - timedelta(1)
    the_day_before = yesterday - timedelta(1)
    one_year_ago = yesterday - timedelta(365)

    # Build eUtils query strings. '%%2F' survives strftime as '%2F',
    # the URL-escaped '/' of a PubMed date, e.g.
    # "(term)+AND+(2012%2F12%2F21[crdt])".
    term = str(data.term)
    term_yesterday = "(" + term + ")" + yesterday.strftime("+AND+(%Y%%2F%m%%2F%d[crdt])")
    # Date-range variant (one year ago .. the day before yesterday).
    # NOTE(review): built but never used below -- possibly intended for a
    # fallback query; confirm.
    term_older = (
        "("
        + term
        + ")"
        + one_year_ago.strftime("+AND+(%Y%%2F%m%%2F%d:")
        + the_day_before.strftime("%Y%%2F%m%%2F%d[crdt])")
    )

    # Fetch the abstracts.
    abstr_list = []
    try:
        abstr_list = eUtils.fetch_abstr(
            term=term_yesterday,
            # Limit on all queries, to keep it light.
            retmax=config.RETMAX,
            email=config.ADMAIL,
        )
    except eUtils.NoHitException:
        # Nothing new yesterday: nothing to mail.
        return
    except eUtils.PubMedException as e:
        # Transient PubMed failure: log and fall through with an empty
        # list (handled just below).
        logging.warn("%s: %s" % (data.user.email(), str(e)))
    # Can be empty.  No big deal, just return.
    if not abstr_list:
        return

    user_gave_relevance_feedback = utils.decrypt(data, "relevant_docs") and utils.decrypt(data, "irrelevant_docs")

    if not user_gave_relevance_feedback:
        # No relevance feedback: set all scores to 0 and move on.
        for abstr in abstr_list:
            abstr["score"] = 0.0

    else:
        # User gave feedback: recall their data and compute scores.
        relevant_docs = utils.decrypt(data, "relevant_docs")
        irrelevant_docs = utils.decrypt(data, "irrelevant_docs")
        mu_corpus = utils.decrypt(data, "mu_corpus")

        # Write the scores in place and sort.
        Classify.update_score_inplace(abstr_list, relevant_docs, irrelevant_docs, mu_corpus)

        # Best-scoring abstracts first.
        abstr_list = sorted(abstr_list, key=lambda x: x.get("score", 0.0), reverse=True)

    # Set a limit on hit number.
    nhits = len(abstr_list)
    if nhits < config.MAXHITS + 1:
        maxhit_exceeded = ""
    else:
        # Send the top of the sorted list and notify the user.
        maxhit_exceeded = "Showing only the top %d hits." % config.MAXHITS
        abstr_list = abstr_list[: config.MAXHITS]

    ## Alchemy test.
    # NOTE(review): the address literal looks scraper-redacted
    # ("*****@*****.**"); as written this branch can never match a real
    # user -- restore the intended test address or remove.
    if data.user.email() == "*****@*****.**":
        for abstr in abstr_list:
            query = json.loads(alchemy_keyword_query(abstr.get("text")))
            abstr["keywords"] = [kw["text"] for kw in query["keywords"]]

    # Make a security checksum.
    # 1. Concatenate the PMIDs.
    pmids = "".join(sorted([a["pmid"] for a in abstr_list]))
    # 2. Add the random salt, and compute the SHA1 digest.
    # NOTE(review): str concatenation into sha1() assumes Python-2 byte
    # strings; Python 3 would require encoding first.
    checksum = sha1(pmids + data.salt).hexdigest()

    template_vals = {
        "nhits": nhits,
        "maxhit_exceeded": maxhit_exceeded,
        "uid": data.user.user_id(),
        "checksum": checksum,
        "abstr_list": abstr_list,
    }
    # Create the hits email message and send.
    msg = mail.EmailMessage()
    msg.initialize(
        to=data.user.email(),
        sender="*****@*****.**",
        subject="Recently on PubMed",
        body="Message in HTML format.",
        html=utils.render("mail.html", template_vals),
    )
    msg.send()
    # logging.warn is a deprecated alias of logging.warning.
    logging.warn("mail sent to %s" % data.user.email())
    return