def streamer():
    """Yield a stemmed token list for each comment text about the selected drug.

    Each text from ``texts(drug=drug)`` is tokenized (no POS filtering),
    brand drug names are remapped to their canonical names via ``_drug_dict``
    (upper-cased lookup, lower-cased replacement), and every token is stemmed.

    NOTE(review): relies on enclosing-scope names ``texts``, ``tokenize``,
    ``_drug_dict``, ``stemmer``, and ``drug`` — presumably a closure defined
    inside a larger function; confirm against the full file.
    """
    for raw_text in texts(drug=drug):
        # Tokenize into a list of word tokens, POS filtering disabled.
        tokens = tokenize(raw_text, drug=drug, pos_filter=False)
        # Remap brand drug names to their canonical form in place.
        for idx, token in enumerate(tokens):
            canonical = _drug_dict.get(token.upper())
            if canonical is not None:
                tokens[idx] = canonical.lower()
        yield [stemmer.stem(token) for token in tokens]
def word_count(drug=None, limit=None, pos_filter=False, lemma=True):
    """Scan comment texts (from drug_mentions.texts) for the selected drug
    and calculate the most common words.

    KWARGS:
        drug: string or None. Drug selector. Allows three cases:
            * None: scrape all comments in database, regardless of drug.
            * 'antidepressant': select comments speaking generically about
              drug, not referencing specific drug.
            * [drug name]: comments referencing specific drug.
            Default None. Passed to drug_mentions.texts.
        limit: int or None. Optional limit on SQL queries retrieved by
            drug_mentions.texts. Defaults to None (returns all hits).
        pos_filter: boolean. Passed to tokenize(), set True to use
            part-of-speech filtering.
        lemma: boolean. Passed to tokenize(), set True to use lemmatization.

    RETURNS:
        freq: nltk.probability.FreqDist object. Frequency distribution of
            words from comments.

    RAISES:
        ValueError: for invalid drug name.
    """
    try:
        texts = dm.texts(drug=drug, limit=limit)
    except ValueError as err:
        # Chain explicitly so the underlying cause from dm.texts stays
        # attached to the traceback instead of an implicit "during handling".
        raise ValueError('Invalid drug name.') from err
    freq = FreqDist()
    for text in texts:
        freq.update(tokenize(text, drug, pos_filter=pos_filter, lemma=lemma))
    return freq