Example #1
    def filter_string(self, file):
        stop_word = stop_words()
        list1 = []
        finger_string = ""

        for line in file:
            list1.extend(line.split(" "))
        if len(list1) != 0:

            for word in list1:
                if word not in stop_word:
                    word = word.strip("\n")
                    word_list = list(word)
                    word_list2 = []
                    for i in word_list:

                        # keep ASCII letters, digits and underscores
                        if (65 <= ord(i) <= 90
                                or 97 <= ord(i) <= 122
                                or 48 <= ord(i) <= 57
                                or i == "_"):
                            word_list2.append(i)

                    word1 = ''.join(word_list2)
                    finger_string += word1.lower()
            return finger_string
        else:
            return None
Example #2
    def filter_string(self, file):
        '''This function takes an opened file as input.
        For each line in the file, it removes special characters, stop words
        and spaces, and appends the remaining characters into one string.'''

        stop_word = stop_words()  # a function that returns all the frequently used words
        list1 = []
        finger_string = ""

        for line in file:
            list1.extend(line.split(" "))
        if len(list1) != 0:

            for word in list1:
                if word not in stop_word:
                    word = word.strip("\n")
                    word_list = list(word)
                    word_list2 = []
                    for i in word_list:

                        # keep ASCII letters, digits and underscores
                        if (65 <= ord(i) <= 90
                                or 97 <= ord(i) <= 122
                                or 48 <= ord(i) <= 57
                                or i == "_"):
                            word_list2.append(i)

                    word1 = ''.join(word_list2)
                    finger_string += word1.lower()
            return finger_string
        else:
            return None
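For reference, the same filtering can be written as a standalone helper using built-in string methods. This is only a rough sketch, not the code above; it assumes the stop words are passed in as a set, and str.isalnum also admits non-ASCII letters, unlike the ord() checks used here.

def filter_string_simple(lines, stop_word_set):
    """Concatenate the non-stop-words of `lines` into one lower-case string,
    keeping only letters, digits and underscores."""
    parts = []
    for line in lines:
        for word in line.split(" "):
            word = word.strip("\n")
            if word not in stop_word_set:
                parts.append("".join(c for c in word if c.isalnum() or c == "_"))
    return "".join(parts).lower()

print(filter_string_simple(["The quick_brown fox!\n"], {"The"}))  # quick_brownfox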
Example #3
    def rm_stop_words(self, tokens):
        """Return the tokens that are not in the project's stop word list."""
        s = stop_words.stop_words()
        sw = s.stop_words
        result_tokens = []
        for token_one in tokens:
            if token_one not in sw:
                result_tokens.append(token_one)
        return result_tokens
Example #4
def vocab_filter(url,pubs,min_shared_vocab_size=2,vocab_use_pct=1.0):
	"""
	Determine which publications are valid based on the vocabulary
	provided by the URL (which is assumed to represent the field and publication
	topics of the person named).
	"""
	accepted_pubs = []

	
	# read the URL
	#print 'Obtaining URL data...'
	url_words = None
	if url.endswith('.pdf'):
		url_words = __read_pdf_text(url)
	else:
		fh = urlopen(url)
		url_content = '\n'.join(fh.readlines())
		url_words = set(map(lambda x: x.lower(), re.findall('[A-Za-z-]+',url_content))) 

	swords = stop_words()
	url_words.difference_update(swords)
	
	# if we're only supposed to use some of the URL words, then subsample as appropriate.
	num_words_to_remove = int(math.ceil(float(len(url_words)) * (1.0 - vocab_use_pct)))
	for i in range(num_words_to_remove):
		url_words.pop()
	
	######
	# Filter the publications
	#print 'Filtering publications...'
	
	# filter based on shared vocabulary between the URL text and each title
	accepted_pubs = []
	for pub in pubs:			
		# check word content
		pub_words = set(map(lambda x: x.lower(), re.findall('[A-Za-z-]+',pub.title))) #.union(set(map(lambda x: x.lower(), re.findall('\w+',pub.source))))
		pub_words.difference_update(swords)
		
		shared_words = url_words.intersection(pub_words)
			
		if len(shared_words) < min_shared_vocab_size:
			continue
		
		#print pub.title,shared_words
		# if we got here, then the publication is ok!
		accepted_pubs.append(pub)	

	return accepted_pubs
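At its core the filter is a set intersection between the URL vocabulary and each title's vocabulary. A minimal standalone sketch of that step, using hypothetical data and no network access (the stop word set here is just an illustration):

import re

def shares_vocab(reference_text, title, stop_word_set, min_shared=2):
    """Return True when the title shares at least `min_shared`
    non-stop-word terms with the reference text."""
    ref_words = {w.lower() for w in re.findall(r"[A-Za-z-]+", reference_text)} - stop_word_set
    title_words = {w.lower() for w in re.findall(r"[A-Za-z-]+", title)} - stop_word_set
    return len(ref_words & title_words) >= min_shared

print(shares_vocab("information retrieval and stop word removal",
                   "Stop word removal for retrieval systems",
                   {"and", "for"}))  # True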
Example #5
def tokenize(string):

    # Tokenizes the string
    tokens = string.split(" ")

    # Removes tokens with word length < 3
    new_tokens = []
    for token in tokens:
        if len(token) > 2:
            new_tokens.append(token)

    # Remove stopwords
    sw = stop_words()
    filtered_tokens = []
    for w in new_tokens:
        if w not in sw:
            filtered_tokens.append(w)
    return filtered_tokens
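A roughly equivalent standalone sketch of the same three-step pipeline (whitespace split, drop short tokens, drop stop words); the stop list is passed in explicitly because stop_words() is project-specific:

def tokenize_simple(text, stop_word_set):
    """Split on spaces, drop tokens shorter than 3 characters,
    then drop stop words."""
    return [t for t in text.split(" ")
            if len(t) > 2 and t not in stop_word_set]

print(tokenize_simple("the cat sat on the mat", {"the"}))  # ['cat', 'sat', 'mat']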
Example #6
def build_chunks(drug, classifier, limit=None):
    """Pulls comment data from SQL table, constructs trees for each, chunks by 
	drug mention, writes to Chunks SQL table organized by drug.

	ARGS:
		drug: string.
			drug name.
		classifier: nltk.classify.NaiveBayesClassifier object.
			trained Naive Bayes classifier.

	KWARGS:
		limit: int or None.
			optional cap on number of comments streamed through processor.

	RAISES:
		ValueError:
			if invalid drug is input.
	"""
    try:
        drug = _drug_dict[drug.upper()]
    except KeyError:
        raise ValueError("invalid drug")

    def uniconvert(s):
        if s == '\x00':
            return 0
        elif s == '\x01':
            return 1
        else:
            return None

    conn = pms.connect(host='localhost',
                       user='******',
                       passwd='',
                       db='empath',
                       charset='utf8',
                       init_command='SET NAMES UTF8')
    cur = conn.cursor()

    # assemble the mother of all queries
    query = "SELECT c.id,c.body,m.count"
    for gen in _generics:
        query += (",m.%s" % gen.lower())
    query += " FROM Comments c JOIN Subreddits s on c.subreddit=s.subreddit "
    query += "JOIN Mentions m on c.id=m.id WHERE (m.count=1 OR m.count=2) "
    query += ("AND m.%s=True AND c.chunked=False" % drug.lower())
    if limit is not None:
        query += (" LIMIT %s" % limit)
    cur.execute(query)
    conn.close()

    for row in cur:
        post_id = row[0]
        body = row[1]
        count = row[2]
        drugs = np.array([uniconvert(d) for d in row[3:]])
        dmap = np.where(drugs == 1)
        drugs = [d.lower() for d in list(np.array(_generics)[dmap])]

        # clean body text
        body = body.lower()
        for drug in drugs:
            for remap in _gen_dict.get(drug.upper(), [drug.upper()]):
                body = body.replace(remap.lower(), drug.lower())

        trees, sentiments = build_tree(body, drugs)
        subtexts, mentions, precedence = map_subtrees(trees, drugs)

        for i, drug in enumerate(OrderedSet(precedence)):
            drugtext = []
            for subtext in subtexts[drug]:
                for word in subtext:
                    drugtext.append(word)
            drugtext = [
                word for word in drugtext if word not in set(stop_words())
            ]
            sents = []
            for j, men in enumerate(mentions):
                if len(men) == 0:
                    men = ['preamble']
                if drug in men:
                    sents.append(sentiments[j])

            nbsent = classifier.prob_classify(
                dict([(word, True) for word in drugtext
                      ])).prob('pos')  # probability positive

            data = (post_id, i, drug, drugtext, sents, nbsent)
            yield data
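The per-drug sentiment score at the end relies on nltk's NaiveBayesClassifier: each chunk is turned into a bag-of-words feature dict and prob_classify reports the probability of the 'pos' label. A minimal self-contained sketch of that scoring step, trained on toy data rather than the project's real training set:

import nltk

# Toy training set: bag-of-words feature dicts mapped to a sentiment label.
train = [
    ({"great": True, "helped": True}, "pos"),
    ({"awful": True, "worse": True}, "neg"),
]
classifier = nltk.classify.NaiveBayesClassifier.train(train)

drugtext = ["helped", "sleep", "great"]
nbsent = classifier.prob_classify({word: True for word in drugtext}).prob("pos")
print(round(nbsent, 2))  # probability that the chunk reads as positive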
Example #7
from authorstats import compute_individual_stats, obtain_individual_pubs
import csv
from oryx.env import rb
import math
from pubfilter import *
import stop_words
import pylab as pl
from pubmodel import Publication
import pubstats

swords = stop_words.stop_words()

YEAR_FILTER = 'YEAR'
VOCAB_FILTER = 'VOCAB'
NAME_FILTER = 'NAME'
CONFLICT_FILTER = 'CONFLICT'
DUPLICATE_FILTER = 'DUPLICATE'

BEST_FILTERS = [YEAR_FILTER,VOCAB_FILTER,NAME_FILTER,DUPLICATE_FILTER]

def compute_test_case_stats(name,url,dfile,filters=BEST_FILTERS,min_vocab_match_size=2,vocab_use_pct=1.0):
	"""
	Results:
		# of total pubs found,# of found pubs,# true pubs,TP: # of matching pubs,FP, FN: # of unaccepted matching pubs
	"""
	use_initials = False
	if name.startswith('^'):
		use_initials = True
		name = name[1:]
		
	# load the true pubs
Example #8
def tokenize(text,drug=None,pos_filter=False,lemma=True):
	"""Simple (or not) tokenizer for given text block.

	ARGS:
		text: string.
			Single comment block.

	KWARGS:
		drug: string or None.
			drug name (added to stoplist to prevent self-mentions)
		pos_filter: boolean.
			set True to use part-of-speech filtering.
		lemma: boolean.
			set True to use lemmatization.

	RETURNS:
		words: list.
			List of lower-case word tokens (individual strings)
	"""
	tokens = nltk.RegexpTokenizer(r'\w+').tokenize(text.lower())
	merger = nltk.MWETokenizer([('side','effect'),('side','effects')])
	tokens = merger.tokenize(tokens)
	
	# filter on stop words
	stops = sw.stop_words()
	if drug is not None:
		if drug.upper() != 'ANTIDEPRESSANT':
			stops.append(drug.lower())
			if _drug_dict[drug.upper()] != drug.upper():
				stops.append(_drug_dict[drug.upper()].lower())
			if drug.upper() in _gen_dict:
				for bd in _gen_dict[drug.upper()]:
					stops.append(bd.lower())
		else:
			stops = stops+['antidepressant','antidepressants']
	stops = set(stops)
	tokens = [word for word in tokens if word not in stops]

	if pos_filter:
		tagged_tokens = nltk.pos_tag(tokens)
		tags = ['CD', 'DT',  # cardinal numbers, determiners
			'JJ', 'JJR', 'JJS',  # adjectives
			'NN', 'NNP', 'NNPS', 'NNS',  # nouns
			'RB', 'RBR', 'RBS',  # adverbs
			'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']  # verbs
		tokens = [word for (word,tag) in tagged_tokens if tag in tags]

	if lemma:
		tokens = [_lemmatizer.lemmatize(word,pos='v') for word in tokens]
		tokens = [_lemmatizer.lemmatize(word,pos='n') for word in tokens]

	# one more pass through stopword filter
	tokens = [word for word in tokens if word not in stops]

	return tokens
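The multi-word merge at the top uses nltk's MWETokenizer so that 'side effects' survives downstream filtering as a single token. A small self-contained sketch of just that stage:

import nltk

text = "the side effects were mild"
tokens = nltk.RegexpTokenizer(r"\w+").tokenize(text.lower())
merger = nltk.MWETokenizer([("side", "effect"), ("side", "effects")])
print(merger.tokenize(tokens))  # ['the', 'side_effects', 'were', 'mild']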
Example #9
def build_chunks(drug,classifier,limit=None):
	"""Pulls comment data from SQL table, constructs trees for each, chunks by 
	drug mention, writes to Chunks SQL table organized by drug.

	ARGS:
		drug: string.
			drug name.
		classifier: nltk.classify.NaiveBayesClassifier object.
			trained Naive Bayes classifier.

	KWARGS:
		limit: int or None.
			optional cap on number of comments streamed through processor.

	RAISES:
		ValueError:
			if invalid drug is input.
	"""
	try:
		drug = _drug_dict[drug.upper()]
	except KeyError:
		raise ValueError("invalid drug")

	def uniconvert(s):
		if s == '\x00':
			return 0
		elif s == '\x01':
			return 1
		else:
			return None

	conn = pms.connect(host='localhost',
		user='******',
		passwd='',
		db='empath',
		charset='utf8',
		init_command='SET NAMES UTF8')
	cur = conn.cursor()

	# assemble the mother of all queries
	query = "SELECT c.id,c.body,m.count"
	for gen in _generics:
		query += (",m.%s" % gen.lower())
	query += " FROM Comments c JOIN Subreddits s on c.subreddit=s.subreddit "
	query += "JOIN Mentions m on c.id=m.id WHERE (m.count=1 OR m.count=2) "
	query += ("AND m.%s=True AND c.chunked=False" % drug.lower())
	if limit is not None:
		query += (" LIMIT %s" % limit)
	cur.execute(query)
	conn.close()

	for row in cur:
		post_id = row[0]
		body = row[1]
		count = row[2]
		drugs = np.array([uniconvert(d) for d in row[3:]])
		dmap = np.where(drugs == 1)
		drugs = [d.lower() for d in list(np.array(_generics)[dmap])]

		# clean body text
		body = body.lower()
		for drug in drugs:
			for remap in _gen_dict.get(drug.upper(),[drug.upper()]):
				body = body.replace(remap.lower(),drug.lower())

		trees,sentiments = build_tree(body,drugs)
		subtexts,mentions,precedence = map_subtrees(trees,drugs)

		for i,drug in enumerate(OrderedSet(precedence)):
			drugtext = []
			for subtext in subtexts[drug]:
				for word in subtext:
					drugtext.append(word)
			drugtext = [word for word in drugtext 
				if word not in set(stop_words())]
			sents = []
			for j,men in enumerate(mentions):
				if len(men) == 0:
					men = ['preamble']
				if drug in men:
					sents.append(sentiments[j])

			nbsent = classifier.prob_classify(dict([(word,True) for word in 
				drugtext])).prob('pos')	# probability positive

			data = (post_id,i,drug,drugtext,sents,nbsent)
			yield data