def preprocess_descriptions():
	"""Read abbreviation_data_set.csv and return cleansed descriptions and names."""
	with open('abbreviation_data_set.csv') as csvfile:

		descriptions = []
		names = []
		readCSV = csv.reader(csvfile, delimiter=',')
		for row in readCSV:
			if len(row) > 1:  # rows without a description column contribute only a name
				descriptions.append(cleanse(row[1]))
			names.append(cleanse(row[0]))

	return descriptions, names
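
cleanse is imported from elsewhere in this codebase and is not shown in any of these snippets. A minimal sketch of the kind of normalisation such a helper usually performs (an assumption for illustration, not the author's implementation):

import re

def cleanse_sketch(text):
    # Hypothetical stand-in for the project's cleanse(): lowercase, replace
    # punctuation with spaces, and collapse runs of whitespace.
    text = re.sub(r'[^a-z0-9 ]', ' ', text.lower())
    return re.sub(r'\s+', ' ', text).strip()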
Example 2

def runner(ticker, threshold=51):
    try:
        with open(bsv + ticker, "r") as f:
            data = f.read()
        data_bsv = ast.literal_eval(data)

    #     with open(intra+ticker, "r") as f:
    #         data = f.read()
    except (IOError, OSError, ValueError, SyntaxError):
        # file missing or unparsable: fall back to the sentinel value
        return "999"
    # data_intra = ast.literal_eval(data)
    data_bsv = cleanse(data_bsv)
    # data_intra = cleanse(data_intra)
    delivery_ratio = sort_by_deliverable_parcent(data_bsv,
                                                 "deliveryToTradedQuantity")
    if float(delivery_ratio) >= threshold:
        return delivery_ratio
    return "999"
Example 3

upper_half = json.load(open('upper_half.json','rb'))
lower_half = json.load(open('lower_half.json','rb'))

comments = list(csv.DictReader(open('comments.csv','rb')))

comments = [comment for comment in comments 
		if comment['Student Comment'] != 'None' and comment['Student Comment'] !='NA']

upper_half_comments = [comment for comment in comments if comment['Name'] in upper_half]
lower_half_comments = [comment for comment in comments if comment['Name'] in lower_half]

ap('Upper len :%d'%len(upper_half_comments))
ap('Lower len :%d'%len(lower_half_comments))

upper_half_vocabulary  =' '.join(tech.cleanse(' '.join([comment['Student Comment'] for comment in upper_half_comments])))
lower_half_vocabulary = ' '.join(tech.cleanse(' '.join([comment['Student Comment'] for comment in lower_half_comments])))

upper_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(upper_half_vocabulary)) 
					if word not in punkt and word not in stopwords]
lower_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(lower_half_vocabulary)) 
					if word not in punkt and word not in stopwords]

upper_freqs = nltk.FreqDist(upper_half_words)
lower_freqs = nltk.FreqDist(lower_half_words)

print tech.weighted_jaccard_similarity(upper_freqs,lower_freqs)

fig,axs = plt.subplots(ncols=2)
for ax,data,label in zip(axs,[upper_freqs,lower_freqs],['Completers','Non-completers']):
	words,freqs = zip(*data.most_common(20))
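
tech.weighted_jaccard_similarity is defined elsewhere. A hedged sketch of the usual weighted Jaccard over two nltk.FreqDist objects, i.e. the sum of per-word minimum counts over the sum of per-word maximum counts; this is an assumption about what the helper computes, not its source:

def weighted_jaccard_sketch(freqs_a, freqs_b):
    # FreqDist behaves like collections.Counter: missing words count as 0.
    vocab = set(freqs_a) | set(freqs_b)
    numerator = sum(min(freqs_a[w], freqs_b[w]) for w in vocab)
    denominator = sum(max(freqs_a[w], freqs_b[w]) for w in vocab)
    return numerator / float(denominator) if denominator else 0.0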
Example 4

import csv, nltk
import numpy as np

import utils as tech

from awesome_print import ap
from nltk.util import ngrams
from matplotlib import rcParams

rcParams['text.usetex'] = True

data = list(csv.DictReader(open('comments.csv','rb')))
categories = ['Reporter','Interpreter','Manager','Superior']

data_by_category = {}
for category in categories:
	data_by_category[category] = {}
	comments = ' '.join([student['Student Comment'] for student in data if student['Physician Comment']==category])
	data_by_category[category]['comments'] = tech.cleanse(comments)
	data_by_category[category]['fdist'] = nltk.FreqDist(data_by_category[category]['comments'])
	tech.save_ngrams(data_by_category[category]['fdist'].most_common(50),filename='comments-%s'%category.lower())

	data_by_category[category]['bigram.fdist'] = nltk.FreqDist(ngrams(data_by_category[category]['comments'],2))
	#use a distinct filename so the unigram list above is not overwritten
	tech.save_ngrams(data_by_category[category]['bigram.fdist'].most_common(50),filename='comments-bigrams-%s'%category.lower())

	data_by_category[category]['count.comments'] = len(comments) #character length of the concatenated comments string
	data_by_category[category]['count.students'] = len([student['Student Comment']
					for student in data if student['Physician Comment']==category])

jmat = np.array([[tech.jaccard_similarity(data_by_category[one]['comments'],data_by_category[two]['comments'])
				for one in categories]
				for two in categories])
	
np.savetxt('calculated-jaccard-similarity.tsv',jmat,delimiter='\t',fmt='%.04f')
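
If the matrix needs to be read back later, np.loadtxt with the same delimiter recovers it:

jmat = np.loadtxt('calculated-jaccard-similarity.tsv', delimiter='\t')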
Example 5

base_path = '/Volumes/My Book/twittwer-stream/control'

'''
data = []
for filename in os.listdir(base_path):
	with open(os.path.join(base_path,filename),'rb') as fid:
		data  += [json.load(fid)]

json.dump(data,open('/Volumes/My Book/twittwer-stream/amalgamated.json','wb'))
#--1 Classify
'''

data = json.load(open(os.path.join('/Volumes/My Book/twittwer-stream','amalgamated.json'),'rb'))
#data = json.load(open('control_tweets.json','rb'))

text = tech.cleanse([tweet['text'] for tweet in data])

#Why duplicating one tweet from test corpus?
classifications = {}

def iqr(data):
	#returns half the interquartile range (the semi-IQR), not the full IQR
	try:
		return 0.5*(np.percentile(data,75) - np.percentile(data,25))
	except (TypeError, ValueError):
		print data
def get(lst,field):
	return [item[field] for item in lst]

for i,tweet in enumerate(text):
	if langid.classify(' '.join(tweet))[0] == 'en':
		tweet,usernames,hashtags = tech.extract_tokens(tweet)
Example 6

import os, nltk, csv

import utils as tech
import matplotlib.pyplot as plt 

CASE = os.path.join(os.getcwd(),'data','case')

with open(os.path.join(CASE,'combined-deduplicated-rated.csv'),'r') as infile:
	items = [row for row in csv.reader(infile)]


TEXT = 0
RATING = 2
text = tech.cleanse([item[TEXT] for item in items])
tokens = [word for tweet in text for word in tweet]

with open('rule-in-tokens.txt','wb') as outfile:
	for token in set(tokens):
		print>>outfile,token

word_frequencies = nltk.FreqDist(tokens)

fig = plt.figure()
ax = fig.add_subplot(111)
words,freqs = zip(*word_frequencies.most_common(25))
ax.plot(freqs,'k--',linewidth=2) 
tech.adjust_spines(ax)
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words,rotation='vertical',weight='bold')
ax.set_ylabel('Count')
plt.tight_layout()
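
tech.adjust_spines is a project helper not shown here; a minimal stand-in in the spirit of the common matplotlib recipe (an assumption about its behaviour) hides the top and right spines:

def adjust_spines_sketch(ax):
    # Hide the top and right spines and keep ticks on the remaining axes.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')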
Example 7

def find_expansion():

    abbreviations = []
    reduced_abbreviations = []
    all_initials = []

    # strip the trailing newline and a trailing dot, if any
    with open("abbreviations.txt", "r") as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue
            if line[-1] == '.':
                line = line[:-1]
            abbreviations.append(line)

    # removing duplicate words without dot
    # could be made more efficient -- see the set-based sketch after this function
    for acronym in abbreviations:
        flag = 0
        for dacronym in abbreviations:
            if "." in dacronym and not "." in acronym and dacronym.replace(
                    ".", "") == acronym:
                flag = 1
        if flag == 0:
            reduced_abbreviations.append(acronym)

    # reduced_abbreviations = ['UPSC']
    # removing redundancy
    reduced_abbreviations = list(set(reduced_abbreviations))
    reduced_abbreviations.sort()

    # Tokenise into initials: dots are skipped, and a capital followed by a
    # lowercase letter (e.g. "Ph" in "Ph.D") counts as a single initial
    for word in reduced_abbreviations:
        initials = []
        i = 0
        while i < len(word):
            if word[i] == '.':
                i += 1

            if i + 1 < len(word) and word[i + 1].islower():
                initials.append(str(word[i] + word[i + 1]))
                i += 2

            else:
                initials.append(word[i])
                i += 1
        all_initials.append(initials)

    # load the descriptions, cleansed to remove punctuation
    with open('abbreviation_data_set.csv') as csvfile:

        descriptions = []
        readCSV = csv.reader(csvfile, delimiter=',')
        for row in readCSV:
            if len(row) > 1:
                descriptions.append(cleanse(row[1]))

    counter = 0
    # all_initials = [['I', 'I', 'T'], ['I', 'T'], ['H', 'I', 'I']]
    # descriptions = ["hey indian institute technology and i i t science"]
    fields = ['Abbreviation', 'Expansions']
    with open('extracted_acronyms.csv', 'w') as csvfile:

        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)

        for initials in all_initials:

            row = []
            m = len(initials)
            matchings = []
            # slide an m-word window over every description and keep the
            # windows whose words start with the corresponding initials
            for description in descriptions:
                words = description.split(' ')
                n = len(words)
                for k in range(0, n - m + 1):
                    i = 0
                    for j in range(k, k + m):
                        if len(initials[i]) == 1:
                            if words[j][0] == initials[i][0].lower():
                                i += 1
                            else:
                                break
                        else:
                            # two-character initial (e.g. "Ph"): compare its
                            # lower-cased first letter and its second character
                            # against the first two characters of the word
                            if (len(words[j]) > 1
                                    and words[j][0] == initials[i][0].lower()
                                    and words[j][1] == initials[i][1]):
                                i += 1
                            else:
                                break
                        if i == m:
                            matching = []
                            x = 0
                            while x < m:
                                matching.append(words[k + x])
                                x += 1

                            matchings.append(matching)

            for matching in matchings:
                str1 = ' '.join(matching)
                row.append(str1)

            cnt = Counter(row)
            added = 0
            row = []
            row.append(reduced_abbreviations[counter])
            for tup in cnt.most_common():
                row.append(tup[0])
                added += 1
                if added > 20:
                    break

            csvwriter.writerow(row)
            counter += 1
            print(counter)
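
The duplicate-removal loop near the top of find_expansion is quadratic, and the author's comment notes it could be made more efficient. A set-based sketch that produces the same reduced list (an alternative for illustration, not the original code):

def remove_dotless_duplicates(abbreviations):
    # Drop an entry without dots when a dotted variant of it also exists
    # (e.g. drop "UPSC" if "U.P.S.C." is present), then dedupe and sort.
    dotted_stripped = {a.replace(".", "") for a in abbreviations if "." in a}
    return sorted({a for a in abbreviations
                   if "." in a or a not in dotted_stripped})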
Example 8

import string, csv, itertools, nltk

import Graphics as artist 
import utils as tech

from nltk.util import ngrams
from awesome_print import ap 

data = list(csv.DictReader(open('comments.csv','rb')))
text = tech.cleanse(' '.join(record['Student Comment'] for record in data))

tech.savelines(text,filename='all-words-cleansed')

fdist = nltk.FreqDist(text)
tech.savelines(zip(*fdist.most_common(100)),filename='overall-frequencies-cleansed')

bigram_fdist = nltk.FreqDist(ngrams(text,2))
tech.savelines(zip(*bigram_fdist.most_common(100)),filename='bigram-frequencies-cleansed')

artist.frequency_plot(fdist,filename='overall-frequency-distribution')
Example 9

IRRELEVANT_SPECIFIC_SYNSETS = {synset for token in IRRELEVANT_SPECIFIC for synset in wn.synsets(token)}

def assign_category(list_of_tokens,threshold=75):
	#unfinished stub: np.percentile needs data and a percentile, e.g. np.percentile(values, threshold)
	return np.percentile()

def score(list_of_tokens):
	#fraction of the distinct tokens that appear in RELEVANT_SPECIFIC
	if len(list_of_tokens) > 0:
		list_of_tokens = set(list_of_tokens)
		return len(RELEVANT_SPECIFIC & list_of_tokens)/float(len(list_of_tokens))
	else:
		return np.nan
#Don't forget to do the controls

## Could look for words sufficiently semantically similar to the tokens, first see if direct similarity works

test_corpus = tech.cleanse(set([item.split('|')[0] for item in open('test-high-prevalence.txt','r').read().splitlines()]))
#random.shuffle(test_corpus)

#print len(test_corpus)
words = set(nltk.word_tokenize(' '.join(itertools.chain.from_iterable(test_corpus))))
omitted = []
senses = []
irrelevant = []
for word in words:
	print word
	if len(wn.synsets(word)) > 0:
		senses += [len(set(wn.synsets(word)) & RELEVANT_SPECIFIC_SYNSETS)/float(len(wn.synsets(word)))]
		irrelevant += [len(set(wn.synsets(word)) & IRRELEVANT_SPECIFIC_SYNSETS)/float(len(wn.synsets(word)))]
		
	else:
		omitted += [word]	
Example 10

def jaccard_similarity(one, two):
	#Jaccard similarity of two sets: |intersection| / |union|
	if len(one & two) == 0:
		return 0
	else:
		return len(one & two)/float(len(one | two))
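#quick check: jaccard_similarity({'a','b'}, {'b','c'}) -> 0.333..., one shared element out of three in the union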

TEXT = 0
RATING = 2
with open(os.path.join(CASE,'combined-deduplicated-rated.csv'),'r') as infile:
	items = [row for row in csv.reader(infile)]

relevant, irrelevant  = [], []

for item in items:
	(relevant if int(item[RATING]) == 1 else irrelevant).append(item[TEXT])

relevant = list(itertools.chain.from_iterable(tech.cleanse(relevant)))
relevant += ['purple']
irrelevant =  list(itertools.chain.from_iterable(tech.cleanse(irrelevant)))

'''
Numerator 527
Denominator 3832
0.13752609603340293
'''


for key,value in [('relevant',relevant),('irrelevant',irrelevant)]:
	with open('%s-tokens'%key,'w') as outfile:
		for token in value:
			print>>outfile,token
Example 11

def extract_grade(student):
	return {'name':student['name'].split(',')[0].capitalize(),
			'grade':student['grade']}

ratings = [extract_grade(student) for student in list(csv.DictReader(open('data.csv','rb')))]

data = list(csv.DictReader(open('comments.csv','rb')))
data = [student for student in data if student['Physician Comment']!='NA'
	and student['Physician Comment'] !='Cedar' and student['Physician Comment'] != 'X']

rating_names = [student['name'] for student in ratings]
data_names = list(set([student['Name'] for student in data]))
#cleans text for classifying
for i,student in enumerate(data):
	text = tech.cleanse(student['Student Comment'])
	data[i]['Student Comment'] = text

#split into testing and training sets
n = len(data)
test_idx = random.sample(xrange(n),int(n*0.5))
train_idx = set(xrange(n))-set(test_idx)

test_set = filter(lambda item: item[1], map(extract_featurelabel, [data[i] for i in test_idx]))
train_set = filter(lambda item: item[1], map(extract_featurelabel, [data[i] for i in train_idx]))

#classifier = NaiveBayesClassifier.train(train_set)
classif.train(train_set) #train on the training split, not the test split
#Compute accuracy
test_data,test_label = zip(*test_set)
train_data,train_label = zip(*train_set)
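
extract_featurelabel and classif come from elsewhere in this script. A minimal sketch of the kind of (features, label) pair an nltk-style classifier expects here, assuming tech.cleanse yields an iterable of tokens and the physician category is the label (both are assumptions):

def extract_featurelabel_sketch(student):
    # Hypothetical stand-in: presence features over the cleansed comment,
    # labelled with the physician's category.
    features = {token: True for token in student['Student Comment']}
    return features, student['Physician Comment']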
Example 12

     	./data/source/keyword 
'''

#---LOAD DATA
if not os.path.isfile(CORPUS_FILENAME):
	#More expressive than itertools.product, small loops --> no important speed or memory difference
	corpus = {}
	for source in sources:
		corpus[source] = {}
		for disease in keywords:
			path = os.path.join(base,source,disease)
			text = ' '.join(open(os.path.join(path,filename),READ).read() for filename in os.listdir(path)
							if not os.path.isdir(os.path.join(path,filename)))
			text = text.replace('.',' ').replace("\n"," ")
			text = re.sub(r"[^\x00-\x7F]","",text) #Regexp faster than iterating through string to remove non-ASCII
			corpus[source][disease] = list(tech.cleanse(text))
			#Cleanse returns type set. Type set is not JSON serializable. Type list is.
	json.dump(corpus,open(CORPUS_FILENAME,WRITE))

else:
	corpus = json.load(open(CORPUS_FILENAME,READ))


#--- CALCULATE JACCARD SIMILARITY

source_rubric = [[source for source in sources] 
						 for source in sources]


filenames = ['jaccard-similarity-%s'%disease for disease in keywords]
filenames += ['jaccard-similarities.json']
Example 13

synsets = {'positive' :{synset for token in informative_tokens['positive']['tokens'] for synset in wn.synsets(token)},
			'negative':{synset for token in informative_tokens['negative']['tokens'] for synset in wn.synsets(token)}}


data = [item.strip().split('|') for item in open('evaluation-rating','r').read().splitlines()]

tweets,my_ratings = zip(*[(item[0],int(item[2])) for item in data if len(item)>2])

positive, negative = [],[]

for i in xrange(len(tweets)):
	if langid.classify(tweets[i])[0] == 'en':
		(positive if my_ratings[i] == 1 else negative).append(tweets[i])

positive,p_users = tech.extract_tokens([' '.join(tweet) for tweet in tech.cleanse(positive)])
negative,n_users = tech.extract_tokens([' '.join(tweet) for tweet in tech.cleanse(negative)])

#each temporary alias lets the nested list be extended in place
tmp = informative_tokens['positive']['tokens']
tmp += list(positive)
del tmp

tmp = informative_tokens['negative']['tokens']
tmp += list(negative)
del tmp

tmp = informative_tokens['positive']['usernames']
tmp += list(p_users)
del tmp

tmp = informative_tokens['positive']['usernames']