import csv

def preprocess_descriptions():
    # Read the abbreviation data set and return cleansed descriptions and names.
    with open('abbreviation_data_set.csv') as csvfile:
        descriptions = []
        names = []
        readCSV = csv.reader(csvfile, delimiter=',')
        for row in readCSV:
            if len(row) > 1:
                descriptions.append(cleanse(row[1]))
                names.append(cleanse(row[0]))
    return descriptions, names
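# `cleanse` is defined elsewhere in this repo, and its return type varies by
# call site (other scripts pass lists and get token lists or sets back). A
# minimal sketch of the single-string case used here, assuming lowercasing,
# punctuation stripping, and stopword removal -- the real helper may differ:
import string
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def cleanse(text):
    text = text.translate(None, string.punctuation).lower()  # Python 2 str API
    return ' '.join(word for word in text.split() if word not in STOPWORDS)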
import ast

def runner(ticker, threshold=51):
    # Return the delivery ratio for `ticker` if it meets `threshold`,
    # otherwise the sentinel "999". `bsv` is a directory prefix defined
    # elsewhere in this module.
    try:
        with open(bsv + ticker, "r") as f:
            data = f.read()
        data_bsv = ast.literal_eval(data)
        # with open(intra + ticker, "r") as f:
        #     data = f.read()
    except (IOError, ValueError, SyntaxError):
        return "999"  # sentinel: data missing or unparsable
    # data_intra = ast.literal_eval(data)
    data_bsv = cleanse(data_bsv)
    # data_intra = cleanse(data_intra)
    delivery_ratio = sort_by_deliverable_parcent(data_bsv, "deliveryToTradedQuantity")
    if float(delivery_ratio) >= threshold:
        return delivery_ratio
    return "999"
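# `sort_by_deliverable_parcent` (sic) is defined elsewhere in this repo; a
# minimal sketch, assuming `records` is a list of per-day dicts and the
# helper returns the highest value of `field` as a string -- the real
# selection logic may differ:
def sort_by_deliverable_parcent(records, field):
    values = [float(record[field]) for record in records if field in record]
    return str(max(values)) if values else "999"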
import json, csv, string
import nltk
import matplotlib.pyplot as plt
import utils as tech
from awesome_print import ap

punkt = set(string.punctuation)  # assumed definition; set earlier in the original file
stopwords = nltk.corpus.stopwords.words('english')  # assumed definition

upper_half = json.load(open('upper_half.json', 'rb'))
lower_half = json.load(open('lower_half.json', 'rb'))
comments = list(csv.DictReader(open('comments.csv', 'rb')))
comments = [comment for comment in comments
            if comment['Student Comment'] != 'None' and comment['Student Comment'] != 'NA']
upper_half_comments = [comment for comment in comments if comment['Name'] in upper_half]
lower_half_comments = [comment for comment in comments if comment['Name'] in lower_half]
ap('Upper len: %d' % len(upper_half_comments))
ap('Lower len: %d' % len(lower_half_comments))

upper_half_vocabulary = ' '.join(tech.cleanse(' '.join(comment['Student Comment'] for comment in upper_half_comments)))
lower_half_vocabulary = ' '.join(tech.cleanse(' '.join(comment['Student Comment'] for comment in lower_half_comments)))
upper_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(upper_half_vocabulary))
                    if word not in punkt and word not in stopwords]
lower_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(lower_half_vocabulary))
                    if word not in punkt and word not in stopwords]

upper_freqs = nltk.FreqDist(upper_half_words)
lower_freqs = nltk.FreqDist(lower_half_words)
print tech.weighted_jaccard_similarity(upper_freqs, lower_freqs)

fig, axs = plt.subplots(ncols=2)
for ax, data, label in zip(axs, [upper_freqs, lower_freqs], ['Completers', 'Non-completers']):
    words, freqs = zip(*data.most_common(20))
    # Assumed completion of the truncated loop body: bar chart of the top-20 counts.
    ax.bar(range(len(words)), freqs)
    ax.set_xticks(range(len(words)))
    ax.set_xticklabels(words, rotation='vertical')
    ax.set_title(label)
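# `tech.weighted_jaccard_similarity` and `to_ascii` live in the repo's utils
# module; minimal sketches of the behavior assumed above. Weighted Jaccard
# over two frequency distributions is sum(min counts) / sum(max counts):
def weighted_jaccard_similarity(one, two):
    vocabulary = set(one) | set(two)
    numerator = sum(min(one[word], two[word]) for word in vocabulary)
    denominator = sum(max(one[word], two[word]) for word in vocabulary)
    return numerator / float(denominator) if denominator else 0.0

def to_ascii(text):
    # Drop any non-ASCII characters before tokenizing (Python 2 str round-trip).
    return text.decode('utf-8', 'ignore').encode('ascii', 'ignore')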
import csv
import numpy as np
import nltk
import utils as tech
from awesome_print import ap
from nltk.util import ngrams
from matplotlib import rcParams

rcParams['text.usetex'] = True

data = list(csv.DictReader(open('comments.csv', 'rb')))
categories = ['Reporter', 'Interpreter', 'Manager', 'Superior']
data_by_category = {}
for category in categories:
    data_by_category[category] = {}
    comments = ' '.join(student['Student Comment'] for student in data
                        if student['Physician Comment'] == category)
    data_by_category[category]['comments'] = tech.cleanse(comments)
    data_by_category[category]['fdist'] = nltk.FreqDist(data_by_category[category]['comments'])
    tech.save_ngrams(data_by_category[category]['fdist'].most_common(50),
                     filename='comments-%s' % category.lower())
    data_by_category[category]['bigram.fdist'] = nltk.FreqDist(ngrams(data_by_category[category]['comments'], 2))
    tech.save_ngrams(data_by_category[category]['bigram.fdist'].most_common(50),
                     filename='comments-bigrams-%s' % category.lower())  # separate file so bigrams don't overwrite the unigram list
    data_by_category[category]['count.comments'] = len(comments)  # NB: length of the joined string, i.e. a character count
    data_by_category[category]['count.students'] = len([student['Student Comment'] for student in data
                                                        if student['Physician Comment'] == category])

jmat = np.array([[tech.jaccard_similarity(data_by_category[one]['comments'], data_by_category[two]['comments'])
                  for one in categories] for two in categories])
np.savetxt('calculated-jaccard-similarity.tsv', jmat, delimiter='\t', fmt='%.04f')
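# `tech.save_ngrams` is part of the repo's utils module; a minimal sketch,
# assuming it writes one "<ngram><TAB><count>" line per entry -- the real
# output format may differ:
def save_ngrams(ngram_counts, filename='ngrams'):
    with open('%s.tsv' % filename, 'w') as outfile:
        for gram, count in ngram_counts:
            gram = ' '.join(gram) if isinstance(gram, tuple) else gram
            print>>outfile, '%s\t%d' % (gram, count)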
import os, json
import numpy as np
import langid
import utils as tech

base_path = '/Volumes/My Book/twittwer-stream/control'
'''
data = []
for filename in os.listdir(base_path):
    with open(os.path.join(base_path, filename), 'rb') as fid:
        data += [json.load(fid)]
json.dump(data, open('/Volumes/My Book/twittwer-stream/amalgamated.json', 'wb'))
#--1 Classify
'''
data = json.load(open(os.path.join('/Volumes/My Book/twittwer-stream', 'amalgamated.json'), 'rb'))
#data = json.load(open('control_tweets.json','rb'))
text = tech.cleanse([tweet['text'] for tweet in data])  # Why duplicating one tweet from test corpus?
classifications = {}

def iqr(data):
    # Semi-interquartile range: half the distance between the quartiles.
    try:
        return 0.5 * (np.percentile(data, 75) - np.percentile(data, 25))
    except Exception:
        print data

def get(lst, field):
    return [item[field] for item in lst]

for i, tweet in enumerate(text):
    if langid.classify(' '.join(tweet))[0] == 'en':
        tweet, usernames, hashtags = tech.extract_tokens(tweet)
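# `tech.extract_tokens` is also from the utils module; a minimal sketch,
# assuming it partitions a tweet's tokens into plain words, @usernames, and
# #hashtags. (The ratings script later in this repo unpacks only two return
# values, so the real signature may vary between versions.)
def extract_tokens(tokens):
    usernames = [token for token in tokens if token.startswith('@')]
    hashtags = [token for token in tokens if token.startswith('#')]
    words = [token for token in tokens if not token.startswith(('@', '#'))]
    return words, usernames, hashtags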
import os, nltk, csv
import utils as tech
import matplotlib.pyplot as plt

CASE = os.path.join(os.getcwd(), 'data', 'case')
with open(os.path.join(CASE, 'combined-deduplicated-rated.csv'), 'r') as infile:
    items = [row for row in csv.reader(infile)]

TEXT = 0
RATING = 2
text = tech.cleanse([item[TEXT] for item in items])
tokens = [word for tweet in text for word in tweet]

with open('rule-in-tokens.txt', 'wb') as outfile:
    for token in set(tokens):
        print>>outfile, token

word_frequencies = nltk.FreqDist(tokens)
fig = plt.figure()
ax = fig.add_subplot(111)
words, freqs = zip(*word_frequencies.most_common(25))
ax.plot(freqs, 'k--', linewidth=2)
tech.adjust_spines(ax)
ax.set_xticks(range(len(words)))
ax.set_xticklabels(words, rotation='vertical', weight='bold')
ax.set_ylabel('Count')
plt.tight_layout()
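# `tech.adjust_spines` appears to follow the standard matplotlib
# spine-placement recipe; a minimal sketch, assuming it hides the top and
# right spines and pushes the remaining ones outward:
def adjust_spines(ax, spines=('left', 'bottom'), offset=10):
    for loc, spine in ax.spines.items():
        if loc in spines:
            spine.set_position(('outward', offset))
        else:
            spine.set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')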
import csv
from collections import Counter

def find_expansion():
    infile = open("abbreviations.txt", "r")
    abbreviations = []
    reduced_abbreviations = []
    all_initials = []

    # Strip newlines and any trailing dot.
    for line in infile:
        line = line.rstrip()
        if line[-1] == '.':
            line = line[:-1]
        abbreviations.append(line)

    # Drop duplicates that differ only by dots (keep the dotted form).
    # Could be made more efficient.
    for acronym in abbreviations:
        flag = 0
        for dacronym in abbreviations:
            if "." in dacronym and "." not in acronym and dacronym.replace(".", "") == acronym:
                flag = 1
        if flag == 0:
            reduced_abbreviations.append(acronym)
    # reduced_abbreviations = ['UPSC']

    # Remove redundancy.
    reduced_abbreviations = list(set(reduced_abbreviations))
    reduced_abbreviations.sort()

    # Tokenise each abbreviation into its initials; a trailing lowercase
    # letter stays with its capital (e.g. the "Sc" in "B.Sc").
    for word in reduced_abbreviations:
        initials = []
        i = 0
        while i < len(word):
            if word[i] == '.':
                i += 1
            if i + 1 < len(word) and word[i + 1].islower():
                initials.append(str(word[i] + word[i + 1]))
                i += 2
            else:
                initials.append(word[i])
                i += 1
        all_initials.append(initials)

    # Load the punctuation-cleansed descriptions.
    with open('abbreviation_data_set.csv') as csvfile:
        descriptions = []
        readCSV = csv.reader(csvfile, delimiter=',')
        for row in readCSV:
            if len(row) > 1:
                descriptions.append(cleanse(row[1]))

    counter = 0
    # all_initials = [['I', 'I', 'T'], ['I', 'T'], ['H', 'I', 'I']]
    # descriptions = ["hey indian institute technology and i i t science"]
    fields = ['Abbreviation', 'Expansions']
    with open('extracted_acronyms.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        for initials in all_initials:
            row = []
            m = len(initials)
            matchings = []
            for description in descriptions:
                words = description.split(' ')
                n = len(words)
                # Slide a window of m words and test it against the initials.
                for k in range(0, n - m + 1):
                    i = 0
                    for j in range(k, k + m):
                        if len(initials[i]) == 1:
                            if words[j][0] == initials[i][0].lower():
                                i += 1
                            else:
                                break
                        else:
                            if len(words[j]) > 1 and words[j][0] == initials[i][0].lower() and words[j][1] == initials[i][1]:
                                i += 1
                            else:
                                break
                    if i == m:  # every initial matched: record the window
                        matching = []
                        x = 0
                        while x < m:
                            matching.append(words[k + x])
                            x += 1
                        matchings.append(matching)
            for matching in matchings:
                row.append(' '.join(matching))
            # Rank candidate expansions by frequency; cap what gets written.
            cnt = Counter(row)
            added = 0
            row = [reduced_abbreviations[counter]]
            for tup in cnt.most_common():
                row.append(tup[0])
                added += 1
                if added > 20:
                    break
            csvwriter.writerow(row)
            counter += 1
    print(counter)
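# Worked example of the window matcher above, using the commented-out test
# data: with initials ['I', 'I', 'T'] and the description
# "hey indian institute technology and i i t science", the 3-word windows
# "indian institute technology" and "i i t" both match every initial, so
# both are recorded as candidate expansions for that abbreviation and
# ranked by frequency.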
import string, csv, itertools, nltk
import Graphics as artist
import utils as tech
from nltk.util import ngrams
from awesome_print import ap

data = list(csv.DictReader(open('comments.csv', 'rb')))
text = tech.cleanse(' '.join(record['Student Comment'] for record in data))
tech.savelines(text, filename='all-words-cleansed')

fdist = nltk.FreqDist(text)
tech.savelines(zip(*fdist.most_common(100)), filename='overall-frequencies-cleansed')
bigram_fdist = nltk.FreqDist(ngrams(text, 2))
tech.savelines(zip(*bigram_fdist.most_common(100)), filename='bigram-frequencies-cleansed')
artist.frequency_plot(fdist, filename='overall-frequency-distribution')
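# `tech.savelines` is the repo's generic one-item-per-line writer; a minimal
# sketch of the behavior the calls above assume:
def savelines(items, filename='out'):
    with open('%s.txt' % filename, 'w') as outfile:
        for item in items:
            print>>outfile, item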
import itertools
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import utils as tech

# RELEVANT_SPECIFIC / IRRELEVANT_SPECIFIC are token sets defined earlier in this file.
IRRELEVANT_SPECIFIC_SYNSETS = {synset for token in IRRELEVANT_SPECIFIC
                               for synset in wn.synsets(token)}

def assign_category(list_of_tokens, threshold=75):
    # Unfinished stub: the original body was `return np.percentile()` with no
    # arguments; judging by the signature it was meant to threshold token
    # scores at the given percentile.
    raise NotImplementedError

def score(list_of_tokens):
    # Fraction of distinct tokens that fall in the relevant vocabulary.
    if len(list_of_tokens) > 0:
        list_of_tokens = set(list_of_tokens)
        return len(RELEVANT_SPECIFIC & list_of_tokens) / float(len(list_of_tokens))
    else:
        return np.nan

#Don't forget to do the controls
## Could look for words sufficiently semantically similar to the tokens, first see if direct similarity works
test_corpus = tech.cleanse(set(item.split('|')[0] for item in
                               open('test-high-prevalence.txt', 'r').read().splitlines()))
#random.shuffle(test_corpus)
#print len(test_corpus)
words = set(nltk.word_tokenize(' '.join(itertools.chain.from_iterable(test_corpus))))
omitted = []
senses = []
irrelevant = []
for word in words:
    print word
    if len(wn.synsets(word)) > 0:
        # Fraction of this word's senses overlapping the (ir)relevant synsets.
        senses += [len(set(wn.synsets(word)) & RELEVANT_SPECIFIC_SYNSETS) / float(len(wn.synsets(word)))]
        irrelevant += [len(set(wn.synsets(word)) & IRRELEVANT_SPECIFIC_SYNSETS) / float(len(wn.synsets(word)))]
    else:
        omitted += [word]
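# Sanity check of score(): with a hypothetical RELEVANT_SPECIFIC of
# {'seizure', 'ictal'}, score(['seizure', 'had', 'a']) == 1/3, since one of
# the three distinct tokens is in the relevant vocabulary.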
import os, csv, itertools
import utils as tech

def jaccard_similarity(one, two):
    # Jaccard similarity of two token sets (the def line is truncated in the
    # excerpt; name restored from its use elsewhere in the repo).
    if len(one & two) == 0:
        return 0
    else:
        return len(one & two) / float(len(one | two))

CASE = os.path.join(os.getcwd(), 'data', 'case')  # as in the companion script
TEXT = 0
RATING = 2
with open(os.path.join(CASE, 'combined-deduplicated-rated.csv'), 'r') as infile:
    items = [row for row in csv.reader(infile)]

relevant, irrelevant = [], []
for item in items:
    if int(item[RATING]) == 1:
        relevant.append(item[TEXT])
    else:
        irrelevant.append(item[TEXT])

relevant = list(itertools.chain.from_iterable(tech.cleanse(relevant)))
relevant += ['purple']
irrelevant = list(itertools.chain.from_iterable(tech.cleanse(irrelevant)))
'''
Numerator 527
Denominator 3832
0.13752609603340293
'''
for key, value in [('relevant', relevant), ('irrelevant', irrelevant)]:
    with open('%s-tokens' % key, 'w') as outfile:
        for token in value:
            print>>outfile, token
import csv, random
import utils as tech
from nltk.classify import NaiveBayesClassifier

def extract_grade(student):
    return {'name': student['name'].split(',')[0].capitalize(),
            'grade': student['grade']}

ratings = [extract_grade(student) for student in csv.DictReader(open('data.csv', 'rb'))]
data = list(csv.DictReader(open('comments.csv', 'rb')))
data = [student for student in data
        if student['Physician Comment'] != 'NA'
        and student['Physician Comment'] != 'Cedar'
        and student['Physician Comment'] != 'X']
rating_names = [student['name'] for student in ratings]
data_names = list(set(student['Name'] for student in data))

# Cleanse text for classifying.
for i, student in enumerate(data):
    data[i]['Student Comment'] = tech.cleanse(student['Student Comment'])

# Split into testing and training sets.
n = len(data)
test_idx = random.sample(xrange(n), int(n * 0.5))
train_idx = set(xrange(n)) - set(test_idx)
test_set = filter(lambda item: item[1], map(extract_featurelabel, [data[i] for i in test_idx]))
train_set = filter(lambda item: item[1], map(extract_featurelabel, [data[i] for i in train_idx]))

classifier = NaiveBayesClassifier.train(train_set)

# Compute accuracy.
test_data, test_label = zip(*test_set)
train_data, train_label = zip(*train_set)
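# `extract_featurelabel` is defined elsewhere in the repo; a minimal sketch,
# assuming NLTK-style boolean bag-of-words features over the cleansed
# comment, labeled with the physician's category:
def extract_featurelabel(student):
    tokens = student['Student Comment']
    if isinstance(tokens, str):  # cleanse may return a string or a token list
        tokens = tokens.split()
    return (dict((token, True) for token in tokens), student['Physician Comment'])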
./data/source/keyword
'''
#---LOAD DATA
if not os.path.isfile(CORPUS_FILENAME):
    #More expressive than itertools.product, small loops --> no important speed or memory difference
    corpus = {}
    for source in sources:
        corpus[source] = {}
        for disease in keywords:
            path = os.path.join(base, source, disease)
            text = ' '.join(open(os.path.join(path, filename), READ).read()
                            for filename in os.listdir(path)
                            if not os.path.isdir(os.path.join(path, filename)))
            text = text.replace('.', ' ').replace("\n", " ")
            text = re.sub(r"[^\x00-\x7F]", "", text)  #Regexp faster than iterating through string to remove non-ASCII
            corpus[source][disease] = list(tech.cleanse(text))  #Cleanse returns type set. Type set is not JSON serializable. Type list is.
    json.dump(corpus, open(CORPUS_FILENAME, WRITE))
else:
    corpus = json.load(open(CORPUS_FILENAME, READ))

#--- CALCULATE JACCARD SIMILARITY
source_rubric = [[source for source in sources] for source in sources]
filenames = ['jaccard-similarity-%s' % disease for disease in keywords]
filenames += ['jaccard-similarities.json']
import langid
from nltk.corpus import wordnet as wn
import utils as tech

synsets = {'positive': {synset for token in informative_tokens['positive']['tokens']
                        for synset in wn.synsets(token)},
           'negative': {synset for token in informative_tokens['negative']['tokens']
                        for synset in wn.synsets(token)}}

data = [item.strip().split('|') for item in open('evaluation-rating', 'r').read().splitlines()]
tweets, my_ratings = zip(*[(item[0], int(item[2])) for item in data if len(item) > 2])

positive, negative = [], []
for i in xrange(len(tweets)):
    if langid.classify(tweets[i])[0] == 'en':
        if my_ratings[i] == 1:
            positive.append(tweets[i])
        else:
            negative.append(tweets[i])

positive, p_users = tech.extract_tokens([' '.join(tweet) for tweet in tech.cleanse(positive)])
negative, n_users = tech.extract_tokens([' '.join(tweet) for tweet in tech.cleanse(negative)])

# `tmp` aliases the stored list, so += extends it in place.
tmp = informative_tokens['positive']['tokens']
tmp += list(positive)
del tmp
tmp = informative_tokens['negative']['tokens']
tmp += list(negative)
del tmp
tmp = informative_tokens['positive']['usernames']
tmp += list(p_users)
del tmp
tmp = informative_tokens['positive']['usernames']