ages = data['Cited Person Age']
min_age, max_age = int(np.min(ages)), int(np.max(ages))

# Build the (height, bar_start, bar_end) tuples that pygal's Histogram expects
bars = []
for i in range(min_age, max_age + 1):
    height = len(ages[ages == i])
    bars.append((height, i, i + 1))

hist = pygal.Histogram()
hist.add('ages', bars)
hist.render_to_file('histages2.svg')
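# The counting loop above can also be done with numpy; a minimal sketch
# under the same assumptions (integer ages, np and pygal already imported;
# the output filename is illustrative):
counts, edges = np.histogram(ages, bins=max_age - min_age + 1,
                             range=(min_age, max_age + 1))
np_bars = [(int(counts[i]), float(edges[i]), float(edges[i + 1]))
           for i in range(len(counts))]
hist2 = pygal.Histogram()
hist2.add('ages', np_bars)
hist2.render_to_file('histages2_np.svg')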
def index():
    try:
        form = SearchForm()
        if form.validate_on_submit():
            tweets, text_for_cloud = get_tweets(form.searchword.data,
                                                form.number_of_results.data)

            # Create the word cloud and store it under static/
            stop_words = ["https", "co", "RT"] + list(wordcloud.STOPWORDS)
            word_cloud = wordcloud.WordCloud(stopwords=stop_words,
                                             background_color="rgba(255, 255, 255, 0)",
                                             mode="RGBA")
            word_cloud.generate(text_for_cloud)

            # Append a random number to the image name so the browser fetches
            # the new image instead of serving a cached one.
            random_part = str(random.randint(1, 101))
            path_to_img = f'static/word_cloud{random_part}.png'

            # Delete any old word-cloud images (.png) before saving the new one
            for file_path in glob.glob('static/word_cloud*.png'):
                try:
                    os.remove(file_path)
                except OSError:
                    print("Error while deleting file:", file_path)
            word_cloud.to_file(path_to_img)

            # Classify each tweet's sentiment and count the results into ten
            # equal-width bins over [0, 1].
            bin_counts = [0] * 10
            for tweet in tweets:
                tweet['sentiment'] = classify(tweet['tweet_text'])
                bin_index = min(max(int(tweet['sentiment'] * 10), 0), 9)
                bin_counts[bin_index] += 1
            sorted_tweets = sorted(tweets, key=lambda t: t['sentiment'],
                                   reverse=True)

            # Create the histogram
            from pygal.style import Style
            custom_style = Style(background='transparent',
                                 plot_background='transparent',
                                 colors=('#79bdd8', '#79bdd8'))
            graph = pygal.Histogram(show_legend=False,
                                    title=u'Distribution of the sentiment predictions',
                                    x_title='0 = most negative 1 = most positive',
                                    style=custom_style)
            graph.add('Narrow bars',
                      [(bin_counts[i], i / 10, (i + 1) / 10) for i in range(10)])
            graph_data = graph.render_data_uri()

            return render_template('sort_by_sentiment_app.html',
                                   tweets=sorted_tweets, form=form,
                                   graph_data=graph_data,
                                   cloud_img_path=path_to_img)
        return render_template('sort_by_sentiment_app.html', form=form)
    except Exception as e:
        return str(e)
ages_freq_list = pd.DataFrame({'age': ages_freq.index,
                               'count': ages_freq.values})
ages_freq_list.sort_values(['age'], ascending=True, inplace=True)
ages_freq_list['bar'] = (ages_freq_list['age'] + 1).astype(int)

# Create the list of (count, bar_start, bar_end) tuples for the histogram
subset = ages_freq_list[['count', 'age', 'bar']]
tuples = [tuple(x) for x in subset.values]

# Histogram plot
hist = pygal.Histogram(legend_at_bottom=True, human_readable=True,
                       title='first-quarter citations histogram',
                       x_title='Age', y_title='No. of Violators')
hist.add('Narrow bars', tuples)
hist.render_to_file('histogram.svg')

# As we can see, there are some 116-year-old violators! This is probably
# an error in the data, so we can remove those points and plot again:
tuples = [i for i in tuples if i[1] < 100]

# Histogram plot without the outliers (the original snippet is truncated
# mid-call here; the continuation below is assumed to mirror the first plot)
hist = pygal.Histogram(legend_at_bottom=True, human_readable=True,
                       title='first-quarter citations histogram',
                       x_title='Age', y_title='No. of Violators')
hist.add('Narrow bars', tuples)
hist.render_to_file('histogram.svg')
i = LINE_ONE_ELEMS
for val in user0data:
    timesWaitedFreq[val] += 1

# calculate probabilities from the frequencies
for i in range(longestWait + 1):
    probOfTime[i] = (timesWaitedFreq[i] / len(user0data)) * MAKE_PERCENT

# load the bar set for the histogram
histoBarSet = [(0, 0, 0)] * len(user0data)
for i in range(len(probOfTime)):
    histoBarSet[i] = (probOfTime[i], i, i + 1)

# make the histogram
pmf = pygal.Histogram(
    title=u'PMF | x-axis: time taken (ms), y-axis: probability (%)')
pmf.add('', histoBarSet)
pmf.render_to_file('q1.svg')

################################### (1) ii) ###################################
ones = 0
allConnections = 0
for i in range(len(timesWaitedFreq)):
    if i <= 10:
        ones += timesWaitedFreq[i]
    allConnections += timesWaitedFreq[i]
probOfOne = (ones / allConnections) * MAKE_PERCENT
print("Probability(X = 1):", probOfOne)
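# Quick sanity check (a sketch, assuming MAKE_PERCENT == 100 and that
# probOfTime covers every observed wait time): the PMF heights should
# sum to 100% up to floating-point error.
assert abs(sum(probOfTime) - MAKE_PERCENT) < 1e-6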
def analyse(object_array, filename, profilesDict, logFile, outputDir):
    """Take the csv file and add each line as an ISOLATE object into
    object_array."""
    from classex1 import ISOLATE
    from collections import defaultdict
    from math import log
    import os
    import numpy as np
    import pygal

    leftDict = {}
    for i in profilesDict:
        if len(profilesDict[i]) > 2:
            leftDict[profilesDict[i][0]] = [i, 'profile1']
            leftDict[profilesDict[i][1]] = [i, 'profile2']
        else:
            leftDict[profilesDict[i][0]] = [i, 'SINGLE']

    hmmerOutput = open(filename, 'r')
    E_counter = 0
    good_counter = 0
    total_counter = 0
    e_values = defaultdict(list)  # lists of -log(E-value) per profile
    counter = 0  # counter used to skip the first (header) line

    # Everything in the table, matched or unmatched, with an E-value over
    # 0.00005 is counted but not kept.
    for item in hmmerOutput:
        counter += 1
        parts = item.split(",")
        total_counter += 1
        profileName = parts[0].replace('\n', '')
        if counter > 1 and profileName in leftDict:
            e_values[profileName].append(-log(float(parts[11])))
        if (counter > 1 and float(parts[11]) < 0.00005
                and profileName in leftDict):
            good_counter += 1
            y = ISOLATE()
            y.found = []
            id_parts = parts[3].split('_')
            y.identifier = '_'.join(id_parts[:2])
            y.HNHstart = int(parts[17])
            y.HNHstop = int(parts[18])
            y.HNHcontig = parts[3]
            y.coltype = leftDict[profileName][0]
            y.found.append(parts[0].replace("\n", ""))
            y.contiglength = int(parts[5])
            y.MATCH = leftDict[profileName][1]
            y.found.append(parts[0].replace("\n", ""))
            object_array.append(y)
        else:
            E_counter += 1
    hmmerOutput.close()

    # Plot the E-values as a histogram. numpy's histogram output needs a
    # bit of reformatting for pygal's (height, start, end) bar format.
    histDict = defaultdict(list)
    for profile in e_values:
        binsN = min(len(e_values[profile]), 100)
        counts, edges = np.histogram(e_values[profile], bins=binsN)
        for i in range(len(counts)):
            histDict[profile].append((counts[i], edges[i], edges[i + 1]))

    hist = pygal.Histogram(stroke=False, x_title='E value (-log)')
    hist.title = 'E-value distribution'
    for profile in histDict:
        hist.add(profile, histDict[profile])
    EvaluePath = os.path.join(outputDir, 'E_valueHist.svg')
    hist.render_to_file(EvaluePath)

    logFile.EvalueString(total_counter=total_counter, E_counter=E_counter,
                         good_counter=good_counter, Evaluepygal=EvaluePath)
    return object_array
def makehist(x, df):
    # Note: pygal's Histogram expects each bar as a (height, bar_start,
    # bar_end) tuple, so df[x] is assumed to hold such tuples already.
    hist = pygal.Histogram()
    hist.add(x, df[x])
    return hist.render_data_uri()
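# A sketch of the same helper for a raw numeric column, binning it first
# (the function name, bin count, and use of numpy are assumptions, not
# from the source):
import numpy as np
import pygal

def makehist_from_raw(x, df, bins=20):
    counts, edges = np.histogram(df[x].dropna(), bins=bins)
    bars = [(int(counts[i]), float(edges[i]), float(edges[i + 1]))
            for i in range(len(counts))]
    hist = pygal.Histogram()
    hist.add(x, bars)
    return hist.render_data_uri()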
import pandas as pd
import pygal

data = pd.read_csv('2016-first-quarter-citations.csv')

# Drop rows that have any missing values
data = data.dropna(how='any')
data.shape

# Get the ages as a Series
age = data['Cited Person Age']
max(age)  # maximum age
min(age)  # minimum age

# Create the (height, bar_start, bar_end) tuples for the pygal Histogram,
# from 0 up to the maximum age (117)
agelist = [0] * 120
for a in age:
    agelist[int(a)] += 1
agetuples = []
for i in range(0, 117):
    agetuples.append((agelist[i], i, i + 1))

hist = pygal.Histogram(title=u'Histogram of Citations across Age - Q1 2016',
                       x_title='Age', y_title="Total Citations",
                       show_legend=False, tooltip_border_radius=10)
hist.add("Age Histogram", agetuples[:116])
hist.render_to_file('ageHistogram-corrected.svg')
    19.4,  # 18
    14.6,  # 19
    10.4,  # 20
    6.9,   # 21
    4.2,   # 22
    2.1,   # 23
    0.7,   # 24
)
probs_d20 = [100.0 - (x - 1) * 5 for x in range(1, 21)] + [0.0, 0.0, 0.0, 0.0]

bars_2d12 = []
bars_d20 = []
for i in range(1, 24):
    bars_2d12.append((probs_2d10[i - 1], i - 0.5, i + 0.5))
    bars_d20.append((probs_d20[i - 1], i - 0.5, i + 0.5))

hist = pygal.Histogram(
    human_readable=True,
    title="Probability of Getting At Least n on a Roll; 2d12 vs d20",
    x_title="n",
    y_title="Probability",
    legend_at_bottom=True,
    legend_at_bottom_columns=2,
)
hist.add('2d12', bars_2d12)
hist.add('d20', bars_d20)
hist.render_to_png('to_hit_2d12_vs_d20.png')
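# For reference, the hard-coded 2d12 percentages above can be reproduced
# exactly by enumerating all 144 outcomes; a minimal sketch (the helper
# name at_least_2d12 is an illustration, not from the source):
from itertools import product

def at_least_2d12(n):
    """Percent chance that the sum of two d12 rolls is at least n."""
    rolls = [a + b for a, b in product(range(1, 13), repeat=2)]
    return 100.0 * sum(r >= n for r in rolls) / len(rolls)

# e.g. at_least_2d12(24) == 0.694..., matching the 0.7 entry above.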
from __future__ import division, print_function
from datetime import datetime

import numpy as np
import pandas as pd
import pygal as pg

data = pd.read_csv('2016-first-quarter-citations.csv')
data = data.dropna(how='any')
data.shape

data['DateTime Issued'] = data.apply(
    lambda row: datetime.strptime(
        row['Date Issued'] + ':' + row['Time Issued'], '%m/%d/%y:%I:%M %p'),
    axis=1)
data['Day of Week Issued'] = data.apply(
    lambda row: datetime.strftime(row['DateTime Issued'], '%A'), axis=1)

ages = data['Cited Person Age']
bin_size = (np.max(ages) - np.min(ages)) / 100  # computed but unused below

# Build 100 width-one bins starting at age 16; the last bin is closed on
# the right so the maximum age is included.
age_list = []
for i in range(100):
    if i == 99:
        age_list.append(
            (len(ages[(ages >= i + 16) & (ages <= i + 17)]), i + 16, i + 17))
    else:
        age_list.append(
            (len(ages[(ages >= i + 16) & (ages < i + 17)]), i + 16, i + 17))

hist = pg.Histogram()
hist.add('Age Histogram', age_list)
hist.render_to_file('hist_chart.svg')
def graph2():
    hist = py.Histogram()
    hist.add('Wide bars', [(5, 0, 10), (4, 5, 13), (2, 0, 15)])
    hist.add('Narrow bars', [(10, 1, 2), (12, 4, 4.5), (8, 11, 13)])
    return Response(response=hist.render(), content_type='image/svg+xml')
def hello():
    hashtag = request.form.get('hashtag')
    print(hashtag)
    global corpus

    # http://stackoverflow.com/a/13752628/6762004
    RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)

    def strip_emoji(text):
        return RE_EMOJI.sub(r'', text)

    print("getting the tweets")
    arr = []
    counter = 0

    # Input your credentials here (placeholders; keep real keys out of source)
    consumer_key = 'YOUR_CONSUMER_KEY'
    consumer_secret = 'YOUR_CONSUMER_SECRET'
    access_token = 'YOUR_ACCESS_TOKEN'
    access_token_secret = 'YOUR_ACCESS_TOKEN_SECRET'

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)

    # Fetch up to 100 Hindi tweets for the hashtag, strip emoji, and
    # translate each one to English.
    for tweet in tweepy.Cursor(api.search, q='#' + hashtag, count=100,
                               lang="hi", since="2017-04-03").items():
        print(tweet.created_at, tweet.text)
        counter = counter + 1
        if counter == 100:
            break
        arr.append(translator.translate(strip_emoji(tweet.text)).text)

    # Clean, lowercase, remove stopwords, and stem each tweet before
    # appending it to the corpus.
    for i in range(len(arr)):
        review = re.sub("[^a-zA-Z]", " ", arr[i])
        review = review.lower().split()
        review = [ps.stem(word) for word in review
                  if word not in stopwords.words("english")]
        review = ' '.join(review)
        corpus.append(review)

    # Vectorise the corpus; the last len(arr) rows are the new tweets.
    cv = CountVectorizer(max_features=1500)
    x = cv.fit_transform(corpus).toarray()
    y = dataset.iloc[:, 1].values
    X_train = x[:-len(arr)]
    X_test = x[-len(arr):]
    y_train = y

    # Random forest predictions
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test).tolist()
    count1 = arr2.count(1)
    count_1 = arr2.count(-1)
    count_neu = arr2.count(0)
    print("pos=" + str(count1) + " neg=" + str(count_1) +
          " neu=" + str(count_neu))
    neg = count_1
    pos = count1
    neu = count_neu

    # KNN predictions
    classifier = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test).tolist()
    count1KNN = arr2.count(1)
    count_1KNN = arr2.count(-1)
    count_neuKNN = arr2.count(0)

    # Linear SVM predictions
    classifier = SVC(kernel="linear", random_state=0)
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test).tolist()
    count1SVC = arr2.count(1)
    count_1SVC = arr2.count(-1)
    count_neuSVC = arr2.count(0)

    # Naive Bayes predictions
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test).tolist()
    count1NB = arr2.count(1)
    count_1NB = arr2.count(-1)
    count_neuNB = arr2.count(0)

    # Decision tree predictions
    classifier = DecisionTreeClassifier()
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test).tolist()
    count1DTC = arr2.count(1)
    count_1DTC = arr2.count(-1)
    count_neuDTC = arr2.count(0)

    # Trim the new tweets back out of the shared corpus
    corpus = corpus[:-len(arr)]

    hist = pygal.Histogram()
    hist.add('Results', [(neg, 0, 10), (neu, 10, 20), (pos, 20, 30)])
    graph_data = hist.render()
    return render_template("home.html", graph_data=graph_data,
                           count1DTC=count1DTC, count_1DTC=count_1DTC,
                           count_neuDTC=count_neuDTC, count1NB=count1NB,
                           count_1NB=count_1NB, count_neuNB=count_neuNB,
                           count1SVC=count1SVC, count_1SVC=count_1SVC,
                           count_neuSVC=count_neuSVC, count1KNN=count1KNN,
                           count_1KNN=count_1KNN, count_neuKNN=count_neuKNN)

print("hello")
def __init__(self):
    self.chart = pygal.Histogram(title='Quotes by Rating', margin=20,
                                 show_legend=False, style=style)
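# `style` above is defined elsewhere in the source; a minimal sketch of a
# pygal Style that would satisfy it (the colour choices are assumptions):
from pygal.style import Style

style = Style(background='transparent',
              plot_background='transparent',
              colors=('#79bdd8',))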
from __future__ import division, print_function
import pandas as pd
import pygal as pg

data = pd.read_csv('2016-first-quarter-citations.csv')
data = data.dropna(how='any')

# Keep only plausible ages (drops the 100+ outliers)
ages = [int(x) for x in data['Cited Person Age'] if x < 100]

age = list()
min_age = min(ages)
max_age = max(ages)
print(min_age)
print(max_age)

# Count the citations at each age, including max_age itself
for x in range(min_age, max_age + 1):
    w = ages.count(x)
    age.append((w, x, x + 1))

lc = pg.Histogram()
lc.title = 'Age histogram'
lc.add('age', age)
lc.render_in_browser()