Example #1

ages = data['Cited Person Age']
min_age, max_age = int(np.min(ages)), int(np.max(ages))

# build one (height, bar_start, bar_end) tuple per age, each bar one unit wide
bars = []
for i in range(min_age, max_age + 1):
    height = len(ages[ages == i])  # number of citations issued at age i
    bars.append((height, i, i + 1))

hist = pygal.Histogram()
hist.add('ages', bars)
hist.render_to_file('histages2.svg')
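
# An equivalent, more compact way to build the same unit-width bars is
# np.bincount, which counts each integer value directly. A minimal sketch,
# assuming ages holds non-negative integers:
import numpy as np
import pygal

counts = np.bincount(ages.astype(int))   # counts[i] = occurrences of age i
bars = [(int(c), i, i + 1) for i, c in enumerate(counts) if c > 0]

hist = pygal.Histogram()
hist.add('ages', bars)
hist.render_to_file('histages2.svg')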

Example #2
def index():
    try:
        form = SearchForm()

        if form.validate_on_submit():
            
            tweets, text_for_cloud = get_tweets(form.searchword.data, form.number_of_results.data)

            # creating wordcloud and storing to static
            stop_words = ["https", "co", "RT"] + list(wordcloud.STOPWORDS)
            word_cloud = wordcloud.WordCloud(stopwords=stop_words, background_color="rgba(255, 255, 255, 0)", mode="RGBA")
            word_cloud.generate(text_for_cloud)
            # append a random number to the image name so browsers fetch the new file rather than a cached one
            random_part = str(random.randint(1,101))
            path_to_img = f'static/word_cloud{random_part}.png'
            # before saving the new file, delete any old images (.png format)
            # 1. Get a list of all file paths that end with .png in the specified directory
            fileList = glob.glob('static/word_cloud*.png')
            # 2. delete "old" png images
            for filePath in fileList:
                try:
                    os.remove(filePath)
                except OSError:
                    print("Error while deleting file:", filePath)
            word_cloud.to_file(path_to_img)

            # classify each tweet and bin the sentiment scores into ten 0.1-wide buckets
            bin_counts = [0] * 10
            for tweet in tweets:
                tweet['sentiment'] = classify(tweet['tweet_text'])
                bucket = min(max(int(tweet['sentiment'] * 10), 0), 9)
                bin_counts[bucket] += 1

            sorted_tweets = sorted(tweets, key=lambda t: t['sentiment'], reverse=True)

            # creating histogram
            from pygal.style import Style
            custom_style = Style(
                background='transparent',
                plot_background='transparent',
                colors=('#79bdd8', '#79bdd8'))
            graph = pygal.Histogram(show_legend=False,
                                    title=u'Distribution of the sentiment predictions',
                                    x_title='0 = most negative  1 = most positive',
                                    style=custom_style)
            graph.add('Narrow bars',
                      [(count, i / 10, (i + 1) / 10) for i, count in enumerate(bin_counts)])
            graph_data = graph.render_data_uri()
        
            return render_template('sort_by_sentiment_app.html', tweets=sorted_tweets, form=form, graph_data=graph_data, cloud_img_path=path_to_img)

        return render_template('sort_by_sentiment_app.html', form=form)
    except Exception as e:
        return str(e)
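
# render_data_uri() used above returns the chart as a base64-encoded SVG data
# URI, which a template can drop straight into an <embed> or <img> src.
# A minimal standalone sketch of that pattern:
import pygal

graph = pygal.Histogram(show_legend=False)
graph.add('Narrow bars', [(3, 0, 0.1), (5, 0.1, 0.2)])
uri = graph.render_data_uri()   # "data:image/svg+xml;charset=utf-8;base64,..."
print(uri[:40])
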
Example #3
ages_freq_list = pd.DataFrame({
    'age': ages_freq.index,
    'count': ages_freq.values
})
ages_freq_list.sort_values(['age'], ascending=True, inplace=True)
ages_freq_list['bar'] = (ages_freq_list['age'] + 1).astype(int)

# Creating a list of (height, start, end) tuples for the histogram plot
subset = ages_freq_list[['count', 'age', 'bar']]
tuples = [tuple(x) for x in subset.values]

# Histogram plot
hist = pygal.Histogram(legend_at_bottom=True,
                       human_readable=True,
                       title='first-quarter citations histogram',
                       x_title='Age',
                       y_title='No. of Violators')
hist.add('Narrow bars', tuples)
hist.render_to_file('histogram.svg')

# As we can see, there are some 116-year-old violators!
# This is probably an error in the data, so we can remove these points and plot the histogram again:

tuples = [i for i in tuples if i[1] < 100]

# Histogram plot without the outliers
hist = pygal.Histogram(legend_at_bottom=True,
                       human_readable=True,
                       title='first-quarter citations histogram',
                       x_title='Age',
                       y_title='No. of Violators')
hist.add('Narrow bars', tuples)
# the source is truncated here; the remainder presumably mirrors the first plot
hist.render_to_file('histogram.svg')
Example #4
# (snippet starts mid-script: LINE_ONE_ELEMS, user0data, timesWaitedFreq,
#  probOfTime, longestWait and MAKE_PERCENT are defined earlier)
i = LINE_ONE_ELEMS
for val in user0data:
    timesWaitedFreq[val] += 1

# calculate probabilities from freqs
for i in range(longestWait + 1):
    probOfTime[i] = (timesWaitedFreq[i] / len(user0data)) * MAKE_PERCENT

# load bar set for histo: one (probability, start, end) bar per wait time
histoBarSet = [(probOfTime[i], i, i + 1) for i in range(len(probOfTime))]

# make histo
pmf = pygal.Histogram(
    title=u'PMF | x-axis: time taken (ms), y-axis: probability (%)')
pmf.add('', histoBarSet)
pmf.render_to_file('q1.svg')

################################### (1) ii) ###################################

ones = 0
allConnections = 0
for i in range(len(timesWaitedFreq)):
    if i <= 10:
        ones += timesWaitedFreq[i]
    allConnections += timesWaitedFreq[i]
probOfOne = (ones / allConnections) * MAKE_PERCENT
print("Probability(X = 1): ", probOfOne)

Example #5
def analyse(object_array, filename, profilesDict, logFile, outputDir):
    """Take the csv file and add each line as an ISOLATE object
    into object_array
    """
    from classex1 import ISOLATE
    from collections import defaultdict
    import pygal
    from math import log
    import os
    import numpy as np

    leftDict = {}
    for i in profilesDict:
        if len(profilesDict[i]) > 2:
            leftDict[profilesDict[i][0]] = [i, 'profile1']
            leftDict[profilesDict[i][1]] = [i, 'profile2']
        else:
            leftDict[profilesDict[i][0]] = [i, 'SINGLE']
    hmmerOutput = open(filename, 'r')

    E_counter = 0
    good_counter = 0
    total_counter = 0
    e_values = defaultdict(list)  # list of e_values
    counter = 0  # counter to miss first line
    # everything in the table, matched or unmatched, with an e-value over 0.00005
    for item in hmmerOutput:
        counter += 1
        parts = item.split(",")
        total_counter += 1
        profileName = parts[0].replace('\n', '')
        if counter > 1 and profileName in leftDict:
            e_values[profileName].append(-log(float(parts[11])))
        if counter > 1 and float(parts[11]) < 0.00005 and profileName in leftDict:
            good_counter += 1
            y = ISOLATE()
            y.found = []
            id_parts = parts[3].split('_')
            y.identifier = '_'.join(id_parts[:2])
            y.HNHstart = int(parts[17])
            y.HNHstop = int(parts[18])
            y.HNHcontig = parts[3]
            y.coltype = leftDict[profileName][0]
            y.found.append(parts[0].replace("\n", ""))
            y.contiglength = int(parts[5])
            y.MATCH = leftDict[profileName][1]
            y.found.append(parts[0].replace("\n", ""))
            object_array.append(y)
        else:
            E_counter += 1


    hmmerOutput.close()
    histDict = defaultdict(list)

    # Plot the e-values as a histogram; np.histogram's output needs reshaping
    # into pygal's (height, bar_start, bar_end) format
    for profile in e_values:
        binsN = min(len(e_values[profile]), 100)
        counts, edges = np.histogram(e_values[profile], bins=binsN)
        histDict[profile] = list(zip(counts, edges[:-1], edges[1:]))
    hist = pygal.Histogram(stroke=False, x_title='E value (-log)')
    hist.title = 'E-value distribution'
    for profile in histDict:
        hist.add(profile, histDict[profile])

    EvaluePath = os.path.join(outputDir, 'E_valueHist.svg')
    hist.render_to_file(EvaluePath)

    logFile.EvalueString(total_counter=total_counter,
                         E_counter=E_counter,
                         good_counter=good_counter,
                         Evaluepygal=EvaluePath)

    return object_array
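
# The np.histogram -> pygal reshaping above generalises to a small helper.
# A standalone sketch (the helper name and sample data are illustrative only):
import numpy as np
import pygal

def to_pygal_bars(values, max_bins=100):
    """Bin values with np.histogram and return pygal (height, start, end) tuples."""
    counts, edges = np.histogram(values, bins=min(len(values), max_bins))
    return list(zip(counts, edges[:-1], edges[1:]))

hist = pygal.Histogram(stroke=False, x_title='E value (-log)')
hist.add('example', to_pygal_bars([1.2, 3.4, 2.2, 5.1, 3.3]))
hist.render_to_file('E_valueHist_example.svg')
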
Example #6
def makehist(x, df):
    # NOTE: pygal.Histogram expects (height, bar_start, bar_end) tuples,
    # so passing the raw column df[x] here will not draw meaningful bars
    hist = pygal.Histogram()
    hist.add(x, df[x])
    return hist.render_data_uri()
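
# A hedged corrected sketch that bins the column first (the helper name and
# the default of 10 bins are assumptions, not from the source):
import numpy as np
import pygal

def makehist_binned(x, df, bins=10):
    counts, edges = np.histogram(df[x].dropna(), bins=bins)
    hist = pygal.Histogram()
    hist.add(x, list(zip(counts, edges[:-1], edges[1:])))
    return hist.render_data_uri()
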
Example #7
import pandas as pd
import matplotlib.pyplot as plt
import pygal
data = pd.read_csv('2016-first-quarter-citations.csv')

#Dropping rows that has any missing values
data = data.dropna(how='any')
data.shape

# getting Age into a pandas Series
age = data['Cited Person Age']
#maximum age
max(age)

#minimum age
min(age)

# Creating the (height, start location, end location) tuples for the pygal Histogram,
# covering ages from 0 up to the maximum value of Age (117)
agelist = [0] * 120
for a in age:
    agelist[int(a)] += 1

agetuples = []
for i in range(0, 117):
    agetuples.append((agelist[i], i, i + 1))

hist = pygal.Histogram(title=u'Histogram of Citations across Age - Q1 2016',
                       x_title='Age', y_title="Total Citations",
                       show_legend=False, tooltip_border_radius=10)
hist.add("Age Histogram", agetuples[:116])
hist.render_to_file('ageHistogram-corrected.svg')

Example #8

from __future__ import division, print_function
import pygal

# (the opening of this example is truncated in the source: probs_2d10 holds
# the chance, in percent, of rolling at least n on 2d12, and the entries for
# n = 1..17 are missing here)
probs_2d10 = (
    # ...
    19.4,  # 18
    14.6,  # 19
    10.4,  # 20
    6.9,  # 21
    4.2,  # 22
    2.1,  # 23
    0.7,  # 24    
)

probs_d20 = [100.0 - (x - 1) * 5 for x in range(1, 21)] + [0.0, 0.0, 0.0, 0.0]

bars_2d12 = []
bars_d20 = []
for i in range(1, 25):  # 25 so the n = 24 bar is included
    bars_2d12.append((probs_2d10[i - 1], i - 0.5, i + 0.5))
    bars_d20.append((probs_d20[i - 1], i - 0.5, i + 0.5))

hist = pygal.Histogram(
    human_readable=True,
    title="Probability of Getting At Least n on a Roll; 2d12 vs d20",
    x_title="n",
    y_title="Probability",
    legend_at_bottom=True,
    legend_at_bottom_columns=2,
)

hist.add('2d12', bars_2d12)
hist.add('d20', bars_d20)
hist.render_to_png('to_hit_2d12_vs_d20.png')
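
# The truncated probs_2d10 table (named 2d10 in the source but holding 2d12
# data) can be regenerated exactly: 2d12 has 144 equally likely outcomes, so
# P(sum >= n) is a simple count. A sketch that rebuilds the full table:
def at_least_2d12(n):
    pairs = sum(1 for a in range(1, 13) for b in range(1, 13) if a + b >= n)
    return 100.0 * pairs / 144

probs_2d12 = tuple(round(at_least_2d12(n), 1) for n in range(1, 25))
print(probs_2d12[17:])  # (19.4, 14.6, 10.4, 6.9, 4.2, 2.1, 0.7)
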
Example #9
from __future__ import division, print_function
from datetime import datetime  # needed for the strptime/strftime calls below
import numpy as np
import pandas as pd
data = pd.read_csv('2016-first-quarter-citations.csv')
data = data.dropna(how='any')
data.shape
data['DateTime Issued'] = data.apply(lambda row: datetime.strptime(
    row['Date Issued'] + ':' + row['Time Issued'], '%m/%d/%y:%I:%M %p'),
                                     axis=1)
data['Day of Week Issued'] = data.apply(
    lambda row: datetime.strftime(row['DateTime Issued'], '%A'), axis=1)
ages = data['Cited Person Age']
age_list = []
# one unit-wide bin per age from 16 to 116; the last bin is closed on the right
for i in range(100):
    if i == 99:
        age_list.append(
            (len(ages[(ages >= i + 16) & (ages <= i + 17)]), i + 16, i + 17))
    else:
        age_list.append(
            (len(ages[(ages >= i + 16) & (ages < i + 17)]), i + 16, i + 17))

age_list
import pygal as pg
hist = pg.Histogram()
hist.add('Age Histogram', age_list)
hist.render_to_file('hist_chart.svg')
Example #10
def graph2():
    # assumes the surrounding module does `import pygal as py` and imports flask's Response
    hist = py.Histogram()
    hist.add('Wide bars', [(5, 0, 10), (4, 5, 13), (2, 0, 15)])
    hist.add('Narrow bars', [(10, 1, 2), (12, 4, 4.5), (8, 11, 13)])
    return Response(response=hist.render(), content_type='image/svg+xml')
Example #11
def hello():
    hashtag = request.form.get('hashtag')
    print(hashtag)
    global corpus

    # In[25]:

    # http://stackoverflow.com/a/13752628/6762004
    RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)

    def strip_emoji(text):
        return RE_EMOJI.sub(r'', text)

    # In[41]:

    print("getting the tweets")
    arr = []
    counter = 0
    #### input your credentials here
    consumer_key = 'YOUR_CONSUMER_KEY'
    consumer_secret = 'YOUR_CONSUMER_SECRET'
    access_token = 'YOUR_ACCESS_TOKEN'
    access_token_secret = 'YOUR_ACCESS_TOKEN_SECRET'

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)
    #####United Airlines
    # Open/Create a file to append data
    # csvFile = open('ua3.csv', 'a')
    #Use csv Writer
    # csvWriter = csv.writer(csvFile)

    for tweet in tweepy.Cursor(api.search,
                               q='#' + hashtag,
                               count=100,
                               lang="hi",
                               since="2017-04-03").items():
        print(tweet.created_at, tweet.text)
        counter = counter + 1
        if (counter == 100):
            break
        arr.append(translator.translate(strip_emoji(tweet.text)).text)
    #     csvWriter.writerow([tweet.created_at, translator.translate(strip_emoji(tweet.text)).text.encode("utf-8")])

    # In[42]:

    stop_words = set(stopwords.words("english"))  # hoisted so the list is not re-read per word
    for i in range(0, len(arr)):
        review = re.sub("[^a-zA-Z]", " ", arr[i])
        review = review.lower().split()
        review = ' '.join(ps.stem(word) for word in review if word not in stop_words)
        corpus.append(review)

    # In[43]:

    cv = CountVectorizer(max_features=1500)
    x = cv.fit_transform(corpus).toarray()
    y = dataset.iloc[:, 1].values
    # X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 0)
    X_train = x[:-len(arr)]
    X_test = x[-len(arr):]
    y_train = y
    # Fitting a Random Forest classifier to the training set
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)

    # Predicting the Test set results
    # y_pred = classifier.predict(X_test)

    # # Making the Confusion Matrix
    # from sklearn.metrics import confusion_matrix
    # cm = confusion_matrix(y_test, y_pred)
    arr2 = classifier.predict(X_test)
    # print(arr2)

    # In[44]:

    arr2 = arr2.tolist()

    # In[45]:

    count1 = arr2.count(1)
    count_1 = arr2.count(-1)
    count_neu = arr2.count(0)

    # In[46]:

    print("pos=" + str(count1) + "  neg=" + str(count_1) + "  neu=" +
          str(count_neu))

    # In[47]:

    neg = count_1
    pos = count1
    neu = count_neu

    # In[48]:
    classifier = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test)
    arr2 = arr2.tolist()
    count1KNN = arr2.count(1)
    count_1KNN = arr2.count(-1)
    count_neuKNN = arr2.count(0)

    classifier = SVC(kernel="linear", random_state=0)
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test)
    arr2 = arr2.tolist()
    count1SVC = arr2.count(1)
    count_1SVC = arr2.count(-1)
    count_neuSVC = arr2.count(0)
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test)
    arr2 = arr2.tolist()
    count1NB = arr2.count(1)
    count_1NB = arr2.count(-1)
    count_neuNB = arr2.count(0)
    classifier = DecisionTreeClassifier()
    classifier.fit(X_train, y_train)
    arr2 = classifier.predict(X_test)
    arr2 = arr2.tolist()
    count1DTC = arr2.count(1)
    count_1DTC = arr2.count(-1)
    count_neuDTC = arr2.count(0)

    corpus = corpus[:-len(arr)]
    hist = pygal.Histogram()
    hist.add('Results', [(neg, 0, 10), (neu, 10, 20), (pos, 20, 30)])
    graph_data = hist.render()

    return render_template("home.html",
                           graph_data=graph_data,
                           count1DTC=count1DTC,
                           count_1DTC=count_1DTC,
                           count_neuDTC=count_neuDTC,
                           count1NB=count1NB,
                           count_1NB=count_1NB,
                           count_neuNB=count_neuNB,
                           count1SVC=count1SVC,
                           count_1SVC=count_1SVC,
                           count_neuSVC=count_neuSVC,
                           count1KNN=count1KNN,
                           count_1KNN=count_1KNN,
                           count_neuKNN=count_neuKNN)

    print("hello")
Example #12
def __init__(self):
    self.chart = pygal.Histogram(title='Quotes by Rating',
                                 margin=20,
                                 show_legend=False,
                                 style=style)

Example #13

from __future__ import division, print_function
import numpy as np
import pandas as pd
import pygal as pg

data = pd.read_csv('2016-first-quarter-citations.csv')
data = data.dropna(how='any')
ages = [int(x) for x in data['Cited Person Age'] if x < 100]
age = list()
min_age = min(ages)
max_age = max(ages)
print(min_age)
print(max_age)
for x in range(min_age, max_age + 1):  # + 1 so the oldest age gets a bar too
    w = ages.count(x)
    age.append((w, x, x + 1))
lc = pg.Histogram()
lc.title = 'Age histogram'
lc.add('age', age)
lc.render_in_browser()
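
# The per-age counting above is quadratic in the data size; collections.Counter
# does the same work in a single pass. A minimal equivalent sketch:
from collections import Counter

freq = Counter(ages)
age = [(freq[x], x, x + 1) for x in range(min_age, max_age + 1)]

lc = pg.Histogram()
lc.title = 'Age histogram'
lc.add('age', age)
lc.render_in_browser()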