Example #1
def SubmitFile(selection):
    # Imports the loaded file using the insert method chosen in the UI;
    # >fileName< comes from the enclosing scope
    fName = fileName
    if fName == '':
        messagebox.showerror('ERROR', 'No file loaded!')
        return
    if selection == '1':
        st = dB.LoadDataInsert(fName)
        if st == 'Success':
            messagebox.showinfo('Success', 'You successfully imported ' + os.path.basename(fName))
        else:
            messagebox.showerror(st, os.path.basename(fName) + ' was NOT successfully imported!')
    elif selection == '2':
        st = dB.SingleInsert(fName)
        if st == 'Success':
            messagebox.showinfo('Success', 'You successfully imported ' + os.path.basename(fName))
        else:
            messagebox.showerror(st, os.path.basename(fName) + ' was NOT successfully imported!')
    elif selection == '3':
        st = dB.MultiRowInsert(fName)
        if st == 'Success':
            messagebox.showinfo('Success', 'You successfully imported ' + os.path.basename(fName))
        else:
            messagebox.showerror(st, os.path.basename(fName) + ' was NOT successfully imported!')
    else:
        messagebox.showerror('Error', 'No method chosen!')
        return
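The three branches differ only in which insert routine runs; a dispatch-table variant keeps the messages in one place (a sketch using the same dB, fileName, and messagebox names from the snippet above):
def SubmitFileDispatch(selection):
    # Maps the combobox selection to the matching insert routine
    handlers = {'1': dB.LoadDataInsert, '2': dB.SingleInsert, '3': dB.MultiRowInsert}
    fName = fileName
    if fName == '':
        messagebox.showerror('ERROR', 'No file loaded!')
        return
    handler = handlers.get(selection)
    if handler is None:
        messagebox.showerror('Error', 'No method chosen!')
        return
    st = handler(fName)
    if st == 'Success':
        messagebox.showinfo('Success', 'You successfully imported ' + os.path.basename(fName))
    else:
        messagebox.showerror(st, os.path.basename(fName) + ' was NOT successfully imported!')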
def PrintTagCloud(seg, terms, td_scores):
    from os import path
    from wordcloud import WordCloud

    d = path.dirname(__file__)

    # List of (term, frequency) tuples
    frequencies = []
    # Builds the tag-cloud input from the words and their frequencies
    for i in range(len(terms)):
        frequencies += [
            (terms[i], td_scores[i]),
        ]
    wordcloud = WordCloud().generate_from_frequencies(frequencies)

    import matplotlib.pyplot as plt
    plt.imshow(wordcloud)
    plt.axis('off')

    # Regenerates with a lower max_font_size and redraws, so the saved
    # figure actually uses the smaller font
    label = dbm.GetAccountLabel(seg)
    wordcloud = WordCloud(
        max_font_size=40).generate_from_frequencies(frequencies)
    plt.imshow(wordcloud)
    plt.title(label)

    plt.savefig('../analytics/%s/%s_tagcloud.png' % (label, label))
    plt.close()
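Newer wordcloud releases expect a dict rather than a list of tuples in generate_from_frequencies; a variant that builds the dict up front (same inputs, same hypothetical dbm helper):
def PrintTagCloudDict(seg, terms, td_scores):
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # dict(zip(...)) pairs each term with its score for the current API
    frequencies = dict(zip(terms, td_scores))
    wordcloud = WordCloud(max_font_size=40).generate_from_frequencies(frequencies)

    label = dbm.GetAccountLabel(seg)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.title(label)
    plt.savefig('../analytics/%s/%s_tagcloud.png' % (label, label))
    plt.close()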
Example #3
    def ComputeAverage():
        tableName = str(comboBox.get().lower())
        colName = str(comboBox2.get())

        if tableName != '' and colName != '':
            label.config(text=dB.average(tableName, colName))
        else:
            messagebox.showerror('Error', 'No value can be NULL!')
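Since empty strings are falsy in Python, the guard can also lean on truthiness directly (a minor variant of the same check):
        if tableName and colName:
            label.config(text=dB.average(tableName, colName))
        else:
            messagebox.showerror('Error', 'No value can be NULL!')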
Example #4
def analyze_from_to_piecewise(fromDate, toDate):
    saturdays = dfc.get_saturday_list(fromDate, toDate)
    base = 'data_by_date/dbscan_analysis_{}.csv'

    for saturday in saturdays:
        data = dfc.get_data_from_date(saturday)
        analyzedData = ag.analyze(data, 1300, 3)
        analyzedData.to_csv(base.format(saturday))

    allData = [pd.read_csv(base.format(s)) for s in saturdays]
    concatData = pd.concat(allData)
    concatData.sort_values(by='DATE', inplace=True)

    rides_over_time = create_rides_over_time_csv(concatData)
    dbm.connect_execute_rides_over_time(rides_over_time)
    concatData.to_csv(
        'data_by_date/000_total_dbscan_analysis_{}_to_{}.csv'.format(
            saturdays[0], saturdays[-1]))
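The loop writes each Saturday's analysis to disk and then re-reads every CSV; if the per-date files are only a by-product, the frames can be kept in memory as they are produced (a sketch over the same dfc, ag, and dbm helpers):
def analyze_from_to_inmemory(fromDate, toDate):
    saturdays = dfc.get_saturday_list(fromDate, toDate)
    frames = []
    for saturday in saturdays:
        analyzed = ag.analyze(dfc.get_data_from_date(saturday), 1300, 3)
        analyzed.to_csv('data_by_date/dbscan_analysis_{}.csv'.format(saturday))
        frames.append(analyzed)  # reuse instead of re-reading the CSV

    concatData = pd.concat(frames)
    concatData.sort_values(by='DATE', inplace=True)
    dbm.connect_execute_rides_over_time(create_rides_over_time_csv(concatData))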
Example #5
    def MessageBoxShow(selection):
        if selection == 'all':
            dB.delete('games')
            dB.delete('play')
            dB.delete('players')
            dB.delete('teams')
            return

        st = dB.delete(selection)
        if st != 'Success':
            messagebox.showerror(
                st,
                'The deletion of ' + selection + ' was NOT successful!\n' + st)
        else:
            messagebox.showinfo(
                st, 'The deletion of ' + selection + ' was successful!')
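The 'all' branch issues four separate delete calls; looping over a tuple of table names keeps them in one place (a sketch of the same branch):
        if selection == 'all':
            # Same four tables as above, deleted in one loop
            for table in ('games', 'play', 'players', 'teams'):
                dB.delete(table)
            return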
def RetriveContent(documents_id):
    docs_content = []
    for document_id in documents_id:
        try:
            tweet_preprocessed = dbm.GetTweetById(document_id[0])
            docs_content += [
                tweet_preprocessed.split(' '),
            ]
        except AttributeError:
            # GetTweetById returned None for this id; skip it
            continue

    return docs_content
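The same retrieval written as a comprehension, filtering out ids whose lookup returns None instead of catching the AttributeError that None.split would raise (a sketch against the same dbm helper):
def RetriveContentFiltered(documents_id):
    # GetTweetById may return None for missing ids; drop those first
    tweets = (dbm.GetTweetById(doc_id[0]) for doc_id in documents_id)
    return [t.split(' ') for t in tweets if t is not None]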
def PrintBubbleChart(seg, cluster_info):
    from pylab import *
    from scipy import *
    import math

    x = []
    y = []
    color = []
    area = []

    for key in cluster_info.keys():
        # Number of people (followers) inside the cluster
        x.append(cluster_info[key][1])

        # Number of tweets from this cluster found in the database
        y.append(cluster_info[key][0])

        # Area proportional to the number of people inside the cluster
        k = cluster_info[key][1]
        area.append(math.pi * (k)**2)

        # Color also tracks the number of people inside the cluster
        color.append(cluster_info[key][1])

        text(cluster_info[key][1],
             cluster_info[key][0],
             cluster_info[key][2],
             size=7,
             horizontalalignment='center')

    sct = scatter(x, y, c=color, s=area, linewidths=2, edgecolor='w')
    sct.set_alpha(1)

    label = dbm.GetAccountLabel(seg)

    xmax = max([v[1] for v in cluster_info.values()])
    ymax = max([v[0] for v in cluster_info.values()])

    axis([0, xmax + (1 / float(xmax)), 0, ymax + (1 / float(ymax))])

    title('%s' % label)
    xlabel('Numero de follower dentro do cluster', fontsize=16)

    # This is the sum of occurrences in the database from all
    # terms inside the cluster
    ylabel('''Numero de tweets dentro do cluster''', fontsize=16)

    savefig('../analytics/%s/%s_bubblechart.png' % (label, label))
    close()
def DrawDendrogram(arg, labels, seguradora):
    # Calculate full dendrogram
    label = dbm.GetAccountLabel(seguradora)

    plt.title('Dendrograma de %s' % label)
    plt.ylabel('Documentos', fontsize=16)
    plt.xlabel('Dissimilaridade', fontsize=16)
    dendrogram(
        arg,
        leaf_rotation=0.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
        labels=[' '.join(e) for e in labels],
        orientation='left')

    plt.savefig('../analytics/%s/%s_hierarchical_cluster.png' % (label, label))
    plt.close()
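For context, a minimal sketch of producing the linkage matrix that dendrogram (the arg parameter here) expects; the feature matrix X and the Ward method are assumptions, since the call site is not shown:
from scipy.cluster.hierarchy import linkage

# X: an (n_documents, n_features) array, e.g. tf-idf scores (assumed)
Z = linkage(X, method='ward')
DrawDendrogram(Z, labels, seguradora)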
def Draw2DClusters(arg, seguradora):
    # >arg< is the dissimilarity matrix
    mds = MDS(n_components=2,
              dissimilarity='precomputed',
              random_state=1)
    pos = mds.fit_transform(arg)

    # >clusters_colors< is expected to come from the enclosing scope
    plt.scatter(pos[:, 0],
                pos[:, 1],
                c=clusters_colors)

    plt.title('Documentos e seus clusters')

    label = dbm.GetAccountLabel(seguradora)
    plt.savefig('../analytics/%s/%s_partitional_cluster.png' % (label, label))
    plt.close()
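The snippet does not show how the precomputed dissimilarity matrix arg is built; a common construction (an assumption, not necessarily this project's) is cosine distance over tf-idf vectors:
from sklearn.metrics.pairwise import cosine_similarity

# tfidf: an (n_documents, n_features) matrix from a vectorizer (assumed)
dissimilarity = 1 - cosine_similarity(tfidf)
Draw2DClusters(dissimilarity, seguradora)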
def Silhouette(X, seguradora):
    insurance_label = dbm.GetAccountLabel(seguradora)
    maxx = len(X)

    if maxx > 11:
        maxx = 11

    range_of_clusters = list(range(2, maxx))
    clusters_silhouette = dict()

    for n_clusters in range_of_clusters:
        # Initialize the clusterer with the n_clusters value and a
        # fixed random_state seed for reproducibility
        clusterer = SKMeans(n_clusters=n_clusters, random_state=0)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all samples,
        # giving a perspective into the density and separation of the
        # formed clusters
        silhouette_avg = silhouette_score(X, cluster_labels)

        clusters_silhouette.update({n_clusters: silhouette_avg})

        # Compute the silhouette score for each sample
        # (not used further in this snippet)
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

    plt.title('Silhueta media de %s' % insurance_label)
    plt.xlabel('Numero de clusters', fontsize=16)
    plt.ylabel("Silhueta media", fontsize=16)
    plt.plot(clusters_silhouette.keys(), clusters_silhouette.values())
    plt.savefig("../analytics/%s/%s_silhuette.png" \
        % (insurance_label, insurance_label))
    plt.close()

    silhouettes = [v for v in clusters_silhouette.values()]

    for k, v in clusters_silhouette.iteritems():
        if max(silhouettes) == v:
            return k
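The closing loop scans for the key whose silhouette average equals the maximum; max with a key function returns the same k in one expression (a drop-in sketch for the last five lines):
    # k whose average silhouette is largest
    return max(clusters_silhouette, key=clusters_silhouette.get)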
def extraCluster(clusters, terms_clusterized, seg, collection, documents_id):
    # Creates structure in format:
    # {cluster_index: [frequency of terms in database, people in this cluster]}
    cluster_info = dict()

    documents_id = [_[0] for _ in documents_id]

    label = dbm.GetAccountLabel(seg)

    cluster_weightered = []
    for k, v in clusters.iteritems():
        cluster_weightered += [
            (k, len(v)),
        ]

    # Lists the 25 biggest clusters (the ones with the most tweets inside)
    for _ in sorted(cluster_weightered, key=lambda weight: weight[1],
                    reverse=True)[:25]:

        print '\tO cluster %d tem %d tweet(s)' % (_[0], _[1])
        print '\t\tEsses tweet(s) tem %d termo(s), que sao os seguintes:' \
            % (len(terms_clusterized[_[0]].keys()))

        cluster_info[_[0]] = [_[1]]

        intraCluster(terms_clusterized[_[0]], seg, collection, cluster_info,
                     _[0], documents_id)

        print 'cluster info ', cluster_info

    PrintBubbleChart(seg, cluster_info)
    try:
        import math
        PrintTagCloud(seg, [v[2] for v in cluster_info.values()],
                      [math.log10(v[0] * v[1]) for v in cluster_info.values()])
    except TypeError:
        print 'algum erro em ', cluster_info.values()
def main():
    user = None
    try:
        name = raw_input('Account name: ')
        search_account = search_user(name)
        print 'Results:'
        for i, v in enumerate(search_account):
            print '%d | %s' % (i + 1, v['name'])

        option = input('Choose an account (1-10):')
        user = search_account[option - 1]
        seguradora_to_file(user['id'])

        # Outputs to log file
        logging.info('Getting %s information' %
                     dbm.GetAccountLabel(user['id']))
        return user
    except twitter.api.TwitterHTTPError as e:
        # Substring checks are used because str.find returns -1 (truthy)
        # when the code is absent
        if '401' in str(e):
            # >user< is None if the error happened before an account
            # was chosen
            m = (json.dumps({'code': 401, 'message': 'Unauthorized',
                             'account': user['id'] if user else None}),
                 time.asctime(time.localtime(time.time())))
            print m

            # Outputs to log file
            logging.info(m)

        elif '429' in str(e):
            m = (json.dumps({'code': 429,
                             'message': 'Rate limit exceeded'}),
                 time.asctime(time.localtime(time.time())))
            print m

            # Outputs to log file
            logging.info(m)
        print 'Waiting API time'
        time.sleep(900)
        return user
def LoadFromDataFrame(seg):
    label = dbm.GetAccountLabel(seg)

    df = pd.read_csv('../analytics/%s/%s_sliced.csv' % (label, label))
    df = df.dropna(axis='columns', how='all')
    #df = df.drop('Unnamed: 0', axis = 1)

    # as_matrix() was removed in newer pandas; .values is equivalent
    M = df.values

    terms = [term for term in M.T[0]]
    #d0 =  [df['Unnamed: 0.1'][r] for r in df.index]

    #d1 = [[c] for c in df.columns[1:]]
    doc_ids = [[c] for c in df.columns[1:]]

    scores = [row for row in M.T[2:]]

    del M
    del df
    # The content returned is a list of three lists: the first is a list
    # of terms, the second is a document-id list, and the third is a score
    # list (where each score list corresponds to the scores of the terms
    # in one document)
    return [terms, doc_ids, scores]
def PrintVennDiagram(A, B, C):
    # Visualization of the Venn diagram of the 3 biggest insurance-companies
    from matplotlib_venn import venn3

    # Given an insurance-company id, fetch its followers from the database
    setA = dbm.GetFollowerBySeg(A)
    setB = dbm.GetFollowerBySeg(B)
    setC = dbm.GetFollowerBySeg(C)

    labelA = dbm.GetAccountLabel(A)
    labelB = dbm.GetAccountLabel(B)
    labelC = dbm.GetAccountLabel(C)

    set1 = set(setA)
    set2 = set(setB)
    set3 = set(setC)

    venn3([set1, set2, set3], (labelA, labelB, labelC))
    plt.title(
        'Diagrama de Venn das 3 maiores seguradoras em numero de follower')
    plt.savefig('../analytics/venn_diagram.png')
    plt.close()
Example #15
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                       re.VERBOSE | re.IGNORECASE)

# Applies regex to stop-words list
stop_words = [
    ClearTweet(str(word.encode(encoding='UTF-8', errors='strict')))
    for word in NL.corpus.stopwords.words('portuguese')
] + ['pra']

# Start
time_begin = time.asctime(time.localtime(time.time()))

# Collection with all insurance-companies and their followers
# >U< is for Universe, a dict whose keys are insurance-companies
# and whose values are the followers
U = dbm.GetUniverse()

# Creates log file
logging.basicConfig(filename='preprocessing_results.log',
                    level=logging.DEBUG)

for insurance_id in U.keys():
    # >root_derivations< is a dict whose keys are root tokens (stems) and
    # whose values are derivations of that stem, i.e. {r: [w1, w2, ..., wK]}
    # where r ∈ {stem(w) | w belongs to >basket_of_terms<}
    # and w ∈ >possible_roots<, e.g. {'work': ['worked', 'working']}
    R = dict()
    basket = []

    # Dict used to create the .csv file
    # Keys are tweet ids and values are the whole
def RetrieveFollowers(argLst):
    return list(set([dbm.GetFollowerByTweetId(_) for _ in argLst]))
Example #17
def PrintResults():
    result.config(state=tk.NORMAL)
    result.delete(1.0, tk.END)
    st = dB.retrieve(comboBox.get().lower())
    result.insert(tk.END, st)
    result.config(state=tk.DISABLED)
def intraCluster(cluster, seg, collection, cluster_info,
                 cluster_index, documents_id):

    tweets = []

    # Sorts the cluster's terms by number of occurrences, descending
    items_sorted = sorted([_ for _ in cluster.iteritems()],
                          key=lambda freq: freq[1],
                          reverse=True)

    tweetsInCluster = []

    # Runs through each cluster term, in descending order of frequency
    for item in items_sorted:
        try:
            # List with all tweets which contains the term
            tweets = [
                _ for _ in dbm.GetTweetIdByTerm(item[0], seg)
                if str(_) in documents_id
            ]

            # List with the derivations of stemmed term
            variations = dbm.GetDerivatives(item[0])

            # The term >item[0]< appears item[1] time(s) in this cluster
            print '''\t\t\t[%s] aparece %d vez(es) nesse cluster.
                \t\tEsse termo pode ser: %s''' % (item[0], item[1], variations)

            if seg in collection.keys():
                collection[seg] += [
                    item[0],
                ]
            else:
                collection[seg] = [
                    item[0],
                ]

        except Exception:
            # Skip this term so stale >tweets< are not re-added below
            continue

        tweetsInCluster += tweets

    followers = dbm.GetFollowerByAccount(list(set(tweetsInCluster)), seg)
    followers = set(followers) - set([seg])


    print '\t\tExistem %d seguidores de %s que usam os termos nesse cluster.' \
        % (len(followers), dbm.GetAccountLabel(seg))

    print '\t\tEsses seguidores sao os seguintes ', list(followers)

    get_der = dbm.GetDerivatives(items_sorted[0][0])

    possibilities = items_sorted[0][0] if get_der is None else get_der

    valid = True
    for v in cluster_info.values():
        if possibilities in v:
            valid = False

    if valid:
        cluster_info[cluster_index] += [len(followers), possibilities]
    else:
        cluster_info.pop(cluster_index)
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                       re.VERBOSE | re.IGNORECASE)

# Applies regex to stop-words list
stop_words = [
    ClearTweet(str(word.encode(encoding='UTF-8', errors='strict')))
    for word in NL.corpus.stopwords.words('portuguese')
] + ['pra']

# Start
time_begin = time.asctime(time.localtime(time.time()))

# Collection with all insurance-companies and their followers
# >U< is for Universe, a dict whose keys are insurance-companies
# and whose values are the followers
U = dbm.GetUniverse()

# Creates log file
logging.basicConfig(filename='preprocessing_results.log',
                    level=logging.DEBUG)

for insurance_id in U.keys():
    label = dbm.GetAccountLabel(insurance_id)

    if os.path.isfile('../analytics/%s/%s.csv' % (label, label)):
        print '%s already preprocessed' % label
        continue

    print "Getting followers feed from %s" % label

    # >followers_feed< is a dict data type where keys are followers
Example #20
def get_rides_over_time(data):
    rides_over_time = dfc.find_total_rides_per_day(data)
    dbm.connect_execute_rides_over_time(rides_over_time)
# Internal imports
import sys
sys.path.insert(0, '../lib')
import DatabaseMethods as dbm

# External imports
import json
import folium
import geocoder

seguradoras = dbm.GetAllSeguradoras()

for seg in seguradoras:
    label = dbm.GetAccountLabel(seg)

    print 'Account: %s' % label

    latitudes = []
    longitudes = []
    user_names = []

    # List of (user, location) tuples
    locations = dbm.GetLocations(seg)

    # Applies a regular expression to clean up the location
    for i, _ in enumerate(locations):
        print '%d of %d' % (i + 1, len(locations))
        try:
            g = geocoder.google(_[1])
def insertFollowerFromFile(id_seguradora, followers, followers_count,
                           followers_protected, followers_noTweets,
                           followers_in, control):

    print 'insertFollowerFromFile()'

    # Reads from the database all of this insurance company's
    # previously inserted followers
    already_in = [_[0] for _ in dbm.GetFollowerBySeg(id_seguradora)]

    # Removes from the followers to be inserted those that were
    # already inserted for this insurance company
    followers = [_ for _ in followers if _ not in already_in]

    # Index of the current follower in the followers list
    k = 0

    # If true, there are still followers left to insert;
    # if false, there are none
    follower_left = True

    current_user = dict()

    for item in followers:
        if item == -1:
            # >control< shows whether all the followers were correctly inserted
            control += 1

    if control >= len(followers):
        follower_left = False

    # Recursion stop criterion
    if follower_left:
        for follower in followers:
            # If >follower == -1<, then follower already inserted
            # Goes to >else<
            if follower != -1:
                if available(follower):
                    time.sleep(1)
                    try:
                        user_timeline = get_tweets_current_user(
                            follower, 200, 0)

                        if len(user_timeline) != 0:
                            current_user = get_user(follower)

                            insert(current_user, user_timeline, 0,
                                   id_seguradora)

                            followers_in = followers_in + 1

                            m = (json.dumps({'code': 200,
                                             'message': 'Success!',
                                             'account': follower}),
                                 time.asctime(time.localtime(time.time())))

                            print m
                            logging.info(m)

                        else:
                            followers_noTweets = followers_noTweets + 1
                            followers_count -= 1
                            control += 1
                            m = (json.dumps({'code': 304,
                                             'message': 'No tweets available',
                                             'account': follower}),
                                 time.asctime(time.localtime(time.time())))
                            print m
                            logging.info(m)

                        # If the follower was correctly inserted, its
                        # value at its position in the list is replaced by -1
                        followers[k] = -1
                    except Exception as e:
                        # Check if the returned content is NoneType,
                        # which means this follower is protected
                        if 'NoneType' in str(e):
                            followers_protected += 1
                            followers[k] = -1
                            followers_count -= 1
                            control += 1
                else:
                    # If the current follower of this insurance company
                    # had already been inserted, only the user-insurance
                    # relationship is stored and the rest of the
                    # follower's info is ignored
                    InsertInsuranceFollower(follower, id_seguradora)
                    followers_in = followers_in + 1
                    followers[k] = -1
            else:
                print k, ' jumped'
            k += 1
        # Recursive call: >followers< is passed back containing only
        # those followers who still need to be inserted
        insertFollowerFromFile(id_seguradora, followers,
                               followers_count, followers_protected,
                               followers_noTweets, followers_in, control)
    else:
        m = json.dumps({'code': 200,
                        'message': 'Finished!',
                        'users': {
                            'available': followers_count,
                            'inserted': followers_in,
                            'no-tweet': followers_noTweets,
                            'protected': followers_protected}})

        # Outputs to log file
        logging.info(m)

        raise Exception(m)
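Each retry round re-invokes insertFollowerFromFile, so a long tail of failing followers can approach CPython's default recursion limit (about 1000 frames). A while-loop driver avoids that; insert_one_pass below is a hypothetical helper standing in for one pass of the body above:
def insertFollowersIteratively(id_seguradora, followers):
    # Loops until every entry is marked -1 (handled), instead of
    # recursing once per pass
    while any(f != -1 for f in followers):
        insert_one_pass(id_seguradora, followers)  # hypothetical one-pass helper
    logging.info('Finished inserting followers for %s' % id_seguradora)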
def PlotDataFromFile(insurance_id):

    import matplotlib.pyplot as plt

    insurance_label = dbm.GetAccountLabel(insurance_id)

    # Reads the dataframe
    df = pd.read_csv('%s/%s.csv' % (insurance_label, insurance_label))

    # Drops all-NaN columns
    df = df.dropna(axis='columns', how='all')

    # Calculates the occurrence frequency of each term across the documents
    terms = []
    terms_freq = []
    for row in df.index:
        occurrences = 0

        for column in df.columns[1:]:
            occurrences += df[column][row]

        terms.append(df['Unnamed: 0'][row])
        terms_freq.append(occurrences)

    # Pairs each term with its frequency
    zipped = zip(terms, terms_freq)

    # Sorts the (term, frequency) pairs by frequency, descending
    zipped_sorted = sorted(zipped, key=lambda t: t[1], reverse=True)

    # Data array
    data = np.array([_[1] for _ in zipped_sorted])
    data_labels = [_[0] for _ in zipped_sorted]

    # Finds the quartiles and median
    q1, median, q3 = np.percentile(data, [25, 50, 75])

    print '1st Quartile', q1
    print 'Median ', median
    print '3rd Quartile', q3

    # Draws the bar chart: occurrences per term
    plt.figure(1)
    plt.bar(np.arange(len(data_labels)), data, align='center', alpha=0.5)

    plt.xlabel('Terms')
    plt.ylabel('Occurences')
    plt.title('Occurrence of terms in documents of %s' % insurance_label)
    plt.savefig('%s/%s_ocorrenceTerms.png'
                % (insurance_label, insurance_label))
    plt.close()

    # Draws the boxplot
    plt.figure(2)
    plt.title('Boxplot of %s' % insurance_label)
    bp = plt.boxplot(data)
    plt.savefig('%s/%s_boxplot.png'
                % (insurance_label, insurance_label))
    plt.close()

    # Draws the violin
    plt.figure(3)
    plt.title('Density and occurrence of terms in docs from %s' %
              insurance_label)
    plt.xlabel('Density')
    plt.ylabel('Occurrences')
    plt.violinplot(data, showmeans=False, showmedians=True)
    plt.savefig('%s/%s_violinplot.png' % (insurance_label, insurance_label))

    CutFile(q3, zipped, df, insurance_label)

    plt.close()
    del df
    for c in df.columns:
        if c != 'Unnamed: 0':
            acc = 0.
            for r in df.index:
                acc += df[c][r]
            if acc == 0.:
                df = df.drop(c, axis=1)
                print c, ' removed'

    df.to_csv('%s/%s_sliced.csv' % (label, label))

    #insert in database here


general_insurances = dbm.GetAllSeguradoras()

for insurance_id in general_insurances:
    if insurance_id not in [1202130601]:
        try:
            print 'Sliced %s started' % dbm.GetAccountLabel(insurance_id)
            PlotDataFromFile(insurance_id)
            print "\t\tSlice done"
        except IOError:
            continue
        except KeyError as e:
            with open("message.err", "w") as arq:
                arq.write("\n[KeyError slicing the file]\n")
            print "\t\tError reducing dimensionality"
            print e
    else:
Example #25
            continue

    del M
    
    column_names = [df.columns[index] for index in column_to_remove]
    for i, name in enumerate(column_names):
        try:
            df = df.drop(str(name), axis=1)
            print name, ' deleted @ ', i
        except IndexError:
            print 'erro em ', name
    
    df.to_csv('%s/%s_sliced.csv' % (label, label))  
    del df

general_insurances = dbm.GetAllSeguradoras()

for insurance_id in general_insurances:
    try:
        label = dbm.GetAccountLabel(insurance_id)
        if os.path.isfile('%s/%s_sliced.csv' % (label, label)):
            print '%s already sliced' % label
            continue
        else:
            print 'Sliced %s started' % label
            PlotDataFromFile(insurance_id)
            print "Slice done"
    except IOError:
        continue
    except KeyError as e:
        with open("message.err", "w") as arq:
Example #26
def PlotDataFromFile(insurance_id):
    
    import matplotlib.pyplot as plt
    #
    import matplotlib
    matplotlib.rcParams.update({'font.size': 24})
    #
    insurance_label = dbm.GetAccountLabel(insurance_id)
    
    # Reads the dataframe
    df = pd.read_csv('%s/%s.csv' % (insurance_label, insurance_label))
    
    # Drops all-NaN columns
    df = df.dropna(axis='columns', how='all')

    # Calculates the occurrence frequency of each term across the documents
    # (as_matrix() was removed in newer pandas; .values is equivalent)
    M = df.values
    
    terms = []
    terms_freq = []
    
    for row in M:
        terms.append(row[0])
        terms_freq.append(sum(list(row[1:])))
        
    del M
    
    # Pairs each term with its frequency
    zipped = zip(terms, terms_freq)

    # Sorts the (term, frequency) pairs by frequency, descending
    zipped_sorted = sorted(zipped, key=lambda t: t[1],
                           reverse=True)
    
    # Data array
    data = np.array([_[1] for _ in zipped_sorted])
    data_labels = [_[0] for _ in zipped_sorted]
    
    # Finds the quartiles and median
    q1, median, q3 = np.percentile(data, [25, 50, 75])
    
    print '1st Quartile', q1
    print 'Median ',  median
    print '3rd Quartile', q3
    
    # Draws the bar chart: occurrences per term
    plt.figure(1)
    plt.bar(np.arange(len(data_labels)),
            data,
            align='center',
            alpha=0.5)
    
    plt.xlabel('Termos', fontsize=16)
    plt.ylabel('Ocorrencias', fontsize=16)
    plt.title('Documentos de %s' % insurance_label)
    plt.savefig('%s/%s_ocorrenceTerms.png' % (insurance_label, insurance_label))
    plt.close()
    
    # Draws the boxplot; the y-axis label is set before savefig so it
    # appears in the saved figure
    plt.figure(2)
    plt.title('Boxplot de %s'
              % insurance_label)
    plt.ylabel('Ocorrencias', fontsize=16)
    bp = plt.boxplot(data)
    plt.savefig('%s/%s_boxplot.png'
                % (insurance_label, insurance_label))
    plt.close()
    
    # Draws the violin
    plt.figure(3)
    plt.title('Termos de %s'
              % insurance_label)
    plt.xlabel('Densidade', fontsize=16)
    plt.ylabel('Ocorrencias', fontsize=16)
    plt.violinplot(data,
                   showmeans=False,
                   showmedians=True)
    plt.savefig('%s/%s_violinplot.png'
                % (insurance_label, insurance_label))
    
    CutFile(q3, zipped, df, insurance_label)
    
    plt.close()
    del df

# Creates log file
logging.basicConfig(filename='posprocessing_outputs.log',
                    level=logging.DEBUG)

seguradoras = dbm.GetAllSeguradoras()
collection = dict()

followers_count = [(i, len(dbm.GetFollowerBySeg(_)))
                   for i, _ in enumerate(seguradoras)]
foo = sorted(followers_count, key=lambda x: x[1], reverse=True)

try:
    PrintVennDiagram(seguradoras[foo[0][0]], seguradoras[foo[1][0]],
                     seguradoras[foo[2][0]])
except IndexError:
    print 'Contas insuficientes para gerar diagrama de Venn'

for seguradora in seguradoras:
    try:
        label = dbm.GetAccountLabel(seguradora)