def SubmitFile(selection):
    fName = fileName
    if fName == '':
        messagebox.showerror('ERROR', 'No file loaded!')
        return
    if selection == '1':
        st = dB.LoadDataInsert(fName)
        if st == 'Success':
            messagebox.showinfo('Success', 'You successfully imported '
                                + os.path.basename(fName))
        else:
            messagebox.showerror(st, os.path.basename(fName)
                                 + ' was NOT successfully imported!')
    elif selection == '2':
        st = dB.SingleInsert(fName)
        if st == 'Success':
            messagebox.showinfo('Success', 'You successfully imported '
                                + os.path.basename(fName))
        else:
            messagebox.showerror(st, os.path.basename(fName)
                                 + ' was NOT successfully imported!')
    elif selection == '3':
        st = dB.MultiRowInsert(fName)
        if st == 'Success':
            messagebox.showinfo('Success', 'You successfully imported '
                                + os.path.basename(fName))
        else:
            messagebox.showerror(st, os.path.basename(fName)
                                 + ' was NOT successfully imported!')
    else:
        messagebox.showerror('Error', 'No method chosen!')
    return
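# The three branches above differ only in which dB insert method they call.
# A dispatch-table sketch of the same logic (uses only the dB methods and
# the >fileName< global shown above; offered as an alternative, not part of
# the original code):
INSERT_METHODS = {
    '1': dB.LoadDataInsert,
    '2': dB.SingleInsert,
    '3': dB.MultiRowInsert,
}

def SubmitFileDispatch(selection):
    fName = fileName
    if fName == '':
        messagebox.showerror('ERROR', 'No file loaded!')
        return
    method = INSERT_METHODS.get(selection)
    if method is None:
        messagebox.showerror('Error', 'No method chosen!')
        return
    st = method(fName)
    if st == 'Success':
        messagebox.showinfo('Success', 'You successfully imported '
                            + os.path.basename(fName))
    else:
        messagebox.showerror(st, os.path.basename(fName)
                             + ' was NOT successfully imported!')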
def PrintTagCloud(seg, terms, td_scores):
    from os import path
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    d = path.dirname(__file__)
    # Tuples of the form (term, frequency)
    frequencies = []
    # Builds the tag cloud from the list of (word, frequency) tuples
    for i in range(len(terms)):
        frequencies += [(terms[i], td_scores[i])]
    # Generate once with the lower max_font_size, so the cloud that is
    # rendered is also the one that gets saved
    wordcloud = WordCloud(max_font_size=40).generate_from_frequencies(
        frequencies)
    plt.imshow(wordcloud)
    plt.axis('off')
    label = dbm.GetAccountLabel(seg)
    plt.title(label)
    plt.savefig('../analytics/%s/%s_tagcloud.png' % (label, label))
    plt.close()
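# Note: newer wordcloud releases (>= 1.3) expect a dict rather than a list
# of tuples in generate_from_frequencies(). A compatibility sketch under
# that assumption, using the same terms/td_scores inputs:
#     freq_map = dict(zip(terms, td_scores))
#     wordcloud = WordCloud(max_font_size=40).generate_from_frequencies(freq_map)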
def ComputeAverage():
    tableName = str(comboBox.get().lower())
    colName = str(comboBox2.get())
    if tableName != '' and colName != '':
        label.config(text=dB.average(tableName, colName))
    else:
        messagebox.showerror('Error', 'No value can be NULL!')
def analyze_from_to_piecewise(fromDate, toDate):
    saturdays = dfc.get_saturday_list(fromDate, toDate)
    base = 'data_by_date/dbscan_analysis_{}.csv'
    for saturday in saturdays:
        data = dfc.get_data_from_date(saturday)
        analyzedData = ag.analyze(data, 1300, 3)
        analyzedData.to_csv(base.format(saturday))
    allData = [pd.read_csv(base.format(s)) for s in saturdays]
    concatData = pd.concat(allData)
    concatData.sort_values(by='DATE', inplace=True)
    rides_over_time = create_rides_over_time_csv(concatData)
    dbm.connect_execute_rides_over_time(rides_over_time)
    concatData.to_csv(
        'data_by_date/000_total_dbscan_analysis_{}_to_{}.csv'.format(
            saturdays[0], saturdays[-1]))
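# Hypothetical usage sketch (the date format is an assumption; it must
# match whatever dfc.get_saturday_list accepts elsewhere in this pipeline):
#     analyze_from_to_piecewise('2018-01-06', '2018-03-31')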
def MessageBoxShow(selection):
    if selection == 'all':
        dB.delete('games')
        dB.delete('play')
        dB.delete('players')
        dB.delete('teams')
        return
    st = dB.delete(selection)
    if st != 'Success':
        messagebox.showerror(
            st, 'The deletion of ' + selection + ' was NOT successful!\n' + st)
    else:
        messagebox.showinfo(
            st, 'The deletion of ' + selection + ' was successful!')
def RetriveContent(documents_id):
    docs_content = []
    for document_id in documents_id:
        try:
            tweet_preprocessed = dbm.GetTweetById(document_id[0])
            docs_content += [tweet_preprocessed.split(' ')]
        except AttributeError:
            continue
    return docs_content
def PrintBubbleChart(seg, cluster_info):
    from pylab import *
    from scipy import *
    import math
    x = []
    y = []
    color = []
    area = []
    for key in cluster_info.keys():
        # Number of people inside the cluster
        x.append(cluster_info[key][1])
        # Number of times the cluster's tweets occur in the database
        y.append(cluster_info[key][0])
        # Area proportional to the number of people inside the cluster
        k = cluster_info[key][1]
        area.append(math.pi * (k)**2)
        # Color also corresponds to the number of people inside the cluster
        color.append(cluster_info[key][1])
        text(cluster_info[key][1], cluster_info[key][0], cluster_info[key][2],
             size=7, horizontalalignment='center')
    sct = scatter(x, y, c=color, s=area, linewidths=2, edgecolor='w')
    sct.set_alpha(1)
    label = dbm.GetAccountLabel(seg)
    xmax = max([v[1] for v in cluster_info.values()])
    ymax = max([v[0] for v in cluster_info.values()])
    axis([0, xmax + (1 / float(xmax)), 0, ymax + (1 / float(ymax))])
    title('%s' % label)
    xlabel('Numero de follower dentro do cluster', fontsize=16)
    # The y axis is the sum of occurrences in the database over all
    # terms inside the cluster
    ylabel('Numero de tweets dentro do cluster', fontsize=16)
    savefig('../analytics/%s/%s_bubblechart.png' % (label, label))
    close()
def DrawDendrogram(arg, labels, seguradora):
    # Calculate full dendrogram
    label = dbm.GetAccountLabel(seguradora)
    plt.title('Dendrograma de %s' % label)
    plt.ylabel('Documentos', fontsize=16)
    plt.xlabel('Dissimilaridade', fontsize=16)
    dendrogram(
        arg,
        leaf_rotation=0.,   # rotates the leaf labels
        leaf_font_size=8.,  # font size for the leaf labels
        labels=[' '.join(e) for e in labels],
        orientation='left')
    plt.savefig('../analytics/%s/%s_hierarchical_cluster.png' % (label, label))
    plt.close()
def Draw2DClusters(arg, seguradora):
    # >arg< is the dissimilarity matrix
    mds = MDS(n_components=2,
              dissimilarity='precomputed',
              random_state=1)
    pos = mds.fit_transform(arg)
    plt.scatter(pos[:, 0], pos[:, 1], c=clusters_colors)
    plt.title('Documentos e seus clusters')
    label = dbm.GetAccountLabel(seguradora)
    plt.savefig('../analytics/%s/%s_partitional_cluster.png' % (label, label))
    plt.close()
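# With dissimilarity='precomputed', >arg< must be a square, symmetric
# dissimilarity matrix. A common way to build one from a tf-idf matrix
# (a sketch under that assumption; this code's actual matrix construction
# lives elsewhere):
#     from sklearn.metrics.pairwise import cosine_similarity
#     dissimilarity = 1 - cosine_similarity(tfidf_matrix)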
def Silhouette(X, seguradora):
    insurance_label = dbm.GetAccountLabel(seguradora)
    maxx = len(X)
    if maxx > 11:
        maxx = 11
    range_of_clusters = list(range(2, maxx))
    clusters_silhouette = dict()
    for n_clusters in range_of_clusters:
        # Initialize the clusterer with the n_clusters value and a random
        # generator seed of 0 for reproducibility
        clusterer = SKMeans(n_clusters=n_clusters, random_state=0)
        cluster_labels = clusterer.fit_predict(X)
        # silhouette_score gives the average value over all samples,
        # a perspective on the density and separation of the formed clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        clusters_silhouette.update({n_clusters: silhouette_avg})
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)
    plt.title('Silhueta media de %s' % insurance_label)
    plt.xlabel('Numero de clusters', fontsize=16)
    plt.ylabel('Silhueta media', fontsize=16)
    # Plot in ascending cluster-count order (a plain dict does not
    # guarantee iteration order here)
    ks = sorted(clusters_silhouette.keys())
    plt.plot(ks, [clusters_silhouette[k] for k in ks])
    plt.savefig('../analytics/%s/%s_silhuette.png'
                % (insurance_label, insurance_label))
    plt.close()
    # Returns the number of clusters with the highest average silhouette
    silhouettes = [v for v in clusters_silhouette.values()]
    for k, v in clusters_silhouette.iteritems():
        if max(silhouettes) == v:
            return k
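# The final loop returns the k whose silhouette equals the maximum; an
# equivalent, more direct sketch of that selection:
#     return max(clusters_silhouette, key=clusters_silhouette.get)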
def extraCluster(clusters, terms_clusterized, seg, collection, documents_id):
    # Creates structure in format:
    # {cluster_index: [frequency of terms in database, people in this cluster]}
    cluster_info = dict()
    documents_id = [_[0] for _ in documents_id]
    label = dbm.GetAccountLabel(seg)
    cluster_weightered = []
    for k, v in clusters.iteritems():
        cluster_weightered += [(k, len(v))]
    # Lists the 25 biggest clusters (those with the most tweets inside)
    for _ in sorted(cluster_weightered, key=lambda weight: weight[1],
                    reverse=True)[:25]:
        print '\tO cluster %d tem %d tweet(s)' % (_[0], _[1])
        print '\t\tEsses tweet(s) tem %d termo(s), que sao os seguintes:' \
            % (len(terms_clusterized[_[0]].keys()))
        cluster_info.update({_[0]: [_[1]]})
        intraCluster(terms_clusterized[_[0]], seg, collection, cluster_info,
                     _[0], documents_id)
    print 'cluster info ', cluster_info
    PrintBubbleChart(seg, cluster_info)
    try:
        import math
        PrintTagCloud(seg, [v[2] for v in cluster_info.values()],
                      [math.log10(v[0] * v[1]) for v in cluster_info.values()])
    except TypeError:
        print 'algum erro em ', cluster_info.values()
def main():
    try:
        name = raw_input('Account name: ')
        search_account = search_user(name)
        print 'Results:'
        for i, v in enumerate(search_account):
            print '%d | %s' % (i + 1, v['name'])
        option = int(raw_input('Choose an account (1-10): '))
        user = search_account[option - 1]
        seguradora_to_file(user['id'])
        # Outputs to log file
        logging.info('Getting %s information' % dbm.GetAccountLabel(user['id']))
        return user
    except twitter.api.TwitterHTTPError as e:
        # str.find() returns -1 when the substring is absent, so the result
        # must be tested explicitly
        if str(e).find('401') != -1:
            m = json.dumps({'code': 401,
                            'message': 'Unauthorized',
                            'account': user['id']}), \
                time.asctime(time.localtime(time.time()))
            print m
            # Outputs to log file
            logging.info(m)
        elif str(e).find('429') != -1:
            m = json.dumps({'code': 429,
                            'message': 'Rate limit exceeded'}), \
                time.asctime(time.localtime(time.time()))
            print m
            # Outputs to log file
            logging.info(m)
            print 'Waiting API time'
            time.sleep(900)
        return user
def LoadFromDataFrame(seg):
    label = dbm.GetAccountLabel(seg)
    df = pd.read_csv('../analytics/%s/%s_sliced.csv' % (label, label))
    df = df.dropna(axis='columns', how='all')
    #df = df.drop('Unnamed: 0', axis = 1)
    M = pd.DataFrame.as_matrix(df)
    terms = [term for term in M.T[0]]
    #d0 = [df['Unnamed: 0.1'][r] for r in df.index]
    #d1 = [[c] for c in df.columns[1:]]
    doc_ids = [[c] for c in df.columns[1:]]
    scores = [row for row in M.T[2:]]
    del M
    del df
    # The content returned is a list of three lists: the first is a list of
    # terms, the second is a list of document ids, and the third is a list of
    # score lists (each score list holds the scores of the terms in one
    # document)
    return [terms, doc_ids, scores]
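# Usage sketch: the three parallel lists can be unpacked directly
# (assumes >seg< is an account id whose sliced .csv already exists):
#     terms, doc_ids, scores = LoadFromDataFrame(seg)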
def PrintVennDiagram(A, B, C):
    # Visualization of the Venn diagram of the 3 biggest insurance companies
    from matplotlib_venn import venn3
    # Given an insurance-company id, fetches its followers from the database
    setA = dbm.GetFollowerBySeg(A)
    setB = dbm.GetFollowerBySeg(B)
    setC = dbm.GetFollowerBySeg(C)
    labelA = dbm.GetAccountLabel(A)
    labelB = dbm.GetAccountLabel(B)
    labelC = dbm.GetAccountLabel(C)
    set1 = set(setA)
    set2 = set(setB)
    set3 = set(setC)
    venn3([set1, set2, set3], (labelA, labelB, labelC))
    plt.title(
        'Diagrama de Venn das 3 maiores seguradoras em numero de follower')
    plt.savefig('../analytics/venn_diagram.png')
    plt.close()
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                       re.VERBOSE | re.IGNORECASE)
# Applies regex to the stop-words list
stop_words = [
    ClearTweet(str(word.encode(encoding='UTF-8', errors='strict')))
    for word in NL.corpus.stopwords.words('portuguese')
] + ['pra']
# Start
time_begin = time.asctime(time.localtime(time.time()))
# Collection with all insurance companies and their followers
# >U< is for Universe, a dict whose keys are insurance companies
# and whose values are the followers
U = dbm.GetUniverse()
# Creates log file
logging.basicConfig(filename='preprocessing_results.log',
                    level=logging.DEBUG)
for insurance_id in U.keys():
    # >root_derivations< is a dict whose keys are root tokens (stems) and
    # whose values are derivations of that stem, i.e. {r: [w1, w2, ..., wK]}
    # where r ε {stem(w) | w belongs to >basket_of_terms<}
    # and w ε >possible_roots<, e.g. {'work': ['worked', 'working']}
    R = dict()
    basket = []
    # Dict used to create the .csv file
    # Keys are tweet ids and values are the whole
def RetrieveFollowers(argLst):
    return list(set([dbm.GetFollowerByTweetId(_) for _ in argLst]))
def PrintResults():
    result.config(state=tk.NORMAL)
    result.delete(1.0, tk.END)
    st = dB.retrieve(comboBox.get().lower())
    result.insert(tk.END, st)
    result.config(state=tk.DISABLED)
def intraCluster(cluster, seg, collection, cluster_info,
                 cluster_index, documents_id):
    tweets = []
    # Sorts terms by number of occurrences inside the cluster
    items_sorted = sorted([_ for _ in cluster.iteritems()],
                          key=lambda freq: freq[1],
                          reverse=True)
    tweetsInCluster = []
    # Runs through each cluster term, in descending order of frequency
    for item in items_sorted:
        try:
            # List with all tweets that contain the term
            tweets = [
                _ for _ in dbm.GetTweetIdByTerm(item[0], seg)
                if str(_) in documents_id
            ]
            # List with the derivations of the stemmed term
            variations = dbm.GetDerivatives(item[0])
            # The term appears item[1] times in this cluster
            print '''\t\t\t[%s] aparece %d vez(es) nesse cluster.
\t\tEsse termo pode ser: %s''' % (item[0], item[1], variations)
            if seg in collection.keys():
                collection[seg] += [item[0]]
            else:
                collection[seg] = [item[0]]
        except Exception:
            # Skips the accumulation below on failure
            continue
        tweetsInCluster += tweets
    followers = dbm.GetFollowerByAccount(list(set(tweetsInCluster)), seg)
    followers = set(followers) - set([seg])
    print '''\t\tExistem %d seguidores de %s que usam os termos nesse cluster.''' \
        % (len(followers), dbm.GetAccountLabel(seg))
    print '\t\tEsses seguidores sao os seguintes ', list(followers)
    get_der = dbm.GetDerivatives(items_sorted[0][0])
    possibilities = items_sorted[0][0] if get_der is None else get_der
    valid = True
    for v in cluster_info.values():
        if possibilities in v:
            valid = False
    if valid:
        cluster_info[cluster_index] += [len(followers), possibilities]
    else:
        cluster_info.pop(cluster_index)
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                       re.VERBOSE | re.IGNORECASE)
# Applies regex to the stop-words list
stop_words = [
    ClearTweet(str(word.encode(encoding='UTF-8', errors='strict')))
    for word in NL.corpus.stopwords.words('portuguese')
] + ['pra']
# Start
time_begin = time.asctime(time.localtime(time.time()))
# Collection with all insurance companies and their followers
# >U< is for Universe, a dict whose keys are insurance companies
# and whose values are the followers
U = dbm.GetUniverse()
# Creates log file
logging.basicConfig(filename='preprocessing_results.log',
                    level=logging.DEBUG)
for insurance_id in U.keys():
    label = dbm.GetAccountLabel(insurance_id)
    if os.path.isfile('../analytics/%s/%s.csv' % (label, label)):
        print '%s already preprocessed' % label
        continue
    print "Getting followers feed from %s" % label
    # >followers_feed< is a dict data type where keys are followers
def get_rides_over_time(data):
    rides_over_time = dfc.find_total_rides_per_day(data)
    dbm.connect_execute_rides_over_time(rides_over_time)
# Internal imports
import sys
sys.path.insert(0, '../lib')
import DatabaseMethods as dbm
# External imports
import json
import folium
import geocoder

seguradoras = dbm.GetAllSeguradoras()
for seg in seguradoras:
    label = dbm.GetAccountLabel(seg)
    print 'Account: %s' % label
    latitudes = []
    longitudes = []
    user_names = []
    # List of (user, location) tuples
    locations = dbm.GetLocations(seg)
    # Applies a regular expression to clean up each location
    for i, _ in enumerate(locations):
        print '%d of %d' % (i + 1, len(locations))
        try:
            g = geocoder.google(_[1])
def insertFollowerFromFile(id_seguradora, followers, followers_count,
                           followers_protected, followers_noTweets,
                           followers_in, control):
    print 'insertFollowerFromFile()'
    # Reads from the database all of this insurance company's followers
    # that were previously inserted
    already_in = [_[0] for _ in dbm.GetFollowerBySeg(id_seguradora)]
    # Removes from the followers to be inserted those who belong to this
    # insurance company and were already inserted
    followers = [_ for _ in followers if _ not in already_in]
    # Marks the position of the current follower
    # (the index in the list of followers)
    k = 0
    # If True, there are still followers left to insert;
    # if False, there are none
    follower_left = True
    current_user = dict()
    for item in followers:
        if item == -1:
            # >control< shows whether all the followers were correctly inserted
            control += 1
    if control >= len(followers):
        follower_left = False
    # Recursion stop criterion
    if follower_left:
        for follower in followers:
            # If >follower == -1<, the follower is already inserted
            # and we fall through to >else<
            if follower != -1:
                if available(follower):
                    time.sleep(1)
                    try:
                        user_timeline = \
                            get_tweets_current_user(follower, 200, 0)
                        if len(user_timeline) != 0:
                            current_user = get_user(follower)
                            insert(current_user, user_timeline, 0,
                                   id_seguradora)
                            followers_in = followers_in + 1
                            m = json.dumps({'code': 200,
                                            'message': 'Success!',
                                            'account': follower}), \
                                time.asctime(time.localtime(time.time()))
                            print m
                            logging.info(m)
                        else:
                            followers_noTweets = followers_noTweets + 1
                            followers_count -= 1
                            control += 1
                            m = json.dumps({'code': 304,
                                            'message': 'No tweets available',
                                            'account': follower}), \
                                time.asctime(time.localtime(time.time()))
                            print m
                            logging.info(m)
                        # If the follower was correctly inserted, the value
                        # at its position in the list is replaced by -1
                        followers[k] = -1
                    except Exception as e:
                        # Check if the returned content is NoneType,
                        # which means this follower is protected
                        if str(e).find('NoneType') != -1:
                            followers_protected += 1
                            followers[k] = -1
                            followers_count -= 1
                            control += 1
                else:
                    # If the current follower of the current insurance company
                    # had already been inserted, only the user-insurance
                    # relationship is inserted and the follower info is ignored
                    InsertInsuranceFollower(follower, id_seguradora)
                    followers_in = followers_in + 1
                    followers[k] = -1
            else:
                print k, ' jumped'
            k += 1
        # Calls this function recursively, so that >followers< is passed on
        # with only those followers who still need to be inserted
        insertFollowerFromFile(id_seguradora, followers,
                               followers_count, followers_protected,
                               followers_noTweets, followers_in, control)
    else:
        m = json.dumps({'code': 200,
                        'message': 'Finished!',
                        'users': {
                            'available': followers_count,
                            'inserted': followers_in,
                            'no-tweet': followers_noTweets,
                            'protected': followers_protected}})
        # Outputs to log file
        logging.info(m)
        raise Exception(m)
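# The function above retries failed inserts by calling itself recursively,
# which can hit Python's recursion limit on long follower lists. A minimal
# iterative sketch of the same retry pattern (hypothetical; assumes the same
# helpers used above: available(), get_tweets_current_user(), get_user(),
# insert() and InsertInsuranceFollower()):
def insert_followers_iteratively(id_seguradora, followers):
    pending = list(followers)
    while pending:
        still_pending = []
        for follower in pending:
            try:
                if available(follower):
                    timeline = get_tweets_current_user(follower, 200, 0)
                    if len(timeline) != 0:
                        insert(get_user(follower), timeline, 0, id_seguradora)
                else:
                    InsertInsuranceFollower(follower, id_seguradora)
            except Exception:
                # Keep the follower for the next pass
                still_pending.append(follower)
        if len(still_pending) == len(pending):
            break  # no progress was made; avoid looping forever
        pending = still_pending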
def PlotDataFromFile(insurance_id):
    import matplotlib.pyplot as plt
    insurance_label = dbm.GetAccountLabel(insurance_id)
    # Reads the dataframe
    df = pd.read_csv('%s/%s.csv' % (insurance_label, insurance_label))
    # Drops all-NaN columns
    df = df.dropna(axis='columns', how='all')
    # Calculates the occurrence frequency of each term in the documents
    terms = []
    terms_freq = []
    for row in df.index:
        occurences = 0
        for column in df.columns[1:]:
            occurences += df[column][row]
        terms.append(df['Unnamed: 0'][row])
        terms_freq.append(occurences)
    # Pairs each term with its frequency
    zipped = zip(terms, terms_freq)
    # Sorts terms by frequency in descending order
    zipped_sorted = sorted(zipped, key=lambda t: t[1], reverse=True)
    # Data array
    data = np.array([_[1] for _ in zipped_sorted])
    data_labels = [_[0] for _ in zipped_sorted]
    # Finds the quartiles and the median
    q1, median, q3 = np.percentile(data, [25, 50, 75])
    print '1st Quartile', q1
    print 'Median ', median
    print '3rd Quartile', q3
    # Draws the bar chart: occurrences per term
    plt.figure(1)
    plt.bar(np.arange(len(data_labels)), data, align='center', alpha=0.5)
    plt.xlabel('Terms')
    plt.ylabel('Occurrences')
    plt.title('Occurrence of terms in documents of %s' % insurance_label)
    plt.savefig('%s/%s_ocorrenceTerms.png'
                % (insurance_label, insurance_label))
    plt.close()
    # Draws the boxplot
    plt.figure(2)
    plt.title('Boxplot of %s' % insurance_label)
    bp = plt.boxplot(data)
    plt.savefig('%s/%s_boxplot.png'
                % (insurance_label, insurance_label))
    plt.close()
    # Draws the violin plot
    plt.figure(3)
    plt.title('Density and occurrence of terms in docs from %s'
              % insurance_label)
    plt.xlabel('Density')
    plt.ylabel('Occurrences')
    plt.violinplot(data, showmeans=False, showmedians=True)
    plt.savefig('%s/%s_violinplot.png' % (insurance_label, insurance_label))
    CutFile(q3, zipped, df, insurance_label)
    plt.close()
    del df
for c in df.columns:
    if c != 'Unnamed: 0':
        acc = 0.
        for r in df.index:
            acc += df[c][r]
        if acc == 0.:
            df = df.drop(c, axis=1)
            print c, ' removed'
df.to_csv('%s/%s_sliced.csv' % (label, label))
# insert in database here

general_insurances = dbm.GetAllSeguradoras()
for insurance_id in general_insurances:
    if insurance_id not in [1202130601]:
        try:
            print 'Sliced %s started' % dbm.GetAccountLabel(insurance_id)
            PlotDataFromFile(insurance_id)
            print "\t\tSlice done"
        except IOError:
            continue
        except KeyError as e:
            with open("message.err", "w") as arq:
                arq.write("\n[KeyError slicing the file]\n")
            print "\t\tError reducing dimensionality"
            print e
    else:
            continue
    del M
    column_names = [df.columns[index] for index in column_to_remove]
    for i, name in enumerate(column_names):
        try:
            df = df.drop(str(name), axis=1)
            print name, ' deleted @ ', i
        except IndexError:
            print 'erro em ', name
    df.to_csv('%s/%s_sliced.csv' % (label, label))
    del df

general_insurances = dbm.GetAllSeguradoras()
for insurance_id in general_insurances:
    try:
        label = dbm.GetAccountLabel(insurance_id)
        if os.path.isfile('%s/%s_sliced.csv' % (label, label)):
            print '%s already sliced' % label
            continue
        else:
            print 'Sliced %s started' % label
            PlotDataFromFile(insurance_id)
            print "Slice done"
    except IOError:
        continue
    except KeyError as e:
        with open("message.err", "w") as arq:
def PlotDataFromFile(insurance_id):
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.rcParams.update({'font.size': 24})
    insurance_label = dbm.GetAccountLabel(insurance_id)
    # Reads the dataframe
    df = pd.read_csv('%s/%s.csv' % (insurance_label, insurance_label))
    # Drops all-NaN columns
    df = df.dropna(axis='columns', how='all')
    # Calculates the occurrence frequency of each term in the documents
    M = pd.DataFrame.as_matrix(df)
    terms = []
    terms_freq = []
    for row in M:
        terms.append(row[0])
        terms_freq.append(sum(list(row[1:])))
    del M
    # Pairs each term with its frequency
    zipped = zip(terms, terms_freq)
    # Sorts terms by frequency in descending order
    zipped_sorted = sorted(zipped, key=lambda t: t[1], reverse=True)
    # Data array
    data = np.array([_[1] for _ in zipped_sorted])
    data_labels = [_[0] for _ in zipped_sorted]
    # Finds the quartiles and the median
    q1, median, q3 = np.percentile(data, [25, 50, 75])
    print '1st Quartile', q1
    print 'Median ', median
    print '3rd Quartile', q3
    # Draws the bar chart: occurrences per term
    plt.figure(1)
    plt.bar(np.arange(len(data_labels)), data, align='center', alpha=0.5)
    plt.xlabel('Termos', fontsize=16)
    plt.ylabel('Ocorrencias', fontsize=16)
    plt.title('Documentos de %s' % insurance_label)
    plt.savefig('%s/%s_ocorrenceTerms.png'
                % (insurance_label, insurance_label))
    plt.close()
    # Draws the boxplot
    plt.figure(2)
    plt.title('Boxplot de %s' % insurance_label)
    bp = plt.boxplot(data)
    # Sets the label before saving so it actually appears in the figure
    plt.ylabel('Ocorrencias', fontsize=16)
    plt.savefig('%s/%s_boxplot.png'
                % (insurance_label, insurance_label))
    plt.close()
    # Draws the violin plot
    plt.figure(3)
    plt.title('Termos de %s' % insurance_label)
    plt.xlabel('Densidade', fontsize=16)
    plt.ylabel('Ocorrencias', fontsize=16)
    plt.violinplot(data, showmeans=False, showmedians=True)
    plt.savefig('%s/%s_violinplot.png' % (insurance_label, insurance_label))
    CutFile(q3, zipped, df, insurance_label)
    plt.close()
    del df
# Creates log file
logging.basicConfig(filename='posprocessing_outputs.log',
                    level=logging.DEBUG)
seguradoras = dbm.GetAllSeguradoras()
collection = dict()
followers_count = [(i, len(dbm.GetFollowerBySeg(_)))
                   for i, _ in enumerate(seguradoras)]
foo = sorted(followers_count, key=lambda x: x[1], reverse=True)
try:
    PrintVennDiagram(seguradoras[foo[0][0]], seguradoras[foo[1][0]],
                     seguradoras[foo[2][0]])
except IndexError:
    # Not enough accounts to generate the Venn diagram
    print 'Contas insuficientes para gerar diagrama de Venn'
for seguradora in seguradoras:
    try:
        label = dbm.GetAccountLabel(seguradora)