def main():
    try:
        args = parse_args()
        if args.about:
            print(get_about_info())
            return
        dbpath = args.path if args.path else DB_PATH
        db = database.DataBase(dbpath)
        public_id = db.get_public_id()
        set_output_path('%s/%s/' % (get_output_path(), public_id))
        if not os.path.isdir(get_output_path()):
            os.makedirs(get_output_path())
        if args.clear_output:
            # Clean previous output contents
            files = glob.glob('%s/*' % (get_output_path()))
            for f in files:
                os.remove(f)
        common.common_data(db)
        common.alltop_data(db, 10)
        common.zero_data(db)
        common.authors_data(db)
        attachments.attachments_data(db)
        attachments.polls_info(db, 20)
        text_parse.popular_words(db, 200)
        text_parse.get_topics(db)
        timing.drawplots(db)
    except Exception as e:  # Exception, not BaseException: let KeyboardInterrupt/SystemExit propagate
        print(e)
        traceback.print_exc()
        exit(1)
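# Assumed entry point (not shown in this excerpt): a standard guard so the
# analysis only runs when this script is executed directly.
if __name__ == '__main__':
    main()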
def popular_words(db, top_count):
    print_info('Searching popular words...')
    pattern = re.compile("^[a-zA-Zа-яА-Я0-9_]+$")
    alltext = db.select_all_text()  # whole plain text
    words_data = preprocess_text(alltext)  # list of preprocessed words
    allwords_text = ' '.join(words_data)  # text with preprocessed words
    words_data = [x for x in words_data if pattern.match(x)]  # remove non-words
    sorted_words_data = sorted(Counter(words_data).items(),
                               key=lambda kv: kv[1],
                               reverse=True)
    top_words = sorted_words_data[:top_count]  # list of (word, count) tuples
    headers = ['Word', 'Count']
    print("\nTop words:")
    table_values = []
    with open(get_output_path() + "top_words.csv", "w", encoding="utf-8") as f:
        f.write('Word;Count\n')
        for word in top_words:
            f.write('%s;%d\n' % (word[0], word[1]))
            table_values.append(word)
    print(tabulate.tabulate(table_values, headers=headers, numalign="right"))
    print_info('Drawing wordclouds')
    make_wordcloud(allwords_text, get_output_path() + 'allwords.png')
    make_wordcloud(word_data_to_text(top_words),
                   get_output_path() + 'topwords.png')
    make_wordcloud(' '.join(get_hashtags(alltext)),
                   get_output_path() + 'hashtags.png')
    print_info('Done')
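# make_wordcloud is a project helper defined elsewhere. A minimal sketch of
# what it might look like, assuming the `wordcloud` package is the backend;
# the real helper may differ in size, fonts, or stop-word handling.
#
# from wordcloud import WordCloud
#
# def make_wordcloud(text, filename):
#     cloud = WordCloud(width=1920, height=1080).generate(text)
#     cloud.to_file(filename)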
def polls_info(db, count):
    polls = db.get_polls()
    if not polls:
        # Guard against an empty result set; the average below would
        # otherwise divide by zero.
        print("\nPolls data: none found")
        return
    votes = [int(x[6]) for x in polls]
    length = len(votes)
    total_votes = sum(votes)
    average = total_votes / length
    try:
        mode = statistics.mode(votes)
    except statistics.StatisticsError:
        # Several equally common values (raised on Python < 3.8);
        # fall back to the first of the most common ones.
        c = Counter(votes)
        mode = c.most_common(1)[0][0]
    headers = [
        'Parameter', 'Count', 'Total votes', 'Average (Mean)', 'Median',
        'Mode', 'Stdev'
    ]
    values = [
        'Polls', length, total_votes, average,
        statistics.median(votes), mode,
        statistics.pstdev(votes)
    ]
    print("\nPolls data:")
    print(tabulate.tabulate([values],
                            headers=headers,
                            floatfmt=".4g",
                            numalign="right"))
    with open(get_output_path() + "common_polls.csv", "w",
              encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        f.write('%s;%d;%d;%.4g;%.4g;%.4g;%.4g\n' %
                (values[0], values[1], values[2], values[3], values[4],
                 values[5], values[6]))
    print("\nTop polls:")
    headers = ['URL', 'Votes']
    table_values = []
    with open(get_output_path() + "polls.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for i, poll in enumerate(polls):
            values = [poll[4], poll[6]]
            f.write('%s;%d\n' % (values[0], int(values[1])))
            if i < count:  # show only the top `count` polls in the console table
                table_values.append(values)
    print(tabulate.tabulate(table_values,
                            headers=headers,
                            floatfmt=".4g",
                            numalign="right"))
def common_data(db):
    data, names, columns = db.get_common_data()
    headers = [
        'Parameter', 'Count', 'Average (Mean)', 'Median', 'Mode', 'Stdev'
    ]
    print("\nCommon data:")
    count = data[0]  # overall record count
    column_count = 0
    table_values = []
    with open(get_output_path() + "common.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for i, value in enumerate(data):
            if i > 0 and column_count < len(columns):
                table_values.append(
                    common_data_row(db.get_column_data(columns[column_count]),
                                    value, names[i], count, f))
                column_count += 1
            else:
                values = [names[i], value]
                f.write('%s;%d\n' % (values[0], values[1]))
                table_values.append(values)
        data_values = db.get_texts_length()
        table_values.append(
            common_data_row(data_values, sum(data_values), "Text", count, f))
    print(tabulate.tabulate(table_values,
                            headers=headers,
                            floatfmt=".4g",
                            numalign="right"))
def authors_data(db):
    data = db.get_posts_by_authors()
    headers = [
        'Author id', 'Posts', 'Likes', 'Reposts', 'Comments', 'Views',
        'Attachments', 'Text length'
    ]
    print("\nAuthors data:")
    table_values = []
    with open(get_output_path() + "authors.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for i, row in enumerate(data):
            values = list(row[:8])
            f.write('%d;%d;%d;%d;%d;%d;%d;%d\n' % tuple(values))
            if i < 20:  # show only the top 20 authors in the console table
                table_values.append(values)
    print(tabulate.tabulate(table_values,
                            headers=headers,
                            floatfmt=".4g",
                            numalign="right"))
def get_dateposts(name, data, data_range, autolocator=False):
    x = data_range
    y1 = [posts_count(data, str(i)) for i in x]  # posts
    y2 = [get_average(data, str(i), 1) for i in x]  # likes
    y3 = [get_average(data, str(i), 2) for i in x]  # reposts
    y4 = [get_average(data, str(i), 3) for i in x]  # comments
    y5 = [get_average(data, str(i), 4) for i in x]  # views
    y6 = [get_average(data, str(i), 5) for i in x]  # attachments
    y7 = [get_average(data, str(i), 6) for i in x]  # text length
    host = host_subplot(111, axes_class=AA.Axes)
    plt.subplots_adjust(right=0.65, bottom=0.15, left=0.05)
    plt.ticklabel_format(useOffset=False)
    new_fixed_axis = host.get_grid_helper().new_fixed_axis
    plt.xticks([])
    x_range = list(range(len(x)))
    host.tick_params(labelrotation=45)
    host.set_xticks(x_range)
    host.set_xticklabels(x)
    if autolocator:
        plt.gca().xaxis.set_major_locator(ticker.AutoLocator())
        plt.gca().xaxis.set_major_formatter(
            ticker.FuncFormatter(lambda i, pos: get_element(x, i)))
    host.set_ylabel("posts")
    p1, = host.plot(x_range, y1, marker='o', label='posts')
    host.axis["left"].label.set_color(p1.get_color())
    # One extra y-axis per metric; the numeric argument is presumably the
    # horizontal offset of each axis handled by draw_subplot.
    draw_subplot(host, new_fixed_axis, x_range, y2, 0, 'likes / post', '^')
    draw_subplot(host, new_fixed_axis, x_range, y3, 60, 'reposts / post', 'D')
    draw_subplot(host, new_fixed_axis, x_range, y4, 120, 'comments / post', 'v')
    draw_subplot(host, new_fixed_axis, x_range, y5, 180, 'views / post', '.')
    draw_subplot(host, new_fixed_axis, x_range, y6, 240,
                 'attachments / post', 's')
    draw_subplot(host, new_fixed_axis, x_range, y7, 300,
                 'text length / post', 'X')
    host.legend(loc='upper center',
                bbox_to_anchor=(0.5, -0.05),
                fancybox=True,
                shadow=True,
                ncol=7)
    plt.grid(True)
    fig = plt.gcf()
    fig.set_size_inches(15, 6)
    plt.savefig(get_output_path() + name)
    plt.close()
def get_topic_by_year(db, year=None):
    pattern = re.compile("^[a-zA-Zа-яА-Я0-9_]+$")
    alltext = db.select_all_text(year)
    words_data = preprocess_text(alltext)
    words_data = [x for x in words_data if pattern.match(x)]  # remove non-words
    if len(words_data) == 0:
        print_info("Empty dataset!")
        return
    text_data = [words_data]
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    data = models.ldamodel.LdaModel(corpus,
                                    id2word=dictionary,
                                    num_topics=1,
                                    passes=30)
    topics = data.print_topics(num_words=10)
    if year:
        print_info("Topics for %d" % year)
        name = get_output_path() + "topics_%d" % year
    else:
        print_info("Common topics")
        name = get_output_path() + "topics"
    topic_names = []
    topic_data = []
    with open(name + ".csv", "w", encoding="utf-8") as f:
        f.write('Weight;Word\n')
        for topic in topics:
            # Each topic is rendered as '0.015*"word" + 0.013*"other" + ...'
            topic_words = topic[1].split('+')
            for part in topic_words:
                result = part.replace(" ", "").replace("\"", "")
                values = result.split('*')
                topic_data.append(float(values[0]))
                topic_names.append(values[1])
                f.write('%s;%s\n' % (values[0], values[1]))
            print(topic[1])
    topics_plot(topic_names, topic_data, name + ".png")
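# get_topics (called from main) is defined elsewhere in the project. A
# plausible minimal sketch, assuming it runs one overall pass plus one pass
# per year; db.get_years() is a hypothetical accessor, the real code may
# obtain the year range differently.
#
# def get_topics(db):
#     get_topic_by_year(db)  # common topics over the whole dataset
#     for year in db.get_years():
#         get_topic_by_year(db, year)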
def top_data(name, max_values):
    '''Show top data'''
    headers = ['Post id', 'Max']
    print("\n%s extremum data:" % (name))
    table_values = []
    with open(get_output_path() + "extremum_%s.csv" % (name), "w",
              encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for value in max_values:
            values = [value[1], value[0]]
            f.write('%d;%d\n' % (values[0], values[1]))
            table_values.append(values)
    print(tabulate.tabulate(table_values, headers=headers, numalign="right"))
def attachments_data(db):
    data = db.get_attachments_types()
    headers = ['Parameter', 'Count']
    print("\nAttachments data:")
    table_values = []
    with open(get_output_path() + "attachments.csv", "w",
              encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for value in data:
            values = [db.get_attachments_name(value[0]), value[1]]
            f.write('%s;%d\n' % (values[0], values[1]))
            table_values.append(values)
    print(tabulate.tabulate(table_values,
                            headers=headers,
                            floatfmt=".4g",
                            numalign="right"))
def zero_data(db):
    names = ('Likes', 'Reposts', 'Comments', 'Attachments')
    columns = ('likes_count', 'reposts_count', 'comments_count',
               'attachments_count')
    headers = ['Parameter', 'Count']
    print("\nPosts without:")
    table_values = []
    with open(get_output_path() + "zeroes.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for name, column in zip(names, columns):
            values = [name, db.get_zero_data(column)]
            f.write('%s;%d\n' % (values[0], values[1]))
            table_values.append(values)
        values = ['Text', db.get_zero_texts()]
        f.write('%s;%d\n' % (values[0], values[1]))
        table_values.append(values)
    print(tabulate.tabulate(table_values, headers=headers, numalign="right"))