Example #1
0
def main():
    """Entry point: parse CLI args, open the database and generate all reports.

    Exits with status 1 on any unexpected error (the traceback is printed).
    """
    try:
        args = parse_args()
        if args.about:
            print(get_about_info())
            return
        # Fall back to the default DB location when no path was given.
        dbpath = args.path if args.path else DB_PATH
        db = database.DataBase(dbpath)
        # Reports for each database go into their own subdirectory,
        # named after the database's public id.
        public_id = db.get_public_id()
        set_output_path('%s/%s/' % (get_output_path(), public_id))
        # exist_ok avoids the isdir()/makedirs() check-then-act race.
        os.makedirs(get_output_path(), exist_ok=True)
        if args.clear_output:  # clean output contents
            for f in glob.glob('%s/*' % (get_output_path())):
                os.remove(f)
        common.common_data(db)
        common.alltop_data(db, 10)
        common.zero_data(db)
        common.authors_data(db)
        attachments.attachments_data(db)
        attachments.polls_info(db, 20)
        text_parse.popular_words(db, 200)
        text_parse.get_topics(db)
        timing.drawplots(db)
    except Exception as e:
        # Top-level boundary: report and exit non-zero.  Exception (not
        # BaseException) so KeyboardInterrupt / SystemExit are not swallowed.
        print(e)
        traceback.print_exc()
        exit(1)
Example #2
0
def popular_words(db, top_count):
    """Count word frequencies over all post texts, export the top words
    to top_words.csv, print them and draw word clouds.

    Args:
        db: database wrapper providing select_all_text().
        top_count: how many of the most frequent words to export/show.
    """
    print_info('Searching popular words...')
    # Accept only latin/cyrillic letters, digits and underscore.
    pattern = re.compile("^[a-zA-Zа-яА-Я0-9_]+$")
    alltext = db.select_all_text()  # whole plain text
    words_data = preprocess_text(alltext)  # list of preprocessed words
    allwords_text = ' '.join(words_data)  # text with preprocessed words
    words_data = [x for x in words_data if pattern.match(x)]  # remove non-words
    # most_common() is the stable equivalent of sorting items by count desc.
    top_words = Counter(words_data).most_common(top_count)

    headers = ['Word', 'Count']
    # 'with' guarantees the file is closed even if a write fails.
    with open(get_output_path() + "top_words.csv", "w", encoding="utf-8") as f:
        f.write('Word;Count\n')
        for word, word_count in top_words:
            f.write('%s;%d\n' % (word, word_count))
    print("\nTop words:")
    # top_words is already a list of (word, count) rows — no copy loop needed.
    print(tabulate.tabulate(top_words, headers=headers, numalign="right"))

    print_info('Drawing wordclouds')
    make_wordcloud(allwords_text, get_output_path() + 'allwords.png')
    make_wordcloud(word_data_to_text(top_words), get_output_path() + 'topwords.png')
    make_wordcloud(' '.join(get_hashtags(alltext)), get_output_path() + 'hashtags.png')
    print_info('Done')
def polls_info(db, count):
    """Print and export poll statistics.

    Writes aggregate stats to common_polls.csv, every poll to polls.csv,
    and shows the first ``count`` polls in the console table.

    Args:
        db: database wrapper providing get_polls(); each poll row carries
            its URL at index 4 and its vote count at index 6 (assumed from
            the indexing below — confirm against the schema).
        count: maximum number of polls to show in the console table.
    """
    polls = db.get_polls()
    votes = [int(row[6]) for row in polls]
    length = len(votes)
    # Guard: with no polls the mean/median/stdev below would raise.
    if length == 0:
        print("\nPolls data: no polls found")
        return
    total_votes = sum(votes)
    average = total_votes / length
    try:
        mode = statistics.mode(votes)
    except statistics.StatisticsError:
        # Before Python 3.8, statistics.mode() raises on multimodal input;
        # fall back to the first most-common value.
        mode = Counter(votes).most_common(1)[0][0]
    headers = [
        'Parameter', 'Count', 'Total votes', 'Average (Mean)', 'Median',
        'Mode', 'Stdev'
    ]
    values = [
        'Polls', length, total_votes, average,
        statistics.median(votes), mode,
        statistics.pstdev(votes)
    ]
    print("\nPolls data:")
    print(
        tabulate.tabulate([values],
                          headers=headers,
                          floatfmt=".4g",
                          numalign="right"))
    # 'with' guarantees the file is closed even if a write fails.
    with open(get_output_path() + "common_polls.csv", "w",
              encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        f.write('%s;%d;%d;%.4g;%.4g;%.4g;%.4g\n' %
                (values[0], values[1], values[2], values[3], values[4],
                 values[5], values[6]))
    print("\nTop polls:")
    headers = ['URL', 'Votes']
    table_values = []
    with open(get_output_path() + "polls.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for i, poll in enumerate(polls):
            values = [poll[4], poll[6]]
            f.write('%s;%d\n' % (values[0], int(values[1])))
            # Was `i <= count`, which showed count + 1 rows.
            if i < count:
                table_values.append(values)
    print(
        tabulate.tabulate(table_values,
                          headers=headers,
                          floatfmt=".4g",
                          numalign="right"))
Example #4
0
def common_data(db):
    """Export overall statistics to common.csv and print them as a table.

    db.get_common_data() returns parallel sequences: ``data`` (values),
    ``names`` (row labels) and ``columns`` (database column names for the
    rows that have a per-post distribution).
    """
    data, names, columns = db.get_common_data()
    headers = [
        'Parameter', 'Count', 'Average (Mean)', 'Median', 'Mode', 'Stdev'
    ]
    print("\nCommon data:")
    count = data[0]  # total post count; passed to common_data_row as denominator
    column_count = 0
    table_values = []
    # 'with' guarantees the file is closed even if a write fails.
    with open(get_output_path() + "common.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for i, value in enumerate(data):
            if i > 0 and column_count < len(columns):
                # Rows after the first have a matching column distribution.
                table_values.append(
                    common_data_row(db.get_column_data(columns[column_count]),
                                    value, names[i], count, f))
                column_count += 1
            else:
                values = [names[i], value]
                f.write('%s;%d\n' % (values[0], values[1]))
                table_values.append(values)

        # Text length has no stored column; compute its distribution separately.
        data_values = db.get_texts_length()
        table_values.append(
            common_data_row(data_values, sum(data_values), "Text", count, f))

    print(
        tabulate.tabulate(table_values,
                          headers=headers,
                          floatfmt=".4g",
                          numalign="right"))
Example #5
0
def authors_data(db):
    """Export per-author post statistics to authors.csv and print the
    first 20 authors as a console table.

    db.get_posts_by_authors() rows are indexed 0..7 in the order of
    ``headers`` below.
    """
    data = db.get_posts_by_authors()
    headers = [
        'Author id', 'Posts', 'Likes', 'Reposts', 'Comments', 'Views',
        'Attachments', 'Text length'
    ]
    print("\nAuthors data:")
    table_values = []
    # 'with' guarantees the file is closed even if a write fails.
    with open(get_output_path() + "authors.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for i, row in enumerate(data):
            values = list(row[:8])
            f.write(';'.join('%d' % v for v in values) + '\n')
            # Was `i <= 20`, which showed 21 rows; show only the top 20.
            if i < 20:
                table_values.append(values)
    print(
        tabulate.tabulate(table_values,
                          headers=headers,
                          floatfmt=".4g",
                          numalign="right"))
Example #6
0
def get_dateposts(name, data, data_range, autolocator=False):
    """Plot post counts and per-post averages across ``data_range`` and
    save the figure into the output directory.

    Args:
        name: output image file name (appended to get_output_path()).
        data: dataset consumed by posts_count()/get_average().
        data_range: iterable of x-axis keys; each is stringified before
            lookup (presumably dates/years — confirm against callers).
        autolocator: when True, let matplotlib choose x tick positions
            instead of labelling every value.
    """
    x = data_range
    y1 = [posts_count(data, str(i)) for i in x]  #posts
    y2 = [get_average(data, str(i), 1) for i in x]  #likes
    y3 = [get_average(data, str(i), 2) for i in x]  #reposts
    y4 = [get_average(data, str(i), 3) for i in x]  #comments
    y5 = [get_average(data, str(i), 4) for i in x]  #views
    y6 = [get_average(data, str(i), 5) for i in x]  #attachments
    y7 = [get_average(data, str(i), 6) for i in x]  #text length

    # axisartist host_subplot allows several independent y axes on one plot.
    host = host_subplot(111, axes_class=AA.Axes)
    plt.subplots_adjust(right=0.65, bottom=0.15, left=0.05)
    plt.ticklabel_format(useOffset=False)
    new_fixed_axis = host.get_grid_helper().new_fixed_axis

    plt.xticks([])
    # Plot against evenly spaced integer positions, labelled with the raw
    # values from data_range.
    x_range = [i for i in range(len(x))]
    host.tick_params(labelrotation=45)
    host.set_xticks(x_range)
    host.set_xticklabels(x)
    if autolocator:
        plt.gca().xaxis.set_major_locator(ticker.AutoLocator())
        plt.gca().xaxis.set_major_formatter(
            ticker.FuncFormatter(lambda i, pos: get_element(x, i)))

    host.set_ylabel("posts")
    p1, = host.plot(x_range, y1, marker='o', label='posts')
    host.axis["left"].label.set_color(p1.get_color())

    # One extra axis per metric; the numeric argument looks like a pixel
    # offset for each successive axis — confirm in draw_subplot.
    draw_subplot(host, new_fixed_axis, x_range, y2, 0, 'likes / post', '^')
    draw_subplot(host, new_fixed_axis, x_range, y3, 60, 'reposts / post', 'D')
    draw_subplot(host, new_fixed_axis, x_range, y4, 120, 'comments / post',
                 'v')
    draw_subplot(host, new_fixed_axis, x_range, y5, 180, 'views / post', '.')
    draw_subplot(host, new_fixed_axis, x_range, y6, 240, 'attachments / post',
                 's')
    draw_subplot(host, new_fixed_axis, x_range, y7, 300, 'text length / post',
                 'X')

    # Legend below the plot, one column per line.
    host.legend(loc='upper center',
                bbox_to_anchor=(0.5, -0.05),
                fancybox=True,
                shadow=True,
                ncol=7)
    plt.grid(True)

    fig = plt.gcf()
    fig.set_size_inches(15, 6)
    plt.savefig(get_output_path() + name)
    plt.close()
Example #7
0
def get_topic_by_year(db, year=None):
    """Run LDA topic modelling over the posts of ``year`` (or all posts
    when ``year`` is None), export the topic words to CSV and plot them.

    Args:
        db: database wrapper providing select_all_text(year).
        year: optional year filter; also used in output file names.
    """
    # Accept only latin/cyrillic letters, digits and underscore.
    pattern = re.compile("^[a-zA-Zа-яА-Я0-9_]+$")
    alltext = db.select_all_text(year)
    words_data = preprocess_text(alltext)
    words_data = [x for x in words_data if pattern.match(x)]  # remove non-words
    if not words_data:
        print_info("Empty dataset!")
        return
    text_data = [words_data]
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    data = models.ldamodel.LdaModel(corpus, id2word=dictionary,
        num_topics=1, passes=30)
    topics = data.print_topics(num_words=10)
    if year:
        print_info("Topics for %d" % year)
        name = get_output_path() + "topics_%d" % year
    else:
        print_info("Common topics")
        name = get_output_path() + "topics"
    topic_names = []
    topic_data = []
    # 'with' guarantees the file is closed even if a write fails.
    with open(name + ".csv", "w", encoding="utf-8") as f:
        f.write('Weight;Word\n')
        for topic in topics:
            # Each topic string looks like '0.012*"word" + 0.010*"other" + ...'
            for term in topic[1].split('+'):
                cleaned = term.replace(" ", "").replace("\"", "")
                parts = cleaned.split('*')
                topic_data.append(float(parts[0]))
                topic_names.append(parts[1])
                f.write('%s;%s\n' % (parts[0], parts[1]))
            print(topic[1])
    topics_plot(topic_names, topic_data, name + ".png")
Example #8
0
def top_data(name, max_values):
    """Export extremum values to extremum_<name>.csv and print them.

    Args:
        name: label of the statistic (used in the file name and heading).
        max_values: sequence of pairs with the extremum value at index 0
            and the post id at index 1.
    """
    headers = ['Post id', 'Max']
    print("\n%s extremum data:" % (name))
    table_values = []
    # 'with' guarantees the file is closed even if a write fails.
    with open(get_output_path() + "extremum_%s.csv" % (name),
              "w",
              encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for pair in max_values:
            # Display order is (post id, value), the reverse of the input.
            values = [pair[1], pair[0]]
            f.write('%d;%d\n' % (values[0], values[1]))
            table_values.append(values)
    print(tabulate.tabulate(table_values, headers=headers, numalign="right"))
def attachments_data(db):
    """Export attachment-type counts to attachments.csv and print them.

    db.get_attachments_types() rows hold a type id at index 0 (resolved to
    a display name via db.get_attachments_name) and a count at index 1.
    """
    data = db.get_attachments_types()
    headers = ['Parameter', 'Count']
    print("\nAttachments data:")
    table_values = []
    # 'with' guarantees the file is closed even if a write fails.
    with open(get_output_path() + "attachments.csv", "w",
              encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        for row in data:
            values = [db.get_attachments_name(row[0]), row[1]]
            f.write('%s;%d\n' % (values[0], values[1]))
            table_values.append(values)
    print(
        tabulate.tabulate(table_values,
                          headers=headers,
                          floatfmt=".4g",
                          numalign="right"))
Example #10
0
def zero_data(db):
    """Count posts lacking likes/reposts/comments/attachments/text;
    export the counts to zeroes.csv and print them as a table.
    """
    names = ('Likes', 'Reposts', 'Comments', 'Attachments')
    columns = ('likes_count', 'reposts_count', 'comments_count',
               'attachments_count')
    headers = ['Parameter', 'Count']
    print("\nPosts without:")
    table_values = []
    # 'with' guarantees the file is closed even if a write fails.
    with open(get_output_path() + "zeroes.csv", "w", encoding="utf-8") as f:
        f.write(";".join(headers) + '\n')
        # zip pairs each display label with its database column.
        for label, column in zip(names, columns):
            values = [label, db.get_zero_data(column)]
            f.write('%s;%d\n' % (values[0], values[1]))
            table_values.append(values)
        # Posts with empty text are queried separately.
        values = ['Text', db.get_zero_texts()]
        f.write('%s;%d\n' % (values[0], values[1]))
        table_values.append(values)
    print(tabulate.tabulate(table_values, headers=headers, numalign="right"))