def generate_topics_words_tweets_stats(*, vocab_tweets_index_path: str,
                                       topics_words_csv_path: str,
                                       topics_words_md_path: str):
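    # Cross-reference the topics/words CSV with the vocab-tweets index and write
    # a markdown table showing how many tweets mention each word.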
    topics_words = []
    topics_words_index = {}

    for i, r in enumerate(dictlines_from_csv(topics_words_csv_path)):
        topics_words.append(r)
        topics_words_index[r['Word']] = i

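    # Attach a tweet count to each word that also appears in the vocab-tweets index.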
    for row in next_vocab_tweets_index(vocab_tweets_index_path):
        if row['Word'] in topics_words_index:
            i = topics_words_index[row['Word']]
            topics_words[i]['tweets'] = len(row['Tweet_Ids'].split(','))

    fields = ['Word/Name', 'Number of Tweets', 'Meaning']
    fields_align = [MdCellAlign.left, MdCellAlign.left, MdCellAlign.center]
    with open(topics_words_md_path, "w", encoding='utf-8') as f_out:
        md_static_part = f"""{h1('Study Tweets Containing Topics/Words')}
        
{table_header(fields, fields_align)}
"""
        f_out.write(md_static_part)
        for word in topics_words:
            hanzi = word['Word']
            row = [
                link(hanzi, f"{hanzi}.md"),
                str(word.get('tweets', 0)), word['meaning']
            ]
            f_out.write(f"{table_row(row)}\n")
def write_word_card(*, title: str, tweet_data: List[Mapping[str, Mapping]],
                    cards_study_folder: str, tweets_per_page: int):
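    # Write the tweets for one word as a series of markdown pages, splitting
    # every `tweets_per_page` tweets and linking the pages together.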
    num_tweets = len(tweet_data)
    pages = (num_tweets //
             tweets_per_page) + (1 if num_tweets % tweets_per_page > 0 else 0)
    logging.info(
        f"Generating {title}, number of tweets {len(tweet_data)}, pages = {pages}"
    )
    tweets_title_heading = f"Tweets containing {title}"
    word_static_md_part = f"""{h1(title)}

Search {link('mdbg', mdbg_link(title=title))} for definition

Search {link('wiktionary', wiktionary_link(title=title))} for definition

{h3(tweets_title_heading)}

"""
    for page in range(pages):
        word_md_filepath = f"{cards_study_folder}/{word_md_filename(title=title, page=page)}"
        logging.info(f"page {page}, pages {pages}, {word_md_filepath}")
        with open(word_md_filepath, "w", encoding='utf-8') as f_out:
            link_text = word_previous_next_links(title=title,
                                                 page=page,
                                                 pages=pages)

            if link_text:
                f_out.write(f"{link_text}\n")

            f_out.write(word_static_md_part)
            low = page * tweets_per_page
            high = min(low + tweets_per_page, num_tweets)

            for i in range(low, high):
                tweet = tweet_data[i]
                f_out.write(f"{hr()}\n")
                date_source = f"{tweet['Date']} ~ {tweet['Source']}"
                f_out.write(f"{h5(date_source)}\n")
                tweet_text = tweet['Tweet']
                f_out.write(f"{blockquote(tweet_text)}\n")
                f_out.write(
                    f"\n{link('Google Translation', googtrans_link(source_text=tweet_text))}\n"
                )

                if len(tweet['Words']) > 1:
                    f_out.write(
                        f"{h5('Other Words/Names of Interest in the Above Tweet')}\n"
                    )
                    buffer = []
                    other_words = sorted(tweet['Words'].difference({title}))

                    for other_word in other_words:
                        buffer.append(link(other_word, f"{other_word}.md"))

                    f_out.write(f"{', '.join(buffer)}\n")

            if link_text:
                f_out.write(f"____\n\n{link_text}\n")
def word_previous_next_links(*, title: str, page: int, pages: int) -> str:
    """

    :param title:
    :param page: zero-based
    :param pages: total number of pages
    :return:
    """
    if pages == 1:
        return ''

    next_page = link("Next Page", word_md_filename(title=title, page=page + 1))
    prev_page = link("Previous Page",
                     word_md_filename(title=title, page=page - 1))

    if page == 0:
        return next_page
    elif page == pages - 1:
        return prev_page
    else:
        return f"{prev_page} | {next_page}"
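# With pages == 3, for example, page 0 gets only a "Next Page" link, page 2 only
# a "Previous Page" link, and page 1 gets both, joined with " | ".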
Example 4
def generate_vocab_tweets_stats(*, vocab_tweets_index_path: str,
                                cards_study_folder: str):
    vocab_tweets = [(row['Word'], len(row['Tweet_Ids'].split(',')))
                    for row in next_vocab_tweets_index(vocab_tweets_index_path)
                    ]
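    # Rank words by how many tweets mention them, most-tweeted first.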
    vocab_tweets.sort(key=lambda data: data[1], reverse=True)
    fields = ['Rank', 'Word/Name', 'Number of Tweets']
    fields_align = [MdCellAlign.left, MdCellAlign.center, MdCellAlign.center]
    with open(f"{cards_study_folder}/words_tweets_stats.md",
              "w",
              encoding='utf-8') as f_out:
        md_static_part = f"""{h1('Study Words/Names Statistics')}
        
{table_header(fields, fields_align)}
"""
        f_out.write(md_static_part)
        for rank, (word, num_tweets) in enumerate(vocab_tweets, start=1):
            row = [str(rank), link(word, f"{word}.md"), str(num_tweets)]
            f_out.write(f"{table_row(row)}\n")
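# Illustrative call (paths are placeholders, not the project's real layout):
# generate_vocab_tweets_stats(vocab_tweets_index_path='output/vocab_tweets_index.csv',
#                             cards_study_folder='docs/cards')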
Example 5
def generate_companies_tweets_stats(*, vocab_tweets_index_path: str,
                                    cards_study_folder: str,
                                    companies_csv_path: str):
    companies_tickers = []
    companies_index = {}

    for i, r in enumerate(dictlines_from_csv(companies_csv_path)):
        companies_tickers.append(r)
        companies_index[r['Chinese Name']] = i

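    # Attach a tweet count to each company whose Chinese name appears in the
    # vocab-tweets index.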
    for row in next_vocab_tweets_index(vocab_tweets_index_path):
        if row['Word'] in companies_index:
            i = companies_index[row['Word']]
            companies_tickers[i]['tweets'] = len(row['Tweet_Ids'].split(','))

    fields = [
        'Chinese Name', 'English Name', 'U.S. Ticker', 'Number of Tweets'
    ]
    fields_align = [
        MdCellAlign.left, MdCellAlign.left, MdCellAlign.center,
        MdCellAlign.center
    ]
    with open(f"{cards_study_folder}/companies_tweets_stats.md",
              "w",
              encoding='utf-8') as f_out:
        md_static_part = f"""{h1('Companies/Brands  ')}
        
{table_header(fields, fields_align)}
"""
        f_out.write(md_static_part)
        for company in companies_tickers:
            hanzi = company['Chinese Name']
            row = [
                link(hanzi, f"{hanzi}.md"), company['English Name'],
                company['Ticker'],
                str(company.get('tweets', 0))
            ]
            f_out.write(f"{table_row(row)}\n")
def generate_curated_words_study(*, words_toml_path: str,
                                 curated_words_study_path: str):
    _md_template = [
        f"""{h1(f"A Subset of Curated Words Extracted From Tweets")}

To learn how the words are used in context, read the tweets by clicking or tapping on the Chinese words. For a complete 
list of the curated words, check it out {link('here', 'words_tweets_stats.md')}.
"""
    ]

    toml_words = toml.load(words_toml_path)
    fields_align = [MdCellAlign.left, MdCellAlign.center]
    pprint(toml_words)
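    # One two-column table per category: the hanzi linked to its card page, plus
    # its English gloss.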
    for name, heading in toml_words['category-names'].items():
        _md_template.append(h2(heading))
        _md_template.append(table_header(['', ''], fields_align))
        for word in toml_words[name]:
            _md_template.append(
                table_row([link(word['hz'], f"{word['hz']}.md"), word['en']]))

    with open(curated_words_study_path, "w", encoding='utf-8') as fh:
        fh.write('\n'.join(_md_template))
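# The words TOML file is expected to look roughly like this (keys inferred from
# the code above, values are placeholders):
#   [category-names]
#   people = "People"
#
#   [[people]]
#   hz = "<hanzi>"
#   en = "<English meaning>"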
Example 7
    def write(self, md_filepath: str, title: str, reverse_sort: bool = True):
        self._report_data.sort(key=lambda tw: tw.tweet_date,
                               reverse=reverse_sort)
        mdtb_rows = []

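        # One table row per tweet: UTC date, source, and the tweet text linked to
        # its Google translation.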
        for attrs in self._report_data:
            link = googtrans_link(source_text=attrs.tweet_text)
            mdtb_rows.append(
                md.table_row([
                    str(attrs.tweet_date), attrs.tweet_source,
                    md.link(attrs.tweet_text, link)
                ]))

        tbl = '\n'.join(mdtb_rows)
        report = f"""## {title} 

Tweets with [{title}](https://en.wiktionary.org/wiki/{title}). Tap or click to check if Wiktionary has an entry for it.

| UTC Date | Tweet Source | Tweet (click or tap to see Google Translation) |
|:-----------------|:-------------|:------------------|  
{tbl}
"""
        with open(md_filepath, "w", encoding='utf-8') as f_out:
            f_out.write(report)
def new_write_word_card(summarized_tweets_words: Mapping[str, Mapping],
                        cards_study_folder: str, tweets_per_page: int,
                        word_tweets: WordAndTweets):
    tweet_data = [
        summarized_tweets_words[tweet_id] for tweet_id in word_tweets.tweet_ids
    ]
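    # Newest tweets first.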
    tweet_data.sort(key=lambda tw: tw['Date'], reverse=True)
    num_tweets = len(tweet_data)
    title = word_tweets.word
    pages = (num_tweets //
             tweets_per_page) + (1 if num_tweets % tweets_per_page > 0 else 0)
    logging.info(
        f"Generating {title}, number of tweets {len(tweet_data)}, pages = {pages}"
    )
    tweets_title_heading = f"Tweets containing {title}"
    word_static_md_part = f"""{h1(title)}

Search {link('mdbg', mdbg_link(title=title))} for definition

Search {link('wiktionary', wiktionary_link(title=title))} for definition

{h3(tweets_title_heading)}

"""
    for page in range(pages):
        word_md_filepath = f"{cards_study_folder}/{word_md_filename(title=title, page=page)}"
        logging.info(f"page {page}, pages {pages}, {word_md_filepath}")
        with open(word_md_filepath, "w", encoding='utf-8') as f_out:
            link_text = word_previous_next_links(title=title,
                                                 page=page,
                                                 pages=pages)

            md_page_buffer = [
                f"{link_text}\n{word_static_md_part}"
                if link_text else word_static_md_part
            ]
            low = page * tweets_per_page
            high = min(low + tweets_per_page, num_tweets)

            for i in range(low, high):
                tweet = tweet_data[i]
                date_source = f"{tweet['Date']} ~ {tweet['Source']}"
                tweet_text = tweet['Tweet']
                md_tweet_body = f"""{hr()}

{h5(date_source)}

{blockquote(tweet_text)}

{link('Google Translation', googtrans_link(source_text=tweet_text))}

"""
                md_page_buffer.append(md_tweet_body)

                if len(tweet['Words']) > 1:
                    buffer = []
                    other_words = sorted(tweet['Words'].difference({title}))

                    for other_word in other_words:
                        buffer.append(link(other_word, f"{other_word}.md"))

                    md_other_words = f"""{h5('Other Words/Names of Interest in the Above Tweet')}

{', '.join(buffer)}

"""
                    md_page_buffer.append(md_other_words)

            if link_text:
                md_bottom_link = f"""____

{link_text}

"""
                md_page_buffer.append(md_bottom_link)

            f_out.writelines(md_page_buffer)