Example no. 1
def main():
    texts = pd.read_csv(os.path.join(DATA_DIR, "texts-news.csv"), chunksize=1000)
    normalizer = GasparettiTextNormalizer()
    counter_docs_with_word = Counter()
    total_num_of_docs = 0
    for chunk in texts:
        for index, row in chunk.iterrows():
            total_num_of_docs += 1
            text = row["text"]
            if isinstance(text, str):
                # Count each word at most once per document (document frequency).
                words_in_doc = set(itertools.chain(*normalizer.normalized_sentences(text)))
                for word in words_in_doc:
                    counter_docs_with_word[word] += 1

    with open(os.path.join(DATA_DIR, 'idf_dragnet.txt'), 'w+') as fout:
        _, words = FileReadUtil.load_fasttext(os.path.join(DATA_DIR, "news_dragnet.vec"))
        # First line: vocabulary size; then one log-IDF value per word,
        # in the same order as the fastText vocabulary.
        fout.write(str(len(words)) + '\n')
        for word in words:
            c = counter_docs_with_word[word]
            if c == 0:
                idf = 0
            else:
                idf = np.log(total_num_of_docs / c)
            fout.write(str(idf))
            fout.write('\n')
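
The file written above has a fixed layout: the vocabulary size on the first line, then one log-IDF value per line, in the same order as the words of news_dragnet.vec. A minimal reader for that layout (the load_idf helper is our own sketch, not part of the original code):

def load_idf(path):
    # First line: vocabulary size; remaining lines: one log-IDF value each,
    # aligned with the word order returned by FileReadUtil.load_fasttext.
    with open(path) as f:
        expected = int(f.readline())
        idf_values = [float(line) for line in f]
    assert len(idf_values) == expected
    return idf_values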
Example no. 2
def generate_data_for_bigartm(file_path):
    texts = pd.read_csv(file_path, chunksize=10000)
    normalizer = GasparettiTextNormalizer()
    if not os.path.exists(os.path.join(BIGARTM_DIR, "gasparetti.txt")):
        with open(os.path.join(BIGARTM_DIR, "gasparetti.txt"), "w+") as f:
            for chunk in texts:
                for index, row in chunk.iterrows():
                    text = normalizer.normalized_sentences(row["text"])
                    url = row["url"]
                    # One document per line: the URL followed by its normalized tokens.
                    words_in_doc = itertools.chain(*text)
                    f.write(url + " " + " ".join(words_in_doc) + "\n")
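
Each line of gasparetti.txt ("url token1 token2 ...") looks like BigARTM's Vowpal Wabbit-style input, which is presumably how the generated file is meant to be consumed. A minimal sketch under that assumption (paths are illustrative):

import artm

# Convert the generated file into BigARTM batches for topic modelling.
batch_vectorizer = artm.BatchVectorizer(data_path="gasparetti.txt",
                                        data_format="vowpal_wabbit",
                                        target_folder="gasparetti_batches")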
Example no. 3
def compute_score_topic_modeling(score_cmp=None,
                                 min_sentence_len=6,
                                 topic_cos_threshold=0.7,
                                 news_clustering_threshold=0.025,
                                 news_clustering_min_cluster_size=4,
                                 stories_clustering_threshold=0.25,
                                 stories_clustering_min_cluster_size=2,
                                 ngrams_for_topics_labelling=3,
                                 stories_connecting_cos_threshold=0.6,
                                 story_window=4,
                                 lexic_result_word_num=10,
                                 scale_dist=100,
                                 verbose=False,
                                 input_file_path="gasparetti_small.csv",
                                 start='10.03.2014',
                                 end='26.03.2014'):
    articles_input = NewsGasparettiInput(input_file_path)
    text_normalizer = GasparettiTextNormalizer()

    start = datetime.datetime.strptime(start, '%d.%m.%Y').replace(tzinfo=datetime.timezone.utc)
    end = datetime.datetime.strptime(end, '%d.%m.%Y').replace(tzinfo=datetime.timezone.utc)

    embedding_file_path = os.path.join(MODELS_DIR, "news_dragnet.vec")
    idf_file_path = os.path.join(MODELS_DIR, 'idf_dragnet.txt')
    cluster_centroids_file_path = os.path.join(MODELS_DIR, 'cluster_centroids_filtered.txt')
    cluster_names_file_path = os.path.join(MODELS_DIR, 'cluster_names_filtered.txt')
    topics_matching_file_path = os.path.join(MODELS_DIR, 'topic_matching.txt')

    params_logging_str = f"FROM_DATE: {start}\n" \
                         f"TO_DATE: {end}\n\n" \
                         f"EMBEDDING_FILE_PATH: {embedding_file_path}\n" \
                         f"IDF_FILE_PATH: {idf_file_path}\n" \
                         f"CLUSTER_CENTROIDS_FILE_PATH: {cluster_centroids_file_path}\n\n" \
                         f"MIN_SENTENCE_LEN: {min_sentence_len}\n" \
                         f"TOPIC_COS_THRESHOLD: {topic_cos_threshold}\n" \
                         f"NEWS_CLUSTERING_THRESHOLD: {news_clustering_threshold}\n" \
                         f"NEWS_CLUSTERING_MIN_CLUSTER_SIZE: {news_clustering_min_cluster_size}\n" \
                         f"STORIES_CLUSTERING_THRESHOLD: {stories_clustering_threshold}\n" \
                         f"STORIES_CLUSTERING_MIN_CLUSTER_SIZE: {stories_clustering_min_cluster_size}\n" \
                         f"NGRAMS_FOR_TOPICS_LABELLING: {ngrams_for_topics_labelling}\n" \
                         f"STORIES_CONNECTING_COS_THRESHOLD: {stories_connecting_cos_threshold}\n" \
                         f"STORY_WINDOW: {story_window}\n" \
                         f"LEXIC_RESULT_WORD_NUM: {lexic_result_word_num}\n" \
                         f"SCALE_DIST: {sclale_dist}\n"
    logging.info('Parameters used:\n' + params_logging_str)
    processor = TopicsScript(
        StartupParams(start, end),
        ProcessingParams(embedding_file_path, idf_file_path, cluster_centroids_file_path,
                         cluster_names_file_path, topics_matching_file_path, min_sentence_len,
                         topic_cos_threshold,
                         news_clustering_threshold,
                         news_clustering_min_cluster_size, stories_clustering_threshold,
                         stories_clustering_min_cluster_size, ngrams_for_topics_labelling,
                         stories_connecting_cos_threshold, story_window, lexic_result_word_num, scale_dist))
    topic_news = processor.run(articles_input, text_normalizer, verbose=verbose)
    dict_clusters = dict()
    # Map each article id (its URL) to the id of the predicted topic cluster.
    for cluster_id in topic_news:
        articles = topic_news[cluster_id]
        for article in articles:
            dict_clusters[article.id] = cluster_id

    output_clusters = pd.DataFrame(columns=["url", "timestamp", "story_id_predicted", "story_id"])
    # Articles that were not assigned to any cluster fall back to the dummy id "0".
    for index, row in articles_input.df.iterrows():
        cluster_id = dict_clusters.get(row["url"], "0")
        output_clusters.loc[index] = [row["url"], row["timestamp"], cluster_id, row["story"]]
    if score_cmp:
        score = score_cmp.compute_score(output_clusters["story_id_predicted"].to_list())
        logging.info('Score : ' + str(score) + "\n")
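
The score_cmp argument only needs to expose a compute_score(predicted_labels) method, as used above. A hypothetical comparator based on the adjusted Rand index (the ARIScore class and the choice of metric are our own illustration, and they assume articles_input.df preserves the CSV row order):

import pandas as pd
from sklearn.metrics import adjusted_rand_score


class ARIScore:
    def __init__(self, true_story_ids):
        self.true_story_ids = true_story_ids

    def compute_score(self, predicted_story_ids):
        # Compare predicted clusters against the ground-truth "story" column.
        return adjusted_rand_score(self.true_story_ids, predicted_story_ids)


true_stories = pd.read_csv("gasparetti_small.csv")["story"].to_list()
compute_score_topic_modeling(score_cmp=ARIScore(true_stories), verbose=True)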
Example no. 4
import pandas as pd
import os
from com.expleague.media_space.topics.embedding_model import GasparettiTextNormalizer

DATA_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data")

if __name__ == "__main__":
    texts = pd.read_csv(os.path.join(DATA_DIR, "parsed-texts-dragnet.csv"),
                        chunksize=1000)
    normalizer = GasparettiTextNormalizer()
    # Flatten every document into one long whitespace-separated token stream.
    with open(os.path.join(DATA_DIR, 'data.txt'), 'w+') as the_file:
        for chunk in texts:
            for index, row in chunk.iterrows():
                text = row["text"]
                if isinstance(text, str):
                    for words in normalizer.normalized_sentences(text):
                        the_file.write(" ".join(words) + " ")
            the_file.flush()
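
data.txt is a plain whitespace-separated corpus. Assuming it is the training input for the news_dragnet.vec embeddings loaded in the other examples (an assumption; the original training step is not shown), a minimal fastText sketch:

import fasttext

# Train skip-gram embeddings on the flattened corpus
# (hyper-parameters are illustrative, not taken from the original project).
model = fasttext.train_unsupervised("data.txt", model="skipgram", dim=100)

# Dump vectors in the textual .vec layout: a "<vocab_size> <dim>" header,
# then one "word v1 v2 ... vN" line per word.
with open("news_dragnet.vec", "w") as out:
    words = model.get_words()
    out.write(f"{len(words)} {model.get_dimension()}\n")
    for word in words:
        vector = " ".join(str(v) for v in model.get_word_vector(word))
        out.write(f"{word} {vector}\n")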
Example no. 5
import pandas as pd
import numpy as np
import os
from com.expleague.media_space.topics.embedding_model import GasparettiTextNormalizer
import itertools

DATA_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data")

if __name__ == "__main__":
    texts = pd.read_csv(os.path.join(DATA_DIR, "parsed-texts-dragnet.csv"))
    normalizer = GasparettiTextNormalizer()
    output_df = pd.DataFrame(columns=texts.columns)

    for index, row in texts.iterrows():
        text = row["text"]
        title = row["title"]
        timevar = row["timestamp"]
        if isinstance(text, str) and timevar.isdigit():
            # Skip paywalled placeholder pages.
            if "We are sorry, you need to be a subscriber" in text:
                continue
            if "Please purchase a subscription" in text:
                continue
            sentence_texts = set(
                itertools.chain.from_iterable(
                    normalizer.normalized_sentences(text)))
            sentence_title = set(
                itertools.chain.from_iterable(
                    normalizer.normalized_sentences(title)))
            if (len(sentence_texts) < 100) or (len(
                    sentence_texts.intersection(sentence_title)) == 0):
Example no. 6
                               dtype={
                                   "ID": int,
                                   "TITLE": str,
                                   "URL": str,
                                   "CATEGORY": str,
                                   "STORY": str,
                                   "HOSTNAME": str,
                                   "TIMESTAMP": int
                               })
    # news_corpora.set_index('URL', inplace=True)
    return news_corpora


if __name__ == "__main__":
    texts = read_news_corpora()
    normalizer = GasparettiTextNormalizer()

    def convert_epoch(ts):
        # TIMESTAMP values are epoch milliseconds.
        return datetime.utcfromtimestamp(int(ts) / 1000)

    times = defaultdict(list)
    times_epoch = defaultdict(list)
    num_errors = 0
    for index, row in texts.iterrows():
        time_epoch = row["TIMESTAMP"]
        times_epoch[row["STORY"]].append(int(time_epoch))
        try:
            date = convert_epoch(time_epoch)
        except ValueError:
            num_errors += 1
            # print("Error:", row)
Example no. 7
import pandas as pd
import os
from com.expleague.media_space.topics.embedding_model import GasparettiTextNormalizer
import itertools

DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data")

if __name__ == "__main__":
    texts = pd.read_csv(os.path.join(DATA_DIR, "gasparetti_small.csv"), chunksize=1000)
    normalizer = GasparettiTextNormalizer()
    if not os.path.exists(os.path.join(DATA_DIR, "bigartm", "gasparetti.txt")):
        with open(os.path.join(DATA_DIR, "bigartm", "gasparetti.txt"), "w+") as f:
            for chunk in texts:
                for index, row in chunk.iterrows():
                    text = normalizer.normalized_sentences(row["text"])
                    url = row["url"]
                    # One document per line: the URL followed by its normalized tokens.
                    words_in_doc = itertools.chain(*text)
                    f.write(url + " " + " ".join(words_in_doc) + "\n")