Example #1
def get_prediction(previous_articles: Optional[List[int]],
                   model_name: str = 'tfidf',
                   number_of_predictions: int = NUMBER_OF_PREDICTIONS):

    model = load_recommender(model_name=model_name, topic=TOPIC)

    # get predictions
    try:
        predictions = model.predict(X=previous_articles,
                                    n=number_of_predictions)
    except Exception:
        # fall back to random articles if the input cannot be handled
        predictions = load_raw_data(topic_name=TOPIC)[CONTENT_COLUMN].sample(
            number_of_predictions).index.tolist()
        logger.warning(
            f'Recommender cannot handle input: {previous_articles}. Returning random recommendations'
        )

    return predictions
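A minimal usage sketch of the function above; the article ids and counts are made up purely for illustration:

# hypothetical ids of two previously read articles
recommended = get_prediction(previous_articles=[12, 87],
                             model_name='tfidf',
                             number_of_predictions=5)
print(recommended)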
Example #2
def create_data_for_plotly(X: np.ndarray, labels: List[str], model_name: str,
                           topic: str):

    # get file name from input
    data_plotly_pickle_file = MODELS_DIR / ('pca_tsne_' + topic + '_' +
                                            model_name + '.p')

    # get raw input
    data_raw = load_raw_data(topic_name=topic)

    # limit input to a threshold (None keeps all rows; a positive int caps them)
    max_count = None
    X = X[:max_count, :]
    labels = labels[:max_count]
    outlets = data_raw[NEWS_OUTLET_COLUMN].values[:max_count]

    # It is recommended to use PCA first to reduce to ~50 dimensions
    pca = PCA(n_components=50)
    X_50 = pca.fit_transform(X)

    # Using TSNE to further reduce to 2 dimensions
    model_tsne = TSNE(n_components=2, random_state=0)
    Y = model_tsne.fit_transform(X_50)

    data = pd.DataFrame(
        np.concatenate(
            [Y, outlets.reshape(-1, 1),
             np.array(labels).reshape(-1, 1)],
            axis=1))
    # restore numeric dtypes (np.concatenate casts every column to strings)
    data = data.apply(pd.to_numeric, errors='ignore')
    data = pd.concat([data, data_raw[PROVENANCE_COLUMN].to_frame()], axis=1)
    data.columns = [
        'PC_1', 'PC_2', NEWS_OUTLET_COLUMN, TITLE_COLUMN, PROVENANCE_COLUMN
    ]

    with open(data_plotly_pickle_file, 'wb') as handle:
        pickle.dump(data, handle)

    return data
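Example #5 below reloads this pickle via load_pca_tsne_data; that loader is not part of these snippets, but a minimal sketch under that assumption (only the file naming scheme comes from the function above) could look like:

def load_pca_tsne_data(model_name: str, topic: str) -> pd.DataFrame:
    # mirror the naming scheme used in create_data_for_plotly
    pickle_file = MODELS_DIR / ('pca_tsne_' + topic + '_' + model_name + '.p')
    with open(pickle_file, 'rb') as handle:
        return pickle.load(handle)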
Example #3
def get_sentiment(text):  # hypothetical signature; the original snippet begins mid-function
    document = language_v1.Document(content=text,
                                    type_=language_v1.Document.Type.PLAIN_TEXT)
    annotations = client.analyze_sentiment(request={'document': document})
    sentiment = annotations.document_sentiment

    # return result
    return sentiment.score, sentiment.magnitude


if __name__ == '__main__':

    data = load_raw_data(topic_name='refugees_migration',
                         bool_drop_special_articles=False,
                         bool_drop_duplicates=False,
                         bool_drop_articles_with_pattern=False,
                         bool_drop_non_german_articles=False,
                         bool_drop_outlier=False,
                         bool_number_subheaders=False)

    data[SENTIMENT_COLUMN] = None
    data[MAGNITUDE_COLUMN] = None

    for idx, row in tqdm(data.iterrows(), total=data.shape[0]):

        # get text
        text = row[CONTENT_COLUMN]

        # remove html tags
        text = remove_html_tags(text)
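The function above uses a client that is created outside this excerpt; a minimal sketch of the setup it assumes, using the standard google-cloud-language package:

from google.cloud import language_v1

# credentials are picked up from GOOGLE_APPLICATION_CREDENTIALS
client = language_v1.LanguageServiceClient()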
Example #4
from pathlib import Path
from typing import List, Tuple, Optional

import pandas as pd
from pydantic import validate_arguments

# import custom code
from renewrs.fetch_data import load_raw_data, read_json, add_html_tag
from renewrs.train import load_recommender
from renewrs.config import CONTENT_COLUMN, NUMBER_OF_PREDICTIONS, MODELS_DIR, SOURCE_FILENAME, TITLE_COLUMN
from renewrs.util.logger import setup_logging

from renewrs.experiment.settings import MODEL, TOPIC

# initialize logging
logger = setup_logging(name=Path(__file__).name)

# get all articles
data = load_raw_data(topic_name=TOPIC)
articles = data[CONTENT_COLUMN]


def get_random_articles(n=3) -> pd.Series:
    return articles.sample(n)


def get_random_article_with_id() -> Tuple[int, str]:

    # get sample article as series
    article = articles.sample(1)

    # return scalar id and article text (index[0] unwraps the single-element index)
    return (article.index[0], article.values[0])
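A quick, purely illustrative sketch chaining this helper with get_prediction from Example #1:

# draw a random article and ask for recommendations based on it
article_id, article_text = get_random_article_with_id()
recommendations = get_prediction(previous_articles=[article_id])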
Example #5

if __name__ == '__main__':

    logger = setup_logging(Path(__file__).name)

    logger.info('Getting args')
    args = get_parsed_args()

    if args.create:

        logger.info('Getting model')
        model = load_recommender(model_name=args.model, topic=args.topic)

        logger.info('Getting embedding and indices')
        data = load_raw_data(topic_name=args.topic)

        embedding = model.X_
        labels = model.labels.to_list()

        logger.info('Creating data for plotly')
        data2plot = create_data_for_plotly(X=embedding,
                                           labels=labels,
                                           model_name=args.model,
                                           topic=args.topic)

    else:

        logger.info('Getting data for plotly')
        data2plot = load_pca_tsne_data(model_name=args.model, topic=args.topic)
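The loaded frame carries the columns PC_1, PC_2, NEWS_OUTLET_COLUMN, TITLE_COLUMN and PROVENANCE_COLUMN (see Example #2). The plotting step is not shown in these snippets, but a minimal sketch with plotly express could be:

import plotly.express as px

fig = px.scatter(data2plot, x='PC_1', y='PC_2',
                 color=NEWS_OUTLET_COLUMN,
                 hover_data=[TITLE_COLUMN, PROVENANCE_COLUMN])
fig.show()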
Example #6
    parser.add_argument('--topic',
                        default='refugees_migration',
                        type=str,
                        help='topic choice')

    # get args
    args = parser.parse_args()

    topic = args.topic
    model_name = args.model
    lda = args.lda

    logger.info(f'Training on {topic}')

    # get data
    data = load_raw_data(topic_name=topic)

    if lda:
        # process all documents in raw data
        data_processed = preprocess_raw_data_for_lda(data)

        # get tokenized texts
        logger.info('Getting tokens')
        text_tokenized = [
            doc.split() for doc in data_processed[CONTENT_COLUMN]
        ]

        # create corpus and dictionary
        logger.info('Creating dictionary and corpus')
        dictionary = gensim.corpora.Dictionary(text_tokenized)
        corpus = [dictionary.doc2bow(text) for text in text_tokenized]
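The dictionary and corpus built here are the standard inputs for gensim's LDA; a minimal sketch of the training step that would typically follow (num_topics is an assumed placeholder):

        lda_model = gensim.models.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=10,  # assumed placeholder
                                           random_state=0)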