def get_prediction(previous_articles: Optional[List[int]],
                   model_name: str = 'tfidf',
                   number_of_predictions: int = NUMBER_OF_PREDICTIONS):
    """Return recommended article ids for a user's reading history.

    Loads the trained recommender for the configured ``TOPIC`` and asks it
    for ``number_of_predictions`` recommendations. If the recommender
    raises on the given input, degrades gracefully to a random sample of
    article ids from the raw corpus instead of propagating the error.
    """
    recommender = load_recommender(model_name=model_name, topic=TOPIC)
    try:
        return recommender.predict(X=previous_articles,
                                   n=number_of_predictions)
    except Exception:
        # Best-effort fallback: any failure in the model yields random
        # recommendations so the caller always gets a usable result.
        logger.warning(
            f'Recommender cannot handle input: {previous_articles}. Returning random recommendations'
        )
        corpus = load_raw_data(topic_name=TOPIC)[CONTENT_COLUMN]
        return corpus.sample(number_of_predictions).index.tolist()
def create_data_for_plotly(X: np.ndarray, labels: List[str], model_name: str,
                           topic: str):
    """Reduce article embeddings to 2-D (PCA -> t-SNE) and pickle a plot frame.

    Parameters
    ----------
    X : np.ndarray
        Embedding matrix, one row per article (aligned with ``labels``).
    labels : List[str]
        Article titles, one per row of ``X``.
    model_name, topic : str
        Used to build the pickle file name under ``MODELS_DIR``.

    Returns
    -------
    pd.DataFrame
        Columns: PC_1, PC_2, news outlet, title, provenance. Also written
        to ``MODELS_DIR / 'pca_tsne_<topic>_<model_name>.p'``.
    """
    # get file name from input
    data_plotly_pickle_file = MODELS_DIR / ('pca_tsne_' + topic + '_' +
                                            model_name + '.p')
    # get raw input
    data_raw = load_raw_data(topic_name=topic)
    # BUG FIX: the old sentinel was -1, which slices [:-1] and silently
    # drops the LAST row of X/labels/outlets — those truncated arrays were
    # then concatenated against the full-length provenance column below,
    # misaligning the frame. None means "no limit"; set a positive int
    # to debug on a subset.
    max_count = None
    X = X[:max_count, :]
    labels = labels[:max_count]
    outlets = data_raw[NEWS_OUTLET_COLUMN].values[:max_count]
    # It is recommended to use PCA first to reduce to ~50 dimensions
    pca = PCA(n_components=50)
    X_50 = pca.fit_transform(X)
    # Using TSNE to further reduce to 2 dimensions
    model_tsne = TSNE(n_components=2, random_state=0)
    Y = model_tsne.fit_transform(X_50)
    data = pd.DataFrame(
        np.concatenate(
            [Y, outlets.reshape(-1, 1),
             np.array(labels).reshape(-1, 1)],
            axis=1))
    # np.concatenate casts everything to object/str; convert the numeric
    # coordinate columns back, leaving text columns untouched.
    data = data.apply(pd.to_numeric, errors='ignore')
    data = pd.concat([data, data_raw[PROVENANCE_COLUMN].to_frame()], axis=1)
    data.columns = [
        'PC_1', 'PC_2', NEWS_OUTLET_COLUMN, TITLE_COLUMN, PROVENANCE_COLUMN
    ]
    with open(data_plotly_pickle_file, 'wb') as handle:
        pickle.dump(data, handle)
    return data
# NOTE(review): this is the tail of a sentiment-analysis helper whose `def`
# line is outside this view; `text` and `client` are bound there.
# `language_v1` is presumably google.cloud.language_v1 — confirm imports.
document = language_v1.Document(content=text,
                                type_=language_v1.Document.Type.PLAIN_TEXT)
annotations = client.analyze_sentiment(request={'document': document})
sentiment = annotations.document_sentiment
# return result: (score, magnitude) of the whole document's sentiment
return sentiment.score, sentiment.magnitude


if __name__ == '__main__':
    # Load the full corpus with every cleaning/filter step disabled, so a
    # sentiment score is computed for every article.
    data = load_raw_data(topic_name='refugees_migration',
                         bool_drop_special_articles=False,
                         bool_drop_duplicates=False,
                         bool_drop_articles_with_pattern=False,
                         bool_drop_non_german_articles=False,
                         bool_drop_outlier=False,
                         bool_number_subheaders=False)
    # Pre-create result columns; filled in row by row below.
    data[SENTIMENT_COLUMN] = None
    data[MAGNITUDE_COLUMN] = None
    # NOTE(review): loop body continues past this view.
    for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
        # get text
        text = row[CONTENT_COLUMN]
        # remove html tags
        text = remove_html_tags(text)
from pydantic import validate_arguments
from typing import List, Tuple, Optional
# import custom code
from renewrs.fetch_data import load_raw_data, read_json, add_html_tag
from renewrs.train import load_recommender
from renewrs.config import CONTENT_COLUMN, NUMBER_OF_PREDICTIONS, MODELS_DIR, SOURCE_FILENAME, TITLE_COLUMN
from renewrs.util.logger import setup_logging
from renewrs.experiment.settings import MODEL, TOPIC

# initialize logging
logger = setup_logging(name=Path(__file__).name)

# get all articles once at import time; both helpers below sample from it
data = load_raw_data(topic_name=TOPIC)
articles = data[CONTENT_COLUMN]


def get_random_articles(n: int = 3) -> pd.Series:
    """Return ``n`` randomly sampled article texts (index holds the ids)."""
    return articles.sample(n)


def get_random_article_with_id() -> Tuple[int, str]:
    """Return ``(article_id, article_text)`` for a single random article.

    BUG FIX: previously returned ``article.index`` (the whole Index
    object) instead of the scalar id, contradicting the annotated
    ``Tuple[int, str]`` return type.
    """
    # get sample article as series
    article = articles.sample(1)
    # return scalar id and article text
    return (article.index[0], article.values[0])
if __name__ == '__main__':
    logger = setup_logging(Path(__file__).name)
    logger.info('Getting args')
    args = get_parsed_args()
    if not args.create:
        # Reuse a previously computed PCA/t-SNE projection from disk.
        logger.info('Getting data for plotly')
        data2plot = load_pca_tsne_data(model_name=args.model,
                                       topic=args.topic)
    else:
        # Recompute the projection from the trained recommender's
        # embedding matrix and article labels.
        logger.info('Getting model')
        model = load_recommender(model_name=args.model, topic=args.topic)
        logger.info('Getting embedding and indices')
        data = load_raw_data(topic_name=args.topic)
        embedding, labels = model.X_, model.labels.to_list()
        logger.info('Creating data for plotly')
        data2plot = create_data_for_plotly(X=embedding,
                                           labels=labels,
                                           model_name=args.model,
                                           topic=args.topic)
# NOTE(review): fragment — the opening of this add_argument call (and the
# parser construction) is outside this view, as is the rest of the
# training script after the corpus is built.
'--topic', default='refugees_migration', type=str, help='topic choice')
# get args
args = parser.parse_args()
topic = args.topic
model_name = args.model
lda = args.lda
logger.info(f'Training on {topic}')
# get data
data = load_raw_data(topic_name=topic)
if lda:
    # process all documents in raw data
    data_processed = preprocess_raw_data_for_lda(data)
    # get tokenized texts (whitespace tokenization of preprocessed docs)
    logger.info('Getting tokens')
    text_tokenized = [
        doc.split() for doc in data_processed[CONTENT_COLUMN]
    ]
    # create corpus and dictionary: gensim Dictionary maps tokens to ids,
    # and each document becomes a bag-of-words (id, count) list
    logger.info('Creating dictionary and corpus')
    dictionary = gensim.corpora.Dictionary(text_tokenized)
    corpus = [dictionary.doc2bow(text) for text in text_tokenized]