def click_run_button(self):
    # Read the current editor contents and run sentiment prediction on them.
    sentence_now = self.editor.toPlainText()
    print(sentence_now)
    result = predict(sentence_now, self.trainedModel, write_to_csv=False, path=None)
    self.click_iter += len(result['prediction'])
    # Append one row (sentence, prediction, sentiment score) per predicted sentence.
    # Note: the original also called QStandardItem.setColumnCount(1/2/3) on each
    # item, which sets the *child* column count and has no effect on flat rows,
    # so those calls are dropped here.
    for i in range(len(result['prediction'])):
        sentence = QStandardItem()
        sentence.setText(result['sentence'][i])
        prediction = QStandardItem()
        prediction.setText(result['prediction'][i])
        sentiment_score = QStandardItem()
        sentiment_score.setText(str(result['sentiment_score'][i].item()))
        self.model.appendRow([sentence, prediction, sentiment_score])
    self.listView.setModel(self.model)
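# Minimal setup sketch for the handler above, assuming the widget names it
# uses (editor, listView) plus a hypothetical runButton; all three come from
# the surrounding (unshown) __init__, so treat them as assumptions.
def __init_sketch__(self):
    self.model = QStandardItemModel()
    self.click_iter = 0
    self.runButton.clicked.connect(self.click_run_button)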
def test_predict():
    import sys
    sys.path.append(r'./deps/finBERT')
    from transformers import AutoModelForSequenceClassification
    from finbert.finbert import predict

    model_path = 'deps/finBERT/models/classifier_model/finbert-sentiment'
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3, cache_dir=None)

    sentimenter = CryptoSentimenter()
    sentimenter.scan('/r/cryptomarkets')
    # sentimenter.scan('/r/cryptocurrency')
    # sentimenter.scan('/r/cryptocurrencies')
    # sentimenter.scan('/r/cryptomoonshots')
    # sentimenter.scan('/r/satoshistreetbets')
    df = sentimenter.get_dataframe()

    # Testing on the first few scraped sentences.
    text = '\n'.join(list(df['text'])[:10])
    results = predict(text, model)
    results.to_csv('finbert_results.csv')
    # print('results {}'.format(results))

    print('Writing sentiment_summary.csv')
    with open('sentiment_summary.csv', 'w+') as f:
        df.to_csv(f)
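# Straightforward entry point for running the test above as a script:
if __name__ == '__main__':
    test_predict()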
def apply(input, output):
    if not isinstance(input, dict):
        raise Exception('input must be a json object.')

    text = str(input['text'])
    result = finbert.predict(text, output['classification_model'])

    # Attach a per-sentence TextBlob polarity score alongside the finBERT prediction.
    blob = TextBlob(text)
    result['textblob_prediction'] = [
        sentence.sentiment.polarity for sentence in blob.sentences
    ]
    resp = {
        "text": text,
        "ndarray": result.logit.mean().tolist(),
        "names": ["Positive", "Negative", "Neutral"]
    }

    # Store the formatted prediction in Pachyderm, keyed by an MD5 of the input text.
    md5 = hashlib.md5()
    md5.update(text.encode())
    filename = str(md5.hexdigest()) + '.json'
    with output['pach_client'].commit("raw_data", "master") as commit:
        output['pach_client'].put_file_bytes(
            commit, filename,
            json.dumps(format_prediction(resp)).encode('utf-8'))
    return resp
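# Hypothetical invocation sketch for the handler above. The 'output' dict is
# assumed to carry a loaded classifier and a Pachyderm client, mirroring the
# keys the function reads; both names are illustrative, not from the source.
resp = apply(
    {'text': 'Stocks rallied and the British pound gained.'},
    {'classification_model': model, 'pach_client': pach_client})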
def get_sentiment(stock_ticker):
    """
    Extract the last 100 headlines of a given stock from finviz.com and
    analyse their sentiment using the finBERT sentiment analyser.

    Parameters
    ----------
    stock_ticker: Stock ticker string (ex: NRZ)

    Returns
    -------
    finbert_sentiment: The mean sentiment score of the headlines listed
        on finviz.com for the given stock ticker
    """
    # finviz uses '-' where tickers contain '.' (e.g. BRK.B -> BRK-B).
    stock_ticker = re.sub(r"\.", "-", stock_ticker)
    finviz_url = 'https://finviz.com/quote.ashx?t='
    url = finviz_url + stock_ticker
    req = Request(url=url, headers={'user-agent': 'my-app'})
    response = urlopen(req)
    html = BeautifulSoup(response, features='html.parser')
    news_tables = {stock_ticker: html.find(id='news-table')}

    parsed_data = []
    for ticker, news_table in news_tables.items():
        date = None
        for row in news_table.findAll('tr'):
            title = row.a.text
            date_data = row.td.text.split(' ')
            if len(date_data) == 1:
                # Only a time is given; the date carries over from the previous row.
                time = date_data[0]
            else:
                date = date_data[0]
                time = date_data[1]
            parsed_data.append([ticker, date, time, title])

    df = pd.DataFrame(parsed_data, columns=['stock_ticker', 'date', 'time', 'title'])

    # Load the classifier once (the original re-loaded it inside a lambda on
    # every call) and score all headlines in a single predict() pass.
    model = AutoModelForSequenceClassification.from_pretrained("pytorch_model")
    headlines = '. '.join(df['title'].tolist())
    finbert_sentiment = predict(headlines, model)['sentiment_score'].mean()
    return finbert_sentiment
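# Example usage sketch (assumes network access to finviz.com and a local
# "pytorch_model" checkpoint as referenced above):
score = get_sentiment('NRZ')
print('NRZ headline sentiment: {:.3f}'.format(score))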
def predict_batch(N, data_path="CC_data/", save_path="output/"):
    # Relies on a module-level `args.model_path` and a `predict` variant that
    # takes (text, index, model, ...) and returns a list of row dicts.
    model = BertForSequenceClassification.from_pretrained(args.model_path, num_labels=3, cache_dir=None)

    sentence_pred_df = []
    start_main = time.time()
    data = pickle.load(open(data_path + "BERTnews_all.p", "rb"))
    data = data.reset_index(drop=True)

    # for i in range(len(data)):
    for i in range(N):
        pred = predict(data.loc[i]['text'], data.loc[i]['index'], model, write_to_csv=False)
        sentence_pred_df.extend(pred)

    sentence_pred_df = pd.DataFrame.from_dict(sentence_pred_df)
    sentence_pred_df.to_csv(save_path + "BERTnews_preds.csv")

    end_main = time.time()
    print("TIME for batch_id: {}".format(round(end_main - start_main, 2)))
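# Example call for the batch runner above: score the first 100 news items
# (assumes `args.model_path` is set and CC_data/BERTnews_all.p exists).
predict_batch(100)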
def item_sentiment_score(item_txt):
    if item_txt == "missing items":
        return item_txt

    item_txt = item_txt.replace('\n', ' ')
    model = BertForSequenceClassification.from_pretrained(
        'models/classifier_model/finbert-sentiment',
        num_labels=3,
        cache_dir=None)
    sentiment_df = predict(item_txt, model, write_to_csv=False)
    full_item_txt = " ".join(sentiment_df['sentence'])
    try:
        # Weight each sentence by its TextRank score (summa.summarize with
        # scores=True), then return the weighted mean sentiment.
        w_sentiment = pd.DataFrame(summarize(full_item_txt, ratio=1.0, scores=True),
                                   columns=['sentence', 'w'])
        sentiment_df = sentiment_df.merge(w_sentiment, how='inner', on='sentence')
        if len(sentiment_df) > 1:
            sentiment_df['w'] = sentiment_df['w'] / np.sum(sentiment_df['w'])
            return np.sum(sentiment_df['w'] * sentiment_df['sentiment_score'])
        else:
            # Nothing to join on.
            return "join return empty except title row"
    except Exception as e:
        print(e)
        if len(full_item_txt) > 0:
            return "summa won't rank"
        else:
            return "full_text is empty"
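# Sketch of scoring one 10-K section with the weighted scheme above;
# `item1a_text` is a hypothetical variable holding the section's raw text.
score = item_sentiment_score(item1a_text)
print(score)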
import argparse
import datetime
import os
from pathlib import Path

parser = argparse.ArgumentParser(description='Sentiment analyzer')
parser.add_argument('-a', action="store_true", default=False)
parser.add_argument('--text_path', type=str, help='Path to the text file.')
parser.add_argument('--output_dir', type=str, help='Where to write the results')
parser.add_argument('--model_path', type=str, help='Path to classifier model')
args = parser.parse_args()

if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

with open(args.text_path, 'r') as f:
    text = f.read()

model = BertForSequenceClassification.from_pretrained(args.model_path, num_labels=3, cache_dir=None)

# now = datetime.datetime.now().strftime("predictions_%B-%d-%Y-%I:%M.csv")
output = Path(args.text_path).stem + '_predictions.csv'
predict(text, model, write_to_csv=True,
        path=os.path.join(args.output_dir, output))
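# Example invocation of the script above (the script filename and the paths
# are illustrative):
#   python predict.py --text_path data/news.txt --output_dir output \
#       --model_path models/classifier_model/finbert-sentiment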
def score():
    text = request.get_json()['text']
    return predict(text, model).to_json(orient='records')
def score():
    text = request.get_json()['text']
    result_df = predict(text, model)
    print(result_df)
    return result_df.to_json(orient='records')
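# Client-side sketch for the Flask-style score() handlers above. The route
# name and port are assumptions (the decorators are not shown in the source).
import requests
resp = requests.post('http://localhost:5000/score',
                     json={'text': 'Shares fell sharply after the earnings miss.'})
print(resp.json())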
"Remove numbers from the text" return text if not os.path.exists(outputpath): os.mkdir(outputpath) model = BertForSequenceClassification.from_pretrained(model_path,num_labels=3,cache_dir=None) #now = datetime.datetime.now().strftime("predictions_%B-%d-%Y-%I:%M.csv") sections = ['item1a', 'item7'] for file in files: df = pd.read_csv(inputpath+file) for i in range(len(df)): row = df.iloc[i] filename = row['Ticker'] + '_' + str(int(row["Year"]) +2000) + ".json" filepath = os.path.join(outputpath, filename) if os.path.exists(filepath): continue out = {} for section in sections: text = row[section] if type(text) == str: text = preprocessText(text) score = predict(text,model) out[section] = score else: break if len(out.keys())==2: f = open(filepath, "w+") f.write(json.dumps(out)) f.close()
print('Getting list of model directories...')
models_list = get_models(args)
print(models_list)
make_output_directory(args.output_dir, models_list)

for model_path in models_list:
    model_dir = os.path.join(args.model_dir, model_path)
    print('Loading {} model....'.format(model_path))
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=3, cache_dir=None)
    print('{} model loaded. Processing .csv files...'.format(model_path))
    output_dir = os.path.join(args.output_dir, model_path)
    for df_filename in sentences_df_list:
        print('Processing {} using {} model...'.format(df_filename, model_path))
        df = pd.read_csv(os.path.join(args.input_dir, df_filename))
        text_inputs = df['sentence'].tolist()
        final_df = predict(text_inputs, model, write_to_csv=False)
        processed_csv_filename = '{}_{}'.format(model_path, df_filename)
        print('Finished processing {}. Saving as .csv file named {}...'.format(df_filename, processed_csv_filename))
        final_df.to_csv(os.path.join(output_dir, processed_csv_filename), index=False)

# if not os.path.exists(args.output_dir):
#     os.mkdir(args.output_dir)
# if args.text_path.endswith('.txt'):
#     with open(args.text_path, 'r') as f:
#         text = f.read()
#     text_inputs = text.strip().split('\n\n')
# elif args.text_path.endswith('.csv'):
#     df = pd.read_csv(args.text_path)
#     text_inputs = df['sentence'].tolist()
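# Assumed module-level helpers for the multi-model loop above (signatures
# inferred from usage, not confirmed by the source): get_models(args) lists
# model subdirectories under args.model_dir, and make_output_directory(root,
# names) creates one output folder per model name.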
def predict_news(x):
    # x is an (index, text) pair; relies on a module-level `model` and a
    # predict variant that takes (text, index, model, ...).
    pred = predict(x[1], x[0], model, write_to_csv=False)
    return pred
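# Sketch of mapping predict_news over (index, text) pairs, e.g. with a plain
# loop or a multiprocessing pool; `df` and its columns are illustrative.
pairs = list(df[['index', 'text']].itertuples(index=False, name=None))
preds = [predict_news(p) for p in pairs]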