import demoji
import parser

demoji.download_codes()


def test_is_valid_dollar_sign_match():
    a = "$GME"
    b = parser.is_dollar_sign_match(a)
    assert b == True


def test_not_valid_dollar_sign_match():
    a = "GME"
    b = parser.is_dollar_sign_match(a)
    assert b == False


def test_not_valid_dollar_sign_match_with_call_abbreviation():
    a = "$65C"
    b = parser.is_dollar_sign_match(a)
    assert b == False


def test_not_valid_dollar_sign_match_with_put_abbreviation():
    a = "$50P"
    b = parser.is_dollar_sign_match(a)
    assert b == False


def test_preprocess_and_split_text_with_emojis():
    ...
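# --- Illustrative sketch (not part of the original test file) --------------
# The parser module under test is not included in this snippet. A minimal
# is_dollar_sign_match() that is consistent with the assertions above might
# look like the following; the real implementation may differ.
import re as _re

_CASHTAG_PATTERN = _re.compile(r"^\$[A-Za-z]+$")


def is_dollar_sign_match(text):
    # "$GME" matches; "GME", "$65C" and "$50P" do not, because the pattern
    # requires a "$" followed by letters only (no digits, no C/P suffix).
    return bool(_CASHTAG_PATTERN.match(text))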
def get_tweets(subject):
    logger.info('-- Start retrieving tweets')
    with open('credentials.json') as json_file:
        data = json.load(json_file)
    bearer_token = data['bearer_token']
    token = {'access_token': bearer_token, 'token_type': 'bearer'}
    auth = OAuth2(token=token)

    if '#' in subject:
        subject = subject.replace('#', '%23')

    url = f'https://api.twitter.com/2/tweets/search/recent?query={subject}+lang:fr+-is:retweet&max_results=100'
    r = req.get(url, auth=auth)
    txt = json.loads(r.text)
    data = txt["data"]

    for _ in range(10):
        if "next_token" not in txt["meta"]:
            break
        next_token = txt["meta"]["next_token"]
        url = f'https://api.twitter.com/2/tweets/search/recent?query={subject}+lang:fr+-is:retweet&max_results=100' \
              f'&next_token={next_token}'
        r = req.get(url, auth=auth)
        txt = json.loads(r.text)
        data += txt["data"]

    dataset = []
    textt = []
    demoji.download_codes()
    for tweet in data:
        textt.append(tweet["text"])
        txt = nlp_pipeline(tweet["text"])
        dataset.append(txt)

    # word cloud
    stop_words = set(STOPWORDS)
    with open('stop_words_french.json', encoding='utf-8') as json_file:
        stop_words_french = json.load(json_file)
    stop_words.update(stop_words_french)
    stop_words.add(subject.replace('%23', ''))
    stop_words.add(subject.replace('%23', '').lower())

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stop_words,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1  # chosen at random by flipping a coin; it was heads
    ).generate(str(dataset))
    plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    plt.imshow(wordcloud)
    plt.show()

    logger.info(f'--- Get {len(dataset)} tweets')
    return dataset
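# nlp_pipeline() is defined elsewhere in the original project and is not part
# of this snippet. Given that the demoji codes are downloaded just before the
# loop above, a plausible minimal version (an assumption, for illustration
# only) would strip emojis and normalise the text:
def nlp_pipeline(text):
    text = demoji.replace(text, '')          # remove emojis
    return ' '.join(text.split()).lower()    # collapse whitespace, lowercase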
def __init__(self):
    demoji.download_codes()
def test_download():
    assert demoji.download_codes() is None
    assert type(demoji._EMOJI_PAT) == type(re.compile(""))  # noqa
    assert isinstance(demoji._CODE_TO_DESC, dict)
    assert os.path.isfile(demoji.CACHEPATH)
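# Once download_codes() has populated the cache, typical calls against the
# public demoji API (not part of this test, shown only as assumed usage) are:
#   demoji.findall("Python is fun 🐍")   # -> {"🐍": "snake"}
#   demoji.replace("Python is fun 🐍")   # -> "Python is fun "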
def __init__(self):
    """
    Constructor initializing the attributes
    """
    demoji.download_codes()
import ast
import csv
from csv import DictWriter
import re
import pandas as pd
import demoji
import os

demoji.download_codes()  # for downloading the demoji cache

df = pd.read_csv("Data/Found_Data/arifhosentamim.csv")
result = ""
read = ""

f = open("insomnia_no_usa_output.csv", "w", newline='', encoding='utf-8')
fieldnames = ['tweet_id', 'label', 'tweets']
writer: DictWriter = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()

files = os.listdir("Data/Found_Data/")
print(files)
for file in files:
    data = pd.read_csv("Data/Found_Data/" + file)
    tweet = []
    for index, row in data.iterrows():
        row['tweet'] = ast.literal_eval(row['tweet'])
        row['tweet'] = (row['tweet'].decode() if isinstance(
            row['tweet'], bytes) else row['tweet']).strip()
        row[2] = re.sub(r'@\S+', '', row[2])     # Remove mentions
        row[2] = re.sub(r'#\S+', '', row[2])     # Remove hashtags
        row[2] = re.sub(r'RT', '', row[2])
        row[2] = re.sub(r'http\S+', '', row[2])  # Remove urls
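        # demoji is imported and its codes are downloaded above, but the
        # emoji-handling step itself falls outside the visible part of this
        # script; the usual next cleaning call (assumed, not in the source)
        # would be something like:
        #   row[2] = demoji.replace(row[2], '')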
def main():
    source = input("Enter the Source: ")
    sources = ["amazon", "facebook", "twitter"]
    if source not in sources:
        logger.error("Please enter the source as either <amazon> or <facebook> or <twitter>")
        print("Please enter the source as either <amazon> or <facebook> or <twitter>")
        raise SystemExit(sys.exit(1))

    # phases - helper to validate the 0/1 input options
    def checkInputStatus(inputoption):
        options = [0, 1]
        if inputoption not in options:
            print("Please enter the option as either 0 or 1")
            raise SystemExit(sys.exit(1))
        return 1

    # phases - get the rerun status from the user
    external_data_flag = int(input("Want to upload the data externally?: "))
    checkInputStatus(external_data_flag)
    rerun = int(input("Enter the Re-run status 0/1: "))
    checkInputStatus(rerun)

    # phases - get the processing options
    print("\n---------------- Enter the processing options ---------------- ")
    scrape = None
    if external_data_flag != 1:
        scrape = int(input("\nDo you want to process Scraping 0/1: "))
        checkInputStatus(scrape)
    preproc = int(input("\nDo you want to process Pre processing 0/1: "))
    checkInputStatus(preproc)
    feature = int(input("\nDo you want to process Feature Extraction 0/1: "))
    checkInputStatus(feature)
    clustering = int(input("\nDo you want to process Clustering 0/1: "))
    checkInputStatus(clustering)
    visual = int(input("\nDo you want to process Visualization 0/1: "))
    checkInputStatus(visual)

    # phases - validate the processing options
    if external_data_flag != 1:
        processoption = str(scrape) + str(preproc) + str(feature) + str(clustering) + str(visual)
    else:
        processoption = str(preproc) + str(feature) + str(clustering) + str(visual)
    if processoption in ['00000']:
        print("\n Not proceeding with any processing------------ END ")
        raise SystemExit(sys.exit(1))
    if processoption in ['11111'] and rerun in [1]:
        print("\n As rerun option is 1, cannot execute all processing------------ END ")
        raise SystemExit(sys.exit(1))

    # phases - function to denote the END of processing
    def endprocess():
        logger.info("Total elapsed time: {0}".format(
            time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))))
        logger.info("End...!!!")
        raise SystemExit(sys.exit(1))

    config = config_ini()

    """ Pre-requisites """
    global keyword, data_tw_post, tw_data_pped, nlp_server, tw_data_emotions, tw_data_clustering
    driver = None
    if not source == "twitter":
        if external_data_flag != 1:
            driver = webdriver.Chrome(executable_path=config['PATHS']['CHROME_DRIVER'])
        demoji.download_codes()
    stanfordnlp_loc = config['PATHS']['SUPPORTING_FILES'] + '\\stanford-corenlp-full-2018-10-05' + "\\"
    cmd = "java -mx4g -cp " + '"*"' + " edu.stanford.nlp.pipeline.StanfordCoreNLPServer"
    nlp_server = subprocess.Popen(cmd, cwd=stanfordnlp_loc)
    spacy_nlp = spacy.load('en_core_web_sm')
    spacy_nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    if not os.path.isdir(config['PATHS']['SUPPORTING_FILES'] + '\\en_ewt_models'):
        stanfordnlp.download('en', resource_dir=config['PATHS']['SUPPORTING_FILES'])
    demoji.download_codes()
    try:
        if not any([os.path.isdir(nltk.data.find('tokenizers/punkt')),
                    os.path.isdir(nltk.data.find('corpora/stopwords'))]):
            pass
    except LookupError as e:
        nltk.download('punkt')
        nltk.download('stopwords')
    sentiment_nlp = StanfordCoreNLP('http://*****:*****')  # URL masked in the source

    # ... (intervening code is missing from the source; the fragment resumes
    # inside the Facebook login sequence) ...
    if source == "facebook":
        try:
            LogInButton = driver.find_element_by_xpath("//a[@role = 'button']")
            LogInButton.click()
            username = driver.find_element_by_id("m_login_email")
            username.clear()
            username.send_keys(int(config['FB_LOGINS']['CONTACTNO']))
            password = driver.find_element_by_id("m_login_password")
            password.clear()
            password.send_keys(config['FB_LOGINS']['PASSWORD'])
            driver.find_element_by_name("login").click()
            time.sleep(7)
            fbpostforcomments = copy.deepcopy(data_fb_post)
            fbpostforcomments = fbpostforcomments.dropna(subset=['post_url'])
            fb_comments = fbpostforcomments['post_url'].apply(lambda x: fbcomments.scrapeFbComments(x, driver))
            fb_reviews = pd.concat([r for r in fb_comments], ignore_index=True)
            fb_reviews = pd.merge(fb_reviews, data_fb_post, left_on='post', right_on='post_url', how="left")
            fb_reviews = fb_reviews.drop_duplicates(subset='commentWithAuthorname')
            logger.info("--------------------- Scraping is Completed...!!! -------------------------")
            logger.info("Exporting as csv into Output Path. Please wait...!!!")
            fb_reviews.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_input_data_" + keyword + ".csv",
                              index=False)
            if processoption in ['10000']:
                endprocess()

            """ Data Pre-Processing """
            if preproc in [1]:
                logger.info("------------- Data Pre-processing is Initiated. Please wait...!!! ---------")
                if scrape not in [1]:
                    try:
                        if external_data_flag != 1:
                            fb_reviews = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_input_data_" + keyword + ".csv")
                        else:
                            keyword = input("Enter the Keyword: ")
                            file_upload = easygui.fileopenbox()
                            fb_reviews = pd.read_csv(file_upload)
                    except Exception as e:
                        print("\n Scraping output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                fb_data_pp = preprocReviews.fbPreProcess(fb_reviews, spacy_nlp)
                fb_data_pped = preprocReviews.create_final_input(fb_data_pp, demoji)
                logger.info("---------------- Data Pre-processing is Completed...!!! -------------------")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                fb_data_pped.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_preprocessed_" + keyword + ".csv",
                                    index=False)
            if processoption[-3:] in ['000']:
                endprocess()

            """ Features Extraction: Sentiments """
            if feature in [1]:
                logger.info("-------------------------- Features Extraction ----------------------------")
                logger.info("Sentiments Extraction is in Progress. Please wait..!!!")
                if preproc not in [1]:
                    try:
                        fb_data_pped = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_preprocessed_" + keyword + ".csv")
                    except Exception as e:
                        print("\n Data processing output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                fb_data_sentiment = copy.deepcopy(fb_data_pped)
                fb_data_sentiment['sentiment_new'] = fb_data_sentiment['sentence'].apply(
                    lambda x: sentiments.extract_sentiment(x, sentiment_nlp))
                logger.info("Sentiments Extraction is Completed...!!!")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                fb_data_sentiment.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_sentiments_" + keyword + ".csv",
                                         index=False)

                """ Features Extraction: Themes """
                logger.info("Themes Extraction is in Progress. Please wait..!!!")
                fb_data_themes = copy.deepcopy(fb_data_sentiment)
                fb_data_themes = themes.tag_themes(fb_data_themes, spacy_nlp, nlp)
                logger.info("Themes Extraction is Completed...!!!")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                fb_data_themes.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_themes_" + keyword + ".csv",
                                      index=False)

                """ Features Extraction: Emotions """
                logger.info("Emotions Extraction is in Progress. Please wait..!!!")
                english_stopwords = stopwords.words('english')
                fb_data_emotions = emotions.tag_emotions(fb_data_themes, english_stopwords, nlp)
                logger.info("Emotions Extraction is Completed...!!!")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                fb_data_emotions.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_emotions_" + keyword + ".csv",
                                        index=False)
                logger.info("------------------ Features Extraction is Completed...!!! -----------------")
            if processoption[-2:] in ['00']:
                endprocess()
            nlp_server.kill()

            """ Features Extraction: Clustering """
            if clustering in [1]:
                logger.info("--------------- Clustering is in Progress. Please wait...!!! ---------------")
                themes_map_data = pd.read_csv(config['PATHS']['BASEDIR'] + "\\common_files\\theme_mapping.csv",
                                              error_bad_lines=False, encoding='ISO-8859-1')
                emotions_map_data = pd.read_csv(config['PATHS']['BASEDIR'] + "\\common_files\\emotion_mapping.csv",
                                                error_bad_lines=False, encoding='ISO-8859-1')
                if feature not in [1]:
                    try:
                        fb_data_emotions = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_emotions_" + keyword + ".csv")
                    except Exception as e:
                        print("\n Emotion extraction output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                fb_data_clustering = cluster.cluster_theme_keywords(fb_data_emotions, themes_map_data)
                fb_data_clustering = cluster.cluster_emotion_keywords(fb_data_clustering, emotions_map_data)
                logger.info("Clustering is Completed...!!!")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                fb_data_clustering.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_clustering_" + keyword + ".csv",
                                          index=False)
                logger.info("-------------------- Clustering is Completed...!!! --------------------------")
            if processoption[-1:] in ['0']:
                endprocess()

            """ Visualization """
            if visual in [1]:
                logger.info("------------------- Visualization is Initiated. Please wait...!!! -----------")
                features = ["themes_keyword", "emotion_keyword", "theme_groups", "emotion_groups"]
                feature_groups = ["theme_groups", "emotion_groups"]
                feature1 = feature_groups[0]
                feature2 = feature_groups[1]
                brand = keyword
                if clustering not in [1]:
                    try:
                        fb_data_clustering = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\fb_data_clustering_" + keyword + ".csv")
                    except Exception as e:
                        print("\n Clustering output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                viz_data = copy.deepcopy(fb_data_clustering)
                viz_data = viz_data.loc[viz_data[feature1].notnull(), :]
                viz_data.index = range(len(viz_data))
                viz.plotWordCloud(config=config, source=source, data=viz_data, brand=brand, features=features)
                # viz.frequencyBubblePlot(config=config, source=source, data=viz_data, brand=brand, features=features)
                viz.fitModelAndDraw(config=config, source=source, data=viz_data, __title__=feature1 + 'v/s' + feature2,
                                    brand=brand, feature1=feature1, feature2=feature2)
                viz.contigencyTable(config=config, source=source, data=viz_data, brand=brand,
                                    feature1=feature1, feature2=feature2)
                viz.frequencyDistribution(config=config, source=source, features=features, data=viz_data, brand=brand)
                logger.info("-------------------- Visualization Completed...!!! --------------------------")
            logger.info("Total elapsed time: {0}".format(
                time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))))
            logger.info("End...!!!")
        except Exception as e:
            nlp_server.kill()
            logger.error("Exception: {}".format(e))

    elif source == "amazon":
        try:
            """ Data Scrapping """
            if external_data_flag != 1:
                keyword = input("Enter the Keyword: ")
            if scrape in [1]:
                logger.info("---------------- Scrapping is Initiated. Please wait...!!! ----------------")
                rev_lnk_scrp = int(input("\nDo you want to upload the review links externally 0/1: "))
                checkInputStatus(rev_lnk_scrp)
                if rev_lnk_scrp in [1]:
                    review_link_df = pd.read_csv(config['PATHS']['BASEDIR'] + "\\common_files\\review_link.csv",
                                                 error_bad_lines=False)
                else:
                    logger.info("---------------- Review link extraction is Initiated. Please wait...!!! ----------------")
                    review_link_df1 = amzreviewlinkscrapper.getreview_link(keyword)
                    logger.info("---------------- Review link extraction is completed. ----------------")
                    review_link_df = review_link_df1.rename(columns={"review_links": "Review_Link_Href",
                                                                     "total_review_count": "Review_Count",
                                                                     "product_name": "Name"})
                review_link_df = review_link_df.drop_duplicates(subset='Review_Link_Href')
                review_link_df = review_link_df.dropna(subset=['Review_Link_Href'], axis=0)
                review_link_df['linkset'] = review_link_df.apply(amzscraper.create_linkset, axis=1)
                review_link_df['linkset2'] = review_link_df['linkset'].apply(lambda x: '|'.join(x))
                all_links_df = review_link_df['linkset2'].str.split("|", expand=True)
                total_number_of_pages = len(all_links_df.columns)
                logger.info("Total no. of Review-Links Scraped: {}".format(len(all_links_df)))
                review_link_df = pd.concat([review_link_df, all_links_df], axis=1)
                review_link_df = pd.melt(review_link_df,
                                         id_vars=['Name', 'Review_Link_Href', 'Review_Count', 'linkset', 'linkset2'],
                                         value_vars=list(range(0, total_number_of_pages)),
                                         value_name='Final_link')
                review_link_df = review_link_df.sort_values(by=['Review_Link_Href', 'variable'], ascending=[True, True])
                review_link_df1 = review_link_df[review_link_df['Final_link'].isna() == False]
                list_dataframe = review_link_df1['Final_link'].apply(lambda x: amzscraper.scrap_reviews(x, driver))
                reviews_df_stacked = pd.concat([r for r in list_dataframe], ignore_index=True)
                amz_reviews_data = pd.merge(reviews_df_stacked, review_link_df1, left_on='review_link',
                                            right_on='Final_link', how="left")
                amz_reviews_data = amz_reviews_data.sort_values(by=['Review_Link_Href', 'Final_link'],
                                                                ascending=[True, True])
                logger.info("--------------------- Scraping is Completed...!!! -------------------------")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                amz_reviews_data.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_input_data_" + keyword + ".csv",
                                        index=False)
            if processoption in ['10000']:
                endprocess()

            """ Data Pre-Processing """
            if preproc in [1]:
                logger.info("------------- Data Pre-processing is Initiated. Please wait...!!! ---------")
                if scrape not in [1]:
                    try:
                        if external_data_flag != 1:
                            amz_reviews_data = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_input_data_" + keyword + ".csv")
                        else:
                            keyword = input("Enter the Keyword: ")
                            file_upload = easygui.fileopenbox()
                            amz_reviews_data = pd.read_csv(file_upload)
                    except Exception as e:
                        print("\n Scraping output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                amz_data_pp = preprocReviews.amazonPreProcess(amz_reviews_data, spacy_nlp)
                amz_data_pped = preprocReviews.create_final_input(amz_data_pp, demoji)
                logger.info("---------------- Data Pre-processing is Completed...!!! -------------------")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                amz_data_pped.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_preprocessed_" + keyword + ".csv",
                                     index=False)
            if processoption[-3:] in ['000']:
                endprocess()

            """ Features Extraction: Sentiments """
            if feature in [1]:
                logger.info("-------------------------- Features Extraction ----------------------------")
                logger.info("Sentiments Extraction is in Progress. Please wait..!!!")
                if preproc not in [1]:
                    try:
                        amz_data_pped = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_preprocessed_" + keyword + ".csv")
                    except Exception as e:
                        print("\n Data processing output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                amz_data_sentiment = copy.deepcopy(amz_data_pped)
                amz_data_sentiment['sentiment_new'] = amz_data_sentiment['sentence'].apply(
                    lambda x: sentiments.extract_sentiment(x, sentiment_nlp))
                logger.info("Sentiments Extraction is Completed...!!!")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                amz_data_sentiment.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_sentiments_" + keyword + ".csv",
                                          index=False)

                """ Features Extraction: Themes """
                logger.info("Themes Extraction is in Progress. Please wait..!!!")
                amz_data_themes = copy.deepcopy(amz_data_sentiment)
                amz_data_themes = themes.tag_themes(amz_data_themes, spacy_nlp, nlp)
                logger.info("Themes Extraction is Completed...!!!")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                amz_data_themes.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_themes_" + keyword + ".csv",
                                       index=False)

                """ Features Extraction: Emotions """
                logger.info("Emotions Extraction is in Progress. Please wait..!!!")
                english_stopwords = stopwords.words('english')
                amz_data_emotions = emotions.tag_emotions(amz_data_themes, english_stopwords, nlp)
                logger.info("Emotions Extraction is Completed...!!!")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                amz_data_emotions.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_emotions_" + keyword + ".csv",
                                         index=False)
                logger.info("------------------ Features Extraction is Completed...!!! -----------------")
            if processoption[-2:] in ['00']:
                endprocess()
            nlp_server.kill()

            """ Features Extraction: Clustering """
            if clustering in [1]:
                logger.info("--------------- Clustering is in Progress. Please wait...!!! ---------------")
                themes_map_data = pd.read_csv(config['PATHS']['BASEDIR'] + "\\common_files\\theme_mapping.csv",
                                              error_bad_lines=False, encoding='ISO-8859-1')
                emotions_map_data = pd.read_csv(config['PATHS']['BASEDIR'] + "\\common_files\\emotion_mapping.csv",
                                                error_bad_lines=False, encoding='ISO-8859-1')
                if feature not in [1]:
                    try:
                        amz_data_emotions = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_emotions_" + keyword + ".csv")
                    except Exception as e:
                        print("\n Emotion extraction output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                amz_data_clustering = cluster.cluster_theme_keywords(amz_data_emotions, themes_map_data)
                amz_data_clustering = cluster.cluster_emotion_keywords(amz_data_clustering, emotions_map_data)
                logger.info("-------------------- Clustering is Completed...!!! --------------------------")
                logger.info("Exporting as csv into Output Path. Please wait...!!!")
                amz_data_clustering.to_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_clustering_" + keyword + ".csv",
                                           index=False)
            if processoption[-1:] in ['0']:
                endprocess()

            """ Visualization """
            if visual in [1]:
                logger.info("------------------- Visualization is Initiated. Please wait...!!! -----------")
                features = ["themes_keyword", "emotion_keyword", "theme_groups", "emotion_groups"]
                feature_groups = ["theme_groups", "emotion_groups"]
                feature1 = feature_groups[0]
                feature2 = feature_groups[1]
                brand = keyword
                if clustering not in [1]:
                    try:
                        amz_data_clustering = pd.read_csv(config['PATHS']['BASEDIR'] + "\\outputs\\amz_data_clustering_" + keyword + ".csv")
                    except Exception as e:
                        print("\n Clustering output file is not available at the mentioned path")
                        logger.error("Exception: {}".format(e))
                viz_data = copy.deepcopy(amz_data_clustering)
                viz_data = viz_data.loc[viz_data[feature1].notnull(), :]
                viz_data.index = range(len(viz_data))
                viz.plotWordCloud(config=config, source=source, data=viz_data, brand=brand, features=features)
                # viz.frequencyBubblePlot(config=config, source=source, data=viz_data, brand=brand, features=features)
                viz.fitModelAndDraw(config=config, source=source, data=viz_data, __title__=feature1 + 'v/s' + feature2,
                                    brand=brand, feature1=feature1, feature2=feature2)
                viz.contigencyTable(config=config, source=source, data=viz_data, brand=brand,
                                    feature1=feature1, feature2=feature2)
                viz.frequencyDistribution(config=config, source=source, features=features, data=viz_data, brand=brand)
                logger.info("-------------------- Visualization Completed...!!! --------------------------")
            logger.info("Total elapsed time: {0}".format(
                time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))))
            logger.info("End...!!!")
        except Exception as e:
            nlp_server.kill()
            logger.error("Exception: {}".format(e))
    return 1
from selenium import webdriver
import demoji  # (pip install demoji) and then demoji.download_codes()
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import os
# import nltk
from scipy.stats import entropy
import pickle
import pandas as pa
from flask import Flask

demoji.download_codes()  # required for removing emojis from text data

app = Flask(__name__)
# GOOGLE_CHROME_PATH = '/app/.apt/usr/bin/google_chrome'
# CHROMEDRIVER_PATH = '/app/.chromedriver/bin/chromedriver'

# Load the trained models using pickle
lda = pickle.load(open('lda_model', 'rb'))
dictionary = pickle.load(open('dictonary', 'rb'))
corpus = pickle.load(open('corpus', 'rb'))

## Processing Text
train_data = pa.read_csv('training_data.csv')
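# demoji is set up above but the emoji-removal step itself is not part of the
# visible snippet; a minimal helper, as it would typically be used in such a
# text-processing pipeline (illustrative sketch, not taken from the original
# file):
def remove_emojis(text):
    """Return `text` with all emojis stripped using demoji's downloaded codes."""
    return demoji.replace(text, '')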