def load_game_comments(game_comments_dir):
    """Load GameComments data into a Solr instance.

    :param game_comments_dir: Directory containing XDATA NBA GameComments
        JSON files to load into a Solr instance.
    """
    logger.info('Starting GameComments ingestion: ' + game_comments_dir)

    # Train the sentiment analyser that we'll use when processing
    # all the game comments.
    logger.info('Training sentiment analyser for comment ingestion')
    SentimentAnalyser.train()
    logger.info('Sentiment analyser training complete')

    # Gather every regular file in the target directory.
    data_files = [os.path.join(game_comments_dir, f)
                  for f in os.listdir(game_comments_dir)
                  if os.path.isfile(os.path.join(game_comments_dir, f))]

    if not data_files:
        # multiprocessing.Pool(0) raises ValueError, so bail out early.
        logger.info('No GameComments files found in: ' + game_comments_dir)
        return

    # One worker per GAME_COMMENTS_FILES_PER_THREAD files, rounded up.
    # BUG FIX: the original used '/', which is true division on Python 3
    # and yields a float — that breaks both range() and
    # multiprocessing.Pool(). Use integer floor division instead.
    fpt = GAME_COMMENTS_FILES_PER_THREAD
    total_threads = len(data_files) // fpt
    # Compensate for a remainder that doesn't fill a whole chunk.
    if len(data_files) % fpt:
        total_threads += 1

    # Split the data files into chunks to pass to each worker process.
    split_data_files = [data_files[fpt * index:fpt * index + fpt]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_comments_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Flatten the per-worker record lists into a single list.
    results = list(itertools.chain.from_iterable(results))

    # Send a single update (with commit) to Solr.
    solr_url = SOLR_URL + GAME_COMMENTS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False,
                                  encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('GameComments ingestions complete')
def parse_comment_files(comment_file):
    """Parse one '::'-delimited comment file into Solr-ready record dicts.

    The file name is expected to look like ``<game_id>_<source>...`` and
    each line like ``<commenter>::<comment>``.

    :param comment_file: Path to the comment file to parse.
    :returns: List of dicts with keys id, game_id, comment_order,
        commenter, comment, source, sentiment.
    """
    logger.debug(comment_file)

    path, file_name = os.path.split(comment_file)
    file_name_split = file_name.split('_')
    game_id = file_name_split[0]
    source = file_name_split[1]

    logger.debug('Processing comment files for game: ' + str(game_id))

    records = []
    with open(comment_file, 'r') as comment_in:
        for index, line in enumerate(comment_in):
            # BUG FIX: split on the FIRST '::' only, so comments that
            # themselves contain '::' are not truncated. Also drop the
            # trailing newline so it isn't stored/classified.
            split_line = line.rstrip('\n').split('::', 1)
            if len(split_line) < 2:
                # Malformed line (no '::' delimiter): skip instead of
                # raising IndexError; keep the original enumerate index
                # so comment_order still reflects file position.
                logger.debug('Skipping malformed comment line: ' + line)
                continue
            records.append({
                'id': str(game_id) + '_' + str(index),
                'game_id': game_id,
                'comment_order': index,
                'commenter': split_line[0],
                'comment': split_line[1],
                'source': source,
                'sentiment': SentimentAnalyser.classify(split_line[1])
            })
    return records
def load_game_comments(game_comments_dir):
    """Load GameComments data into a Solr instance.

    :param game_comments_dir: Directory containing XDATA NBA GameComments
        JSON files to load into a Solr instance.
    """
    logger.info('Starting GameComments ingestion: ' + game_comments_dir)

    # Train the sentiment analyser that we'll use when processing
    # all the game comments.
    logger.info('Training sentiment analyser for comment ingestion')
    SentimentAnalyser.train()
    logger.info('Sentiment analyser training complete')

    # Gather every regular file in the target directory.
    data_files = [os.path.join(game_comments_dir, f)
                  for f in os.listdir(game_comments_dir)
                  if os.path.isfile(os.path.join(game_comments_dir, f))]

    if not data_files:
        # multiprocessing.Pool(0) raises ValueError, so bail out early.
        logger.info('No GameComments files found in: ' + game_comments_dir)
        return

    # One worker per GAME_COMMENTS_FILES_PER_THREAD files, rounded up.
    # BUG FIX: the original used '/', which is true division on Python 3
    # and yields a float — that breaks both range() and
    # multiprocessing.Pool(). Use integer floor division instead.
    fpt = GAME_COMMENTS_FILES_PER_THREAD
    total_threads = len(data_files) // fpt
    # Compensate for a remainder that doesn't fill a whole chunk.
    if len(data_files) % fpt:
        total_threads += 1

    # Split the data files into chunks to pass to each worker process.
    split_data_files = [data_files[fpt * index:fpt * index + fpt]
                        for index in range(total_threads)]

    # Process all the files!
    thread_pool = multiprocessing.Pool(total_threads)
    results = thread_pool.map(load_game_comments_files, split_data_files)
    thread_pool.close()
    thread_pool.join()

    # Flatten the per-worker record lists into a single list.
    results = list(itertools.chain.from_iterable(results))

    # Send a single update (with commit) to Solr.
    solr_url = SOLR_URL + GAME_COMMENTS_CORE + 'update?commit=true'
    data = etl.prepareDocsForSolr(results, unmarshall=False,
                                  encoding='latin-1')
    etl.postJsonDocToSolr(solr_url, data)

    logger.info('GameComments ingestions complete')
# Quick manual smoke test: classify a nonsense string and show the result.
import SentimentAnalyser as sa

result = sa.classify("ajshjsahf")
print(result)
def predict(query):
    """Classify *query* with the sentiment analyser and return the
    result wrapped in a JSON response."""
    return jsonify(SentimentAnalyser.returnSentiment(query))
# Insert every scraped news row from out_df into the SS_News table,
# classifying each title's sentiment and extracting a stock symbol.
# NOTE(review): the matching except/finally for this try, and the
# definitions of `count`, `out_df`, `sa`, `se` and getCurrentDateTime,
# are outside this excerpt — confirm against the full file.
try:
    mydb = mysql.connector.connect(host="localhost", user="******",
                                   passwd="", database="SentiStock")
    mycursor = mydb.cursor()
    # Parameterized insert — values are bound by the connector, not
    # string-formatted, so this is safe against SQL injection.
    sql = "INSERT INTO SS_News(ss_news_id,ss_source, ss_link, ss_time,ss_entry_time, ss_title, ss_image_link, ss_description, ss_sentiments, ss_symbol, ss_category,ss_full_description) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    for i in range(0, len(out_df)):
        # Defaults used when the row has no usable title.
        sent = 'NA'
        symbol = 'NA'
        # NOTE(review): `description` is computed (HTML-entity cleanup)
        # but the raw out_df['description'][i] is what goes into
        # `values` below — presumably unintended; verify.
        description = str(out_df['description'][i]).replace("#39", "'").strip()
        if out_df['title'][i] != None and out_df['title'][i] != ' ':
            # Only classify / extract a symbol for non-empty titles.
            sent = sa.classify(out_df['title'][i])
            symbol = se.ExtractSymbol(out_df['title'][i])
            if symbol == '':
                symbol = 'NA'
        values = (str(out_df['titleDigest'][i]), str(out_df['source'][i]),
                  str(out_df['link'][i]), str(out_df['date'][i]),
                  getCurrentDateTime(), out_df['title'][i],
                  str(out_df['imagesrc'][i]), str(out_df['description'][i]),
                  sent, symbol, str(out_df['category'][i]),
                  str(out_df['description'][i]))
        mycursor.execute(sql, values)
        # Commit per row so a mid-loop failure keeps earlier inserts.
        mydb.commit()
        count += 1
        print(count, "record inserted.")
        # ClubbSimilar(out_df['title'][i], out_df['titleDigest'][i])
ts = pd.Timestamp(year=int(temp1[0]), month=int(temp1[1]), day=int( temp1[2]), hour=int(temp2[0]), minute=int(temp2[1]), second=int(temp2[2]), tz='utc') ts = ts.to_julian_date() cts = pd.Timestamp(year=1990, month=1, day=1, hour=0, minute=0, second=0, tz='utc') cts = cts.now() - pd.Timedelta('1 day') cts = int(cts.to_julian_date()) if(ts < cts): flag = 1 else: flag = 0 elif child.tag == 'image': imagesrc = child.text if child is not None else None if(flag == 1): continue sent = sa.classify(title) if title is not None else None symbol = se.ExtractSymbol(title)if title is not None else None if symbol == '': symbol = 'NA' print(symbol) if(flag == 0): out_df = out_df.append(pd.Series( [title, description, source, link, date, imagesrc, sent, symbol, category], index=df_cols), ignore_index=True) mydb = mysql.connector.connect(host="192.168.2.89", user="******", passwd="uatmysql", database="PythonNews") mycursor = mydb.cursor() <<<<<<< HEAD
# Minimal smoke test for SentimentAnalyser.returnSentiment.
import SentimentAnalyser

query = "This feels great"
print(SentimentAnalyser.returnSentiment(query))