def get_sentiment(text: str) -> Optional[NamedTuple]:
    """
    Compute sentiment scores for the supplied text.

    :param text: string containing text to analyze
    :return: a named tuple ``Sentiment`` (polarity, subjectivity) for a
        non-empty input, ``None`` otherwise
    """
    return None if is_string_empty(text) else TextBlob(text).sentiment
def get_subjectivity(text: str) -> Optional[float]:
    """
    Compute the subjectivity score of the supplied text.

    :param text: string containing text to analyze
    :return: a float in [0, 1] indicating subjectivity for a non-empty
        input, ``None`` otherwise
    """
    return None if is_string_empty(text) else TextBlob(text).subjectivity
def process(self, text: str) -> Optional[str]:
    """
    Clean a single string by running it through the cleaning helpers
    defined in text.py.

    :param text: string to process
    :return: the processed string, or None when the input or the cleaned
        result is empty
    """
    if is_string_empty(text):
        return None
    # Cleaning steps run in a fixed order; lowercasing is optional and
    # controlled by the instance configuration.
    steps = [strip_html, remove_stopwords, transform_accented_chars]
    if self.to_lowercase:
        steps.append(lowercase)
    steps.append(remove_nonalphanumeric)
    for step in steps:
        text = step(text)
    # Collapse runs of spaces, then trim leading/trailing whitespace.
    text = re.sub(" +", " ", text).strip()
    return None if is_string_empty(text) else text
def process_multiprocessing(
    self,
    texts: Iterable[str],
    file_name: str,
    total_count: int,
    batch_size: int = 100000,
):
    """
    Parallelize the process method using NUM_CORES worker processes and
    append results to a file in batches.

    :param texts: an iterator yielding strings to preprocess (must
        support ``next()``)
    :param file_name: path to the file to append preprocessed texts to
    :param total_count: total number of records to iterate over (drives
        the progress bar)
    :param batch_size: the size of a single text list to use for
        parallel preprocessing
    :return: None
    """
    with tqdm(total=total_count) as pbar:
        with open(file_name, "a") as file:
            with Pool(NUM_CORES) as pool:
                while True:
                    # Build the next batch. Catch StopIteration so that
                    # generator exhaustion ends the loop cleanly instead
                    # of propagating out of the function (the original
                    # bare next() made the empty-batch break unreachable).
                    current_batch = []
                    for _ in range(batch_size):
                        try:
                            current_batch.append(next(texts))
                        except StopIteration:
                            break
                    if not current_batch:
                        break
                    # Process texts in parallel; iterate imap lazily, no
                    # need to materialize the whole batch result first.
                    # (The original nested a second tqdm bar here with a
                    # wrong total of total_count per batch.)
                    for text in pool.imap(self.process, current_batch):
                        # Save processed texts, one per line, skipping
                        # empty results.
                        if not is_string_empty(text):
                            file.write(text + "\n")
                    # Advance by the number of records consumed in this
                    # batch. The original passed a *cumulative* count to
                    # update(), inflating the bar quadratically.
                    pbar.update(len(current_batch))
def _read_batch(data_generator, batch_size: int) -> list:
    """Pull up to batch_size rows from data_generator.

    Returns an empty list once the generator is exhausted, logging the
    exhaustion exactly once per run of the caller's loop end.
    """
    batch = []
    for _ in range(batch_size):
        try:
            batch.append(next(data_generator))
        except StopIteration:
            logging.info("generator %s exhausted, finishing", data_generator)
            break
    return batch


def run(
    data_generator,
    process_text: bool,
    batch_size: int = 100000,
):
    """
    Compute and insert sentiment scores (polarity, subjectivity) into DB
    for all existing items.

    :param data_generator: an iterable where each element is a single row
        from the DB, unpacked as (item_id, title, text)
    :param process_text: whether to preprocess and store the text before
        computing sentiment and inserting into DB
    :param batch_size: size of the batch to use for the named cursor when
        querying DB for data
    :return: None
    """
    logging.info("starting task %s", __name__)
    conn = DBConnection(
        user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
    )
    text_inserter = TextInserter(conn, TABLE_NAME_TEXTS, PRIMARY_KEY_NAME_TEXTS)
    sentiment_classifier = SentimentClassifier()
    text_preprocessor = TextPreprocessor()

    # Single loop for both modes. The original duplicated the batching
    # logic and its no-preprocessing branch called next() with no
    # StopIteration handling, so the task crashed on generator
    # exhaustion instead of finishing.
    while True:
        current_batch = _read_batch(data_generator, batch_size)
        if not current_batch:
            break
        for item_id, title, text in tqdm(current_batch):
            # Preprocess "text" field if not empty, otherwise use the
            # title (stories don't have text).
            raw_text = title if is_string_empty(text) else text
            if process_text:
                # Insert preprocessed text; sentiment below is still
                # computed on the unprocessed text.
                text_preprocessed = text_preprocessor.process(raw_text)
                text_inserter.insert_text(Text(item_id, text_preprocessed))
            sentiment = sentiment_classifier.get_sentiment(raw_text)
            text_inserter.insert_sentiment(sentiment, item_id)

    logging.info("finished task: %s", __name__)