def _classify_all(sources, dests, classifiers):
    """Classify all files in sources and place the outputs in dests.

    The work is done in rounds. Each round starts at most one
    classification task per classifier script and waits for all of them to
    finish before the next round begins.

    Args:
        sources: Paths of the subfiles to classify.
        dests: Output paths, matched pairwise with sources.
        classifiers: Paths of the classifier scripts.

    Raises:
        ValueError: If files remain but no task could be started (no
            classifiers, or fewer dests than sources).
        Exception: The first exception found on a finished task is logged
            and re-raised.
    """
    num_files = len(sources)
    num_classifiers = len(classifiers)
    # One shared iterator: every round consumes the next batch of pairs.
    # zip() pulls from classifiers first, so no pair is silently dropped
    # when the classifiers run out before the pairs do.
    src_dst_pairs = zip(sources, dests)
    loop = asyncio.get_event_loop()
    classified = 0
    LOGGER.info(
        f"Classifying {num_files} subfiles with {num_classifiers} classifiers."
    )
    try:
        while classified < num_files:
            tasks = [
                loop.create_task(
                    _async_classify_sentiment(classifier, src, dst))
                for classifier, (src, dst) in zip(classifiers, src_dst_pairs)
            ]
            if not tasks:
                raise ValueError(
                    "No classification tasks could be started; check that "
                    "classifiers is non-empty and that dests matches sources."
                )
            loop.run_until_complete(asyncio.wait(tasks))
            for task in tasks:
                error = task.exception()
                if error:
                    log_exception(f"Task {task} raised an exception", error)
                    raise error
            # Count what actually ran: the final round may be smaller than
            # the number of classifiers.
            classified += len(tasks)
            LOGGER.info(f"Subfiles classified: {classified}/{num_files}")
    finally:
        # Release the loop even when a task failed.
        loop.close()
def construct_stats_dataframe_from_predictions_csv(inpath: str,
                                                   alpha_level: float,
                                                   dfname: str,
                                                   population=math.inf):
    """Construct a pandas dataframe from a prediction csv file.

    The returned dataframe is indexed by predicted sentiment and has two
    columns: 'sentiment_prob', the share of documents in the file that was
    predicted to have that sentiment, and 'margin_of_error', the
    corresponding margin of error. The size of the margin depends on
    alpha_level (two-tailed) and on population, which is forwarded to
    calculate_margin_of_error.

    The returned dataframe receives dfname as its name. A one-line summary
    with LaTeX-style ``$p\\pm e$`` entries for the negative, neutral and
    positive sentiments is logged.
    """
    predictions = read_predictions(inpath)
    n = predictions.shape[0]
    z_val = norm.ppf(1 - alpha_level / 2)  # critical value, two-tailed test

    # Divide the counts Series directly. Going through to_frame() and then
    # reading the 'Predicted' column breaks on pandas >= 2.0, where the
    # counts column is named 'count'.
    sentiment_prob = predictions.Predicted.value_counts() / n

    # NOTE: replaces the earlier inline formula
    # ((sentiment_prob * (1 - sentiment_prob) / n)**.5) * z_val
    margin_of_error = calculate_margin_of_error(sentiment_prob, n, population,
                                                z_val)

    stats_data = {
        'sentiment_prob': sentiment_prob,
        'margin_of_error': margin_of_error
    }
    stats = pd.DataFrame(data=stats_data).sort_index(axis='index')
    stats.name = dfname

    # '\\pm' yields the same '\pm' text as before without relying on an
    # invalid escape sequence.
    log = f'{dfname:23}' + ''.join(
        f'${sentiment_prob[label]:.4f}\\pm{margin_of_error[label]:.4f}$ '
        for label in [NEGATIVE, NEUTRAL, POSITIVE])
    LOGGER.info(log)
    return stats
def _migrate_questions_from_xml_to_db(questions_xml, creation_date_start):
    """Migrate the question posts in questions_xml into the database.

    Rows are converted with _post_xml_row_to_model targeting
    PostType.QUESTION; creation_date_start is forwarded to _xml_to_database.
    """
    LOGGER.info(
        f"Migrating questions from {questions_xml} into the database ...")
    question_row_to_model = partial(_post_xml_row_to_model,
                                    target_post_type=PostType.QUESTION)
    _xml_to_database(questions_xml, question_row_to_model,
                     creation_date_start)
def concatenate_predictions(filepaths: list, outpath: str):
    """Concatenate partial prediction files into one file at outpath.

    The first file is copied as-is, so its header row becomes the header of
    the result. For every remaining file the first line is treated as a
    header row and skipped, and the rest is appended. An empty partial file
    contributes nothing.

    Args:
        filepaths: Non-empty list of paths to the partial files, in order.
        outpath: Path of the concatenated file; overwritten if it exists.
    """
    assert filepaths
    LOGGER.info(
        f"Concatenating {len(filepaths)} partial documents into {outpath}")
    first_path, *remaining_paths = filepaths
    shutil.copyfile(first_path, outpath)  # copy first file to get header row
    with open(outpath, 'a') as out_file:
        for path in remaining_paths:
            with open(path, 'r') as partial_res:
                # Skip the header row; the default keeps an empty file from
                # raising StopIteration.
                next(partial_res, None)
                out_file.writelines(partial_res)
def _handle_analyze_parser(args): inpath = os.path.abspath(args.input) if args.rows_per_file <= 0: LOGGER.error(f"At least 1 row per file is required.") sys.exit(1) elif not os.path.isfile(inpath): LOGGER.error(f"File {inpath} does not exist!") sys.exit(1) outpath = os.path.abspath(args.output) senti4sd_pool_root = os.path.abspath(args.senti4sd_pool_root) classify.classify_sentiment(args.rows_per_file, senti4sd_pool_root, inpath, outpath)
def _handle_plot_parser(args): for path in args.input: if not os.path.isfile(os.path.abspath(path)): LOGGER.error(f"File {path} does not exist.") sys.exit(1) if args.population: if not os.path.isfile(os.path.abspath(args.population)): LOGGER.error(f"File {path} does not exist.") sys.exit(1) args.population = _parse_population(args.population) stats.plot_predictions(args.input, args.alpha_level, args.output, args.width, args.fill, args.patterns, args.population)
def _try_commit_and_flush():
    """Try to commit and flush the private session.

    On any exception the error is logged and the session is rolled back.
    Return True on success, False if a rollback was needed.
    """
    try:
        _session.commit()
        _session.flush()
    except Exception as exc:
        error_name = type(exc).__name__
        LOGGER.error(
            f"Unexpected exception:\n{error_name}: {exc!s}\nRolling back")
        _session.rollback()
        succeeded = False
    else:
        succeeded = True
    return succeeded
async def _async_classify_sentiment(path_to_classifier: str, inpath: str,
                                    outpath: str):
    """Run one classifier script on inpath and move its result to outpath.

    The script is started with /bin/bash from its own directory and given
    the absolute input path and the base name of outpath. If it exits with
    a non-zero status, the content of inpath is logged and
    ClassificationError is raised.
    """
    classifier_dir, script = os.path.split(path_to_classifier)
    result_name = os.path.basename(outpath)

    # the classification script must be run from the senti4SD directory
    process = await asyncio.create_subprocess_exec('/bin/bash',
                                                   script,
                                                   os.path.abspath(inpath),
                                                   result_name,
                                                   cwd=classifier_dir)
    await process.communicate()

    if process.returncode != 0:
        with open(inpath, 'r') as f:
            contents = f.read()
        LOGGER.error(f"Failed to classify {inpath} containing: {contents}")
        raise ClassificationError(f"Classifying {inpath} failed.")

    produced = os.path.join(classifier_dir, result_name)
    shutil.move(produced, outpath)
def find_classifiers(senti4sd_pool_root):
    """Return a list of classifier script paths below senti4sd_pool_root.

    Only the immediate children of senti4sd_pool_root are examined. Every
    child directory whose name contains 'senti4sd' (case-insensitive)
    contributes the path
    <child>/ClassificationTask/classificationTask.sh. The existence of the
    script itself is not checked. The result is sorted by directory name so
    the order does not depend on the file system.

    Raises:
        ValueError: If no matching directory is found.
    """
    script_path = 'ClassificationTask/classificationTask.sh'
    classifiers = [
        os.path.join(senti4sd_pool_root, senti_root, script_path)
        for senti_root in sorted(os.listdir(senti4sd_pool_root))
        if 'senti4sd' in senti_root.lower()
        and os.path.isdir(os.path.join(senti4sd_pool_root, senti_root))
    ]
    if not classifiers:
        raise ValueError(f"No classifiers found in {senti4sd_pool_root}.")
    LOGGER.info(
        f"Found {len(classifiers)} classifiers at: {' | '.join(classifiers)}")
    return classifiers
def _post_xml_row_to_model(elem,
                           question_ids: Set[int] = None,
                           target_post_type: PostType = PostType.QUESTION):
    """Convert an xml row from the Posts.xml file to a Post model.

    The body text is sanitized before conversion. None is returned when the
    row is not a known post type, is not of target_post_type, or fails
    sanitization.

    question_ids only matters when target_post_type is PostType.ANSWER: an
    answer is then converted only if its ParentId is in question_ids.
    """
    attrs = elem.attrib

    try:
        post_type = PostType(int(attrs['PostTypeId']))
    except ValueError:
        # neither a question nor an answer
        return None

    if target_post_type != post_type:
        return None
    wants_answer = target_post_type == PostType.ANSWER
    if wants_answer and int(attrs['ParentId']) not in question_ids:
        return None

    try:
        sanitized = sanitize_post(attrs['Body'])
    except ValueError:
        LOGGER.error(f"Sanitization failed for Post with Id={attrs['Id']}")
        return None

    date = MayaDT.from_rfc3339(attrs['CreationDate']).date

    # Answers carry a parent id; questions carry a title and tags.
    if post_type == PostType.ANSWER:
        title, tags, parent_id = None, None, attrs['ParentId']
    else:
        title, tags, parent_id = attrs['Title'], attrs['Tags'], None

    return Post(id=attrs['Id'],
                creation_date=date,
                post_type_id=post_type.value,
                title=title,
                text=sanitized,
                tags=tags,
                parent_id=parent_id)
def chi2_test_independence(prediction_files: list, confidence_level: float):
    """Run a chi-square test of independence on the prediction files.

    Given a list of prediction files and a required confidence level,
    decide whether the sentiment distribution is independent of which
    prediction file it comes from.

    The table from generate_sentiment_counts_multiple_files is expected to
    end with a 'row_sum' column and a 'col_sum' row; the remaining cells are
    the observed counts. The statistic is compared against a chi-square
    distribution with (rows - 1) * (columns - 1) degrees of freedom.

    Returns:
        True if independence cannot be rejected at the given confidence
        level, i.e. the p-value exceeds 1 - confidence_level. Also True
        when the table has fewer than two rows or columns, since there is
        nothing to compare.
    """
    df = generate_sentiment_counts_multiple_files(prediction_files)

    observed = df[:-1].drop(columns='row_sum')
    row_totals = df['row_sum'][:-1]
    col_totals = df.loc['col_sum'][:-1]
    grand_total = df.loc['col_sum', 'row_sum']

    # Expected count per cell under independence:
    # row total * column total / grand total.
    expected = pd.DataFrame(np.outer(row_totals, col_totals) / grand_total)
    expected.columns = df.columns[:-1]
    expected.index = df.index[:-1]

    num_rows, num_cols = observed.shape
    # An r x c contingency table has (r - 1) * (c - 1) degrees of freedom,
    # not r * c.
    degs_of_freedom = (num_rows - 1) * (num_cols - 1)
    if degs_of_freedom <= 0:
        LOGGER.info(
            "chi2 test skipped: fewer than two rows or columns to compare")
        return True

    chi2_stats = ((observed - expected)**2 / expected).sum().sum()
    critical_value = chi2.ppf(q=confidence_level, df=degs_of_freedom)
    # The survival function is 1 - cdf, computed without cancellation error.
    p_value = chi2.sf(chi2_stats, df=degs_of_freedom)
    LOGGER.info(
        f"chi2_stats = {chi2_stats}, critical_value = {critical_value}, p_value = {p_value:.10f}"
    )
    return p_value > (1 - confidence_level)
def _xml_to_database(xml_path: str,
                     model_function: Callable[[ElementTree.Element], Base],
                     creation_date_start,
                     post_ids: Set[int] = None):
    """Parse an xml file and add its rows to the database in batches.

    Each row is converted with model_function(elem, post_ids); rows for
    which it returns None are skipped. A batch that cannot be committed as
    a whole is committed row by row instead. The running total of added
    rows is logged after every batch.

    post_ids are only applicable for answers and comments, and are ignored
    for questions. An answer or comment is only added to the database if
    its post_id/parent_id is contained within the post_ids set.
    """
    total_added = 0
    rows = _get_rows_from_xml(xml_path, creation_date_start)
    for batch in yield_batches(rows, BATCH_SIZE):
        models = []
        for elem in batch:
            model = model_function(elem, post_ids)
            if model is not None:
                models.append(model)

        batch_size = len(models)
        if batch_commit(models):
            added = batch_size
        else:
            # Fall back to one commit per model.
            added = commit_all_separately(models)

        total_added += added
        LOGGER.info(f"Added: {total_added}")
def _migrate_comments_from_xml_to_db(comments_xml, creation_date_start):
    """Migrate the comments in comments_xml into the database.

    The ids of all posts are fetched first and passed on, so that
    _comment_xml_row_to_model can skip comments whose post id is unknown.
    """
    LOGGER.info("Retrieving post ids ...")
    id_query = query_ids_by_model(Post)
    post_ids = set(EXTRACT_FIRSTS_FROM_QUERY(id_query))
    LOGGER.info(f"Found {len(post_ids)} post ids")

    LOGGER.info(
        f"Migrating comments from {comments_xml} into the database ...")
    _xml_to_database(comments_xml,
                     _comment_xml_row_to_model,
                     creation_date_start,
                     post_ids)
def _comment_xml_row_to_model(elem, post_ids: Set[int]):
    """Convert an xml row from the Comments.xml file to a Comment model.

    The comment text is sanitized before conversion. Return None if the
    row's PostId is not in post_ids, or if sanitization raises (the failure
    is logged).
    """
    attrs = elem.attrib
    post_id = int(attrs['PostId'])
    if post_id not in post_ids:
        return None

    try:
        sanitized = sanitize_comment(attrs['Text'])
    except Exception as exc:
        error_name = type(exc).__name__
        LOGGER.error(
            f"Sanitization failed for Comment with Id={attrs['Id']}\n"
            f"{error_name}\n{exc!s}")
        return None

    date = MayaDT.from_rfc3339(attrs['CreationDate']).date
    return Comment(id=attrs['Id'],
                   creation_date=date,
                   text=sanitized,
                   post_id=post_id)
def _migrate_answers_from_xml_to_db(answers_xml, creation_date_start):
    """Migrate the answer posts in answers_xml into the database.

    The ids of the question posts are fetched first and passed on, so that
    _post_xml_row_to_model can skip answers whose parent is unknown.
    """
    LOGGER.info("Retrieving question ids ...")
    id_query = query_ids_by_model(Post, PostType.QUESTION)
    question_ids = set(EXTRACT_FIRSTS_FROM_QUERY(id_query))
    LOGGER.info(f"Found {len(question_ids)} question ids")

    LOGGER.info(f"Migrating answers from {answers_xml} into the database ...")
    answer_row_to_model = partial(_post_xml_row_to_model,
                                  target_post_type=PostType.ANSWER)
    _xml_to_database(answers_xml, answer_row_to_model, creation_date_start,
                     question_ids)
def handle_parsed_args(args):
    """Dispatch the parsed command line arguments to the chosen subcommand.

    The database is initialised via _init_database first, whichever
    subcommand was selected.
    """
    driver = _init_database(args)
    subcommand = getattr(args, SUB)

    if subcommand == FILL:
        _handle_fill_parser(args)
        LOGGER.info(f"Database {driver.name} filled")
        return
    if subcommand == TEARDOWN:
        database.teardown_database(driver)
        LOGGER.info(f"Database {driver.name} torn down")
        return
    if subcommand == GENERATE_CSV:
        LOGGER.info("Generating csv file ...")
        index_filepath = _handle_generate_csv_parser(args)
        LOGGER.info(
            f"File generated at {args.outpath} and index file at {index_filepath}!"
        )
        return
    if subcommand == ANALYZE:
        _handle_analyze_parser(args)
        return
    if subcommand == PLOT:
        _handle_plot_parser(args)
        return

    # no other subcommand is expected to reach this point
    assert False
def log_exception(pre_msg, e):
    """Log pre_msg at error level, followed by the type name and text of e."""
    exception_name = type(e).__name__
    LOGGER.error(f"{pre_msg}\n{exception_name}: {e!s}")