def construct_stats_dataframe_from_predictions_csv(inpath: str,
                                                    alpha_level: float,
                                                    dfname: str,
                                                    population=math.inf):
    """Construct a pandas dataframe from a prediction csv file.

    The returned dataframe contains the probability that a given document in
    the prediction file has negative, positive, or neutral sentiment, as well
    as the corresponding margins of error. The size of the margin of error
    depends on the z-value derived from alpha_level. The returned dataframe
    will receive dfname as its name.
    """
    predictions = read_predictions(inpath)
    n = predictions.shape[0]
    z_val = norm.ppf(1 - alpha_level / 2)  # for two-tail test
    sentiment_counts = predictions.Predicted.value_counts().to_frame()
    sentiment_prob = sentiment_counts.Predicted / n
    margin_of_error = calculate_margin_of_error(sentiment_prob, n, population,
                                                z_val)
    # previous margin of error calculation:
    # ((sentiment_prob * (1 - sentiment_prob) / n)**.5) * z_val
    stats_data = {
        'sentiment_prob': sentiment_prob,
        'margin_of_error': margin_of_error
    }
    stats = pd.DataFrame(data=stats_data).sort_index(axis='index')
    stats.name = dfname
    log = f'{dfname:23}'
    for i in [NEGATIVE, NEUTRAL, POSITIVE]:
        log += (rf'${stats_data["sentiment_prob"][i]:.4f}'
                rf'\pm{stats_data["margin_of_error"][i]:.4f}$ ')
    LOGGER.info(log)
    return stats
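# Illustrative sketch only: calculate_margin_of_error is defined elsewhere in
# the project. Assuming it extends the commented-out normal-approximation
# formula above with a finite population correction (which the population
# parameter suggests), it could look roughly like this.
def _example_margin_of_error(prob, n, population, z_val):
    """Hypothetical margin of error for a proportion at sample size n.

    Applies the finite population correction sqrt((N - n) / (N - 1)) when the
    population N is finite; with population=math.inf the correction is 1 and
    the formula reduces to z * sqrt(p * (1 - p) / n).
    """
    standard_error = (prob * (1 - prob) / n)**.5
    if population != math.inf:
        standard_error *= ((population - n) / (population - 1))**.5
    return z_val * standard_error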
def _classify_all(sources, dests, classifiers):
    """Classify all files in sources and place outputs in destinations using
    the classifier scripts in classifiers.
    """
    src_dst_pairs = zip(sources, dests)  # iterator, consumed batch by batch
    num_classifiers = len(classifiers)
    loop = asyncio.get_event_loop()
    classified = 0
    num_files = len(sources)
    LOGGER.info(
        f"Classifying {num_files} subfiles with {num_classifiers} classifiers."
    )
    while classified < num_files:
        # zipping with the classifier list consumes at most num_classifiers
        # (src, dst) pairs from the iterator on each pass
        tasks = [
            loop.create_task(_async_classify_sentiment(classifier, src, dst))
            for classifier, (src, dst) in zip(classifiers, src_dst_pairs)
        ]
        loop.run_until_complete(asyncio.wait(tasks))
        for task in tasks:
            if task.exception():
                log_exception(f"Task {task} raised an exception",
                              task.exception())
                raise task.exception()
        classified += len(tasks)  # the last batch may be smaller
        LOGGER.info(f"Subfiles classified: {classified}/{num_files}")
    loop.close()
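# Illustrative sketch only: _async_classify_sentiment is defined elsewhere in
# the project. Assuming each classifier entry is the path to a Senti4SD
# classificationTask.sh script found by find_classifiers, the coroutine
# plausibly shells out to that script roughly as follows; the exact
# command-line arguments and working directory are assumptions here.
async def _example_async_classify(classifier, src, dst):
    proc = await asyncio.create_subprocess_exec(
        'sh', classifier, src, dst,  # hypothetical argument order
        cwd=os.path.dirname(classifier),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE)
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(
            f"{classifier} failed on {src}: {stderr.decode(errors='replace')}")
    return dst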
def _migrate_questions_from_xml_to_db(questions_xml, creation_date_start):
    LOGGER.info(
        f"Migrating questions from {questions_xml} into the database ...")
    _xml_to_database(
        questions_xml,
        partial(_post_xml_row_to_model, target_post_type=PostType.QUESTION),
        creation_date_start)
def concatenate_predictions(filepaths: list, outpath: str):
    """Concatenate the partial prediction csv files in filepaths into a single
    csv file at outpath, keeping only the first file's header row.
    """
    assert filepaths
    LOGGER.info(
        f"Concatenating {len(filepaths)} partial documents into {outpath}")
    shutil.copyfile(filepaths[0], outpath)  # copy first file to get header row
    with open(outpath, 'a') as f:
        for i in range(1, len(filepaths)):
            with open(filepaths[i], 'r') as partial_res:
                next(partial_res)  # skip header row
                f.writelines(partial_res)
def find_classifiers(senti4sd_pool_root):
    """Return a list of classifier scripts found in the directory tree
    starting from senti4sd_pool_root.
    """
    script_path = 'ClassificationTask/classificationTask.sh'
    classifiers = [
        os.path.join(senti4sd_pool_root, senti_root, script_path)
        for senti_root in os.listdir(senti4sd_pool_root)
        if 'senti4sd' in senti_root.lower()
        and os.path.isdir(os.path.join(senti4sd_pool_root, senti_root))
    ]
    if not classifiers:
        raise ValueError(f"No classifiers found in {senti4sd_pool_root}.")
    LOGGER.info(
        f"Found {len(classifiers)} classifiers at: {' | '.join(classifiers)}")
    return classifiers
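# Usage sketch (hypothetical paths): find_classifiers expects one Senti4SD
# checkout per subdirectory of the pool root, each containing the
# ClassificationTask/classificationTask.sh script, e.g.
#
#   senti4sd_pool/
#       Senti4SD-0/ClassificationTask/classificationTask.sh
#       Senti4SD-1/ClassificationTask/classificationTask.sh
#
# The resulting list is what _classify_all consumes:
#
#   classifiers = find_classifiers('senti4sd_pool')
#   _classify_all(sources, dests, classifiers)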
def chi2_test_independence(prediction_files: list, confidence_level: float):
    """Given a list of prediction files and a required confidence level,
    return whether the sentiment probability is independent of which
    prediction file it comes from.

    Returns True if the sentiment probability is independent of source.
    """
    df = generate_sentiment_counts_multiple_files(prediction_files)
    observed = df[:-1].drop(columns='row_sum')
    expected = np.outer(df['row_sum'][:-1],
                        df.loc['col_sum'][:-1]) / df.loc['col_sum']['row_sum']
    expected = pd.DataFrame(expected)
    expected.columns = df.columns[:-1]
    expected.index = df.index[:-1]
    chi2_stats = ((observed - expected)**2 / expected).sum().sum()
    # degrees of freedom for a test of independence: (rows - 1) * (columns - 1)
    degs_of_freedom = (len(observed) - 1) * (len(observed.columns) - 1)
    critical_value = chi2.ppf(q=confidence_level, df=degs_of_freedom)
    p_value = 1 - chi2.cdf(x=chi2_stats, df=degs_of_freedom)
    LOGGER.info(
        f"chi2_stats = {chi2_stats}, critical_value = {critical_value}, p_value = {p_value:.10f}"
    )
    return p_value > (1 - confidence_level)
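# Optional cross-check, assuming scipy is available (scipy.stats.chi2 is
# already used above): scipy.stats.chi2_contingency computes the same
# statistic and p-value directly from the observed counts, with
# dof = (rows - 1) * (cols - 1), and can be used to sanity-check the manual
# calculation in chi2_test_independence. Note that by default it applies
# Yates' continuity correction to 2x2 tables; pass correction=False to match
# the uncorrected statistic in that case.
def _example_chi2_crosscheck(observed):
    """Return (chi2 statistic, p-value) for an observed contingency table."""
    from scipy.stats import chi2_contingency
    stat, p_value, dof, expected = chi2_contingency(observed)
    return stat, p_value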
def _xml_to_database(xml_path: str,
                     model_function: Callable[[ElementTree.Element, Set[int]],
                                              Base],
                     creation_date_start,
                     post_ids: Set[int] = None):
    """Parse an xml file and add the data to the database.

    post_ids are only applicable for answers and comments, and are ignored
    for questions. An answer or comment is only added to the database if its
    post_id/parent_id is contained within the post_ids set.
    """
    rows = _get_rows_from_xml(xml_path, creation_date_start)
    count = 0
    for batch in yield_batches(rows, BATCH_SIZE):
        model_batch = [
            e for e in (model_function(elem, post_ids) for elem in batch)
            if e is not None
        ]
        committed = len(model_batch)
        if not batch_commit(model_batch):
            committed = commit_all_separately(model_batch)
        count += committed
        LOGGER.info(f"Added: {count}")
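# Illustrative sketch only: yield_batches is imported from elsewhere in the
# project. A minimal batching generator over an arbitrary iterable, of the
# kind _xml_to_database relies on, can be written with itertools.islice.
def _example_yield_batches(iterable, batch_size):
    """Yield successive lists of up to batch_size items from iterable."""
    from itertools import islice
    iterator = iter(iterable)
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            return
        yield batch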
def _migrate_comments_from_xml_to_db(comments_xml, creation_date_start):
    LOGGER.info("Retrieving post ids ...")
    post_ids = set(EXTRACT_FIRSTS_FROM_QUERY(query_ids_by_model(Post)))
    LOGGER.info(f"Found {len(post_ids)} post ids")
    LOGGER.info(
        f"Migrating comments from {comments_xml} into the database ...")
    _xml_to_database(comments_xml, _comment_xml_row_to_model,
                     creation_date_start, post_ids)
def _migrate_answers_from_xml_to_db(answers_xml, creation_date_start):
    LOGGER.info("Retrieving question ids ...")
    question_ids = set(
        EXTRACT_FIRSTS_FROM_QUERY(query_ids_by_model(Post,
                                                     PostType.QUESTION)))
    LOGGER.info(f"Found {len(question_ids)} question ids")
    LOGGER.info(f"Migrating answers from {answers_xml} into the database ...")
    _xml_to_database(
        answers_xml,
        partial(_post_xml_row_to_model, target_post_type=PostType.ANSWER),
        creation_date_start, question_ids)
def handle_parsed_args(args):
    driver = _init_database(args)
    if getattr(args, SUB) == FILL:
        _handle_fill_parser(args)
        LOGGER.info(f"Database {driver.name} filled")
    elif getattr(args, SUB) == TEARDOWN:
        database.teardown_database(driver)
        LOGGER.info(f"Database {driver.name} torn down")
    elif getattr(args, SUB) == GENERATE_CSV:
        LOGGER.info("Generating csv file ...")
        index_filepath = _handle_generate_csv_parser(args)
        LOGGER.info(
            f"File generated at {args.outpath} and index file at {index_filepath}!"
        )
    elif getattr(args, SUB) == ANALYZE:
        _handle_analyze_parser(args)
    elif getattr(args, SUB) == PLOT:
        _handle_plot_parser(args)
    else:  # impossible
        assert False