예제 #1
0
def construct_stats_dataframe_from_predictions_csv(inpath: str,
                                                   alpha_level: float,
                                                   dfname: str,
                                                   population=math.inf):
    """Construct a dataframe of sentiment statistics from a prediction csv.

    The predictions are loaded with ``read_predictions``. For every value in
    the ``Predicted`` column the share of rows carrying that value is
    computed, together with a margin of error obtained from
    ``calculate_margin_of_error``.

    Args:
        inpath: Path handed to ``read_predictions``.
        alpha_level: Significance level. The two-tailed critical value is
            ``norm.ppf(1 - alpha_level / 2)``, so a smaller alpha gives a
            wider margin of error.
        dfname: Stored as the ``name`` attribute of the returned dataframe
            and used as the prefix of the logged summary line.
        population: Forwarded unchanged to ``calculate_margin_of_error``
            (presumably the population size for a finite-population
            correction -- confirm against that helper).

    Returns:
        A dataframe indexed by predicted sentiment (sorted by index) with the
        columns ``sentiment_prob`` and ``margin_of_error``.

    Raises:
        KeyError: If NEGATIVE, NEUTRAL or POSITIVE does not occur among the
            predictions (raised while building the log line).
    """
    predictions = read_predictions(inpath)
    n = predictions.shape[0]
    t_val = norm.ppf(1 - alpha_level / 2)  # for two-tail test

    # Work on the Series returned by value_counts directly. Going through
    # ``.to_frame().Predicted`` breaks on pandas >= 2.0, where the resulting
    # column is called "count" instead of carrying the source column's name.
    sentiment_prob = predictions.Predicted.value_counts() / n
    margin_of_error = calculate_margin_of_error(sentiment_prob, n, population,
                                                t_val)
    # NOTE: an earlier revision computed the margin of error inline as
    # ((sentiment_prob * (1 - sentiment_prob) / n)**.5) * t_val

    stats_data = {
        'sentiment_prob': sentiment_prob,
        'margin_of_error': margin_of_error
    }
    stats = pd.DataFrame(data=stats_data).sort_index(axis='index')
    stats.name = dfname

    # One LaTeX-style "$p\pm e$" cell per sentiment. The backslash is escaped
    # explicitly because "\p" is an invalid escape sequence in a normal
    # string literal.
    cells = [
        f'${sentiment_prob[label]:.4f}\\pm{margin_of_error[label]:.4f}$   '
        for label in (NEGATIVE, NEUTRAL, POSITIVE)
    ]
    LOGGER.info(f'{dfname:23}' + ''.join(cells))
    return stats
def _classify_all(sources, dests, classifiers):
    """Classify all files in sources and place outputs in dests.

    The work is done in rounds. In each round every classifier script in
    ``classifiers`` is paired with the next unprocessed (source, destination)
    pair, and the resulting ``_async_classify_sentiment`` tasks are run to
    completion on the asyncio event loop before the next round starts.

    Args:
        sources: Sized sequence of input files.
        dests: Output locations, paired positionally with ``sources``.
        classifiers: Sized sequence of classifier scripts; its length is the
            number of files processed concurrently per round.

    Raises:
        ValueError: If files remain but no task could be scheduled (no
            classifiers given, or fewer destinations than sources).
        Exception: The first exception raised by any classification task is
            logged and re-raised.
    """
    # A single shared iterator, so each round continues where the previous
    # one stopped.
    src_dst_pairs = zip(sources, dests)
    num_classifiers = len(classifiers)

    loop = asyncio.get_event_loop()
    classified = 0
    num_files = len(sources)
    LOGGER.info(
        f"Classifying {num_files} subfiles with {num_classifiers} classifiers."
    )
    try:
        while classified < num_files:
            # ``classifiers`` must stay the first zip argument: zip stops as
            # soon as it is exhausted, without consuming (and losing) an
            # extra pair from the shared iterator.
            tasks = [
                loop.create_task(
                    _async_classify_sentiment(classifier, src, dst))
                for classifier, (src, dst) in zip(classifiers, src_dst_pairs)
            ]
            if not tasks:
                # Nothing could be scheduled although files remain; fail with
                # a clear message instead of an opaque asyncio.wait error.
                raise ValueError(
                    f"Unable to schedule classification: {classified} of "
                    f"{num_files} subfiles classified, but no classifier/"
                    "destination is available for the remaining ones.")
            loop.run_until_complete(asyncio.wait(tasks))
            for task in tasks:
                error = task.exception()
                if error:
                    log_exception(f"Task {task} raised an exception", error)
                    raise error
            # Count what was actually processed; the final round may contain
            # fewer tasks than there are classifiers.
            classified += len(tasks)
            LOGGER.info(f"Subfiles classified: {classified}/{num_files}")
    finally:
        # Release the loop on failure as well as on success.
        loop.close()
예제 #3
0
def _migrate_questions_from_xml_to_db(questions_xml, creation_date_start):
    """Migrate questions from an XML file into the database.

    Delegates to ``_xml_to_database`` with ``_post_xml_row_to_model`` bound to
    ``PostType.QUESTION`` as the row converter. No post id set is passed.
    """
    LOGGER.info(
        f"Migrating questions from {questions_xml} into the database ...")
    # Row converter specialised for question posts.
    question_row_to_model = partial(_post_xml_row_to_model,
                                    target_post_type=PostType.QUESTION)
    _xml_to_database(questions_xml, question_row_to_model,
                     creation_date_start)
예제 #4
0
def concatenate_predictions(filepaths: list, outpath: str):
    """Concatenate partial prediction files into a single file at outpath.

    The first file is copied verbatim, so its header row becomes the header
    of the result. Every following file is appended with its first line (the
    header row) skipped.

    Args:
        filepaths: Non-empty list of paths to the partial files, in the order
            they should appear in the output.
        outpath: Path of the combined file; overwritten if it exists.

    Raises:
        ValueError: If ``filepaths`` is empty.
    """
    # Explicit check rather than ``assert``, which is stripped under -O.
    if not filepaths:
        raise ValueError("filepaths must contain at least one file")
    LOGGER.info(
        f"Concatenating {len(filepaths)} partial documents into {outpath}")
    shutil.copyfile(filepaths[0], outpath)  # copy first file to get header row
    with open(outpath, 'a') as f:
        for filepath in filepaths[1:]:
            with open(filepath, 'r') as partial_res:
                # Skip the header row. The default keeps a completely empty
                # partial file from raising StopIteration.
                next(partial_res, None)
                f.writelines(partial_res)
예제 #5
0
def find_classifiers(senti4sd_pool_root):
    """Return the classifier scripts found directly below senti4sd_pool_root.

    Every immediate subdirectory of ``senti4sd_pool_root`` whose name
    contains "senti4sd" (case-insensitive) contributes one entry: the path to
    ``ClassificationTask/classificationTask.sh`` inside that subdirectory.
    Whether the script itself exists is not checked.

    Args:
        senti4sd_pool_root: Directory to search.

    Returns:
        A non-empty list of script paths, in ``os.listdir`` order.

    Raises:
        ValueError: If no matching subdirectory is found.
    """
    script_path = 'ClassificationTask/classificationTask.sh'
    classifiers = [
        os.path.join(senti4sd_pool_root, senti_root, script_path)
        for senti_root in os.listdir(senti4sd_pool_root)
        if 'senti4sd' in senti_root.lower()
        and os.path.isdir(os.path.join(senti4sd_pool_root, senti_root))
    ]
    if not classifiers:
        raise ValueError(f"No classifiers found in {senti4sd_pool_root}.")
    LOGGER.info(
        f"Found {len(classifiers)} classifiers at: {' | '.join(classifiers)}")
    return classifiers
예제 #6
0
def chi2_test_independence(prediction_files: list, confidence_level: float):
    """Run a chi-square test of independence between file and sentiment.

    Given a list of prediction files and a required confidence level, return
    whether the sentiment distribution is independent of which prediction
    file it comes from.

    The contingency table comes from
    ``generate_sentiment_counts_multiple_files``. This function relies on
    that table having a ``row_sum`` column as its last column and a
    ``col_sum`` row as its last row; both are stripped to obtain the observed
    counts.

    Args:
        prediction_files: Prediction files to compare.
        confidence_level: Required confidence level, e.g. 0.95.

    Returns:
        True if the sentiment probability is independent of source, i.e. the
        p-value exceeds ``1 - confidence_level``.
    """
    df = generate_sentiment_counts_multiple_files(prediction_files)
    observed = df[:-1].drop(columns='row_sum')
    # Expected count per cell under independence:
    # row total * column total / grand total.
    expected = np.outer(df['row_sum'][:-1],
                        df.loc['col_sum'][:-1]) / df.loc['col_sum']['row_sum']
    expected = pd.DataFrame(expected)
    expected.columns = df.columns[:-1]
    expected.index = df.index[:-1]
    chi2_stats = ((observed - expected)**2 / expected).sum().sum()
    # An r x c contingency table has (r - 1) * (c - 1) degrees of freedom,
    # not r * c: the marginal totals are fixed.
    num_rows, num_cols = observed.shape
    degs_of_freedom = (num_rows - 1) * (num_cols - 1)
    critical_value = chi2.ppf(q=confidence_level, df=degs_of_freedom)
    # The survival function equals 1 - cdf but keeps precision for very
    # small p-values.
    p_value = chi2.sf(x=chi2_stats, df=degs_of_freedom)
    LOGGER.info(
        f"chi2_stats = {chi2_stats}, critical_value = {critical_value}, p_value = {p_value:.10f}"
    )
    return p_value > (1 - confidence_level)
예제 #7
0
def _xml_to_database(xml_path: str,
                     model_function: Callable[[ElementTree.Element], Base],
                     creation_date_start,
                     post_ids: Set[int] = None):
    """Parse an xml file and add its rows to the database in batches.

    Rows come from ``_get_rows_from_xml`` and are processed in batches of
    ``BATCH_SIZE``. Each row is converted with
    ``model_function(row, post_ids)``; rows converted to ``None`` are
    dropped. A batch is committed as a whole when possible and otherwise row
    by row. The running total is logged after every batch.

    post_ids are only applicable for answers and comments, and are ignored
    for questions. An answer or comment is only added to the database if its
    post_id/parent_id is contained within the post_ids set.
    """
    total_added = 0
    xml_rows = _get_rows_from_xml(xml_path, creation_date_start)
    for row_batch in yield_batches(xml_rows, BATCH_SIZE):
        # Convert every row, keeping only those that produced a model.
        models = []
        for element in row_batch:
            model = model_function(element, post_ids)
            if model is not None:
                models.append(model)

        added = len(models)
        batch_succeeded = batch_commit(models)
        if not batch_succeeded:
            # Fall back to committing one by one; the helper reports how many
            # went through.
            added = commit_all_separately(models)

        total_added += added
        LOGGER.info(f"Added: {total_added}")
예제 #8
0
def _migrate_comments_from_xml_to_db(comments_xml, creation_date_start):
    """Migrate comments from an XML file into the database.

    The ids obtained from ``query_ids_by_model(Post)`` are collected into a
    set and handed to ``_xml_to_database`` as ``post_ids``, together with
    ``_comment_xml_row_to_model`` as the row converter.
    """
    LOGGER.info("Retrieving post ids ...")
    id_query = query_ids_by_model(Post)
    known_post_ids = set(EXTRACT_FIRSTS_FROM_QUERY(id_query))
    LOGGER.info(f"Found {len(known_post_ids)} post ids")
    LOGGER.info(
        f"Migrating comments from {comments_xml} into the database ...")
    _xml_to_database(comments_xml,
                     _comment_xml_row_to_model,
                     creation_date_start,
                     post_ids=known_post_ids)
예제 #9
0
def _migrate_answers_from_xml_to_db(answers_xml, creation_date_start):
    """Migrate answers from an XML file into the database.

    The ids obtained from ``query_ids_by_model(Post, PostType.QUESTION)`` are
    collected into a set and handed to ``_xml_to_database`` as ``post_ids``.
    The row converter is ``_post_xml_row_to_model`` bound to
    ``PostType.ANSWER``.
    """
    LOGGER.info("Retrieving question ids ...")
    id_query = query_ids_by_model(Post, PostType.QUESTION)
    known_question_ids = set(EXTRACT_FIRSTS_FROM_QUERY(id_query))
    LOGGER.info(f"Found {len(known_question_ids)} question ids")
    LOGGER.info(f"Migrating answers from {answers_xml} into the database ...")
    # Row converter specialised for answer posts.
    answer_row_to_model = partial(_post_xml_row_to_model,
                                  target_post_type=PostType.ANSWER)
    _xml_to_database(answers_xml,
                     answer_row_to_model,
                     creation_date_start,
                     post_ids=known_question_ids)
예제 #10
0
def handle_parsed_args(args):
    """Dispatch parsed command line arguments to the matching handler.

    The database is initialised first via ``_init_database(args)``. The
    subcommand stored on ``args`` under the attribute named by ``SUB`` then
    selects the action: fill, teardown, csv generation, analysis or
    plotting.

    Args:
        args: Parsed arguments namespace. For csv generation its ``outpath``
            attribute is read for the log message.

    Raises:
        AssertionError: If the subcommand matches none of the known values.
            The parser is expected to make this impossible.
    """
    driver = _init_database(args)

    # The subcommand is fixed once parsing is done; look it up a single time.
    subcommand = getattr(args, SUB)

    if subcommand == FILL:
        _handle_fill_parser(args)
        LOGGER.info(f"Database {driver.name} filled")
    elif subcommand == TEARDOWN:
        database.teardown_database(driver)
        LOGGER.info(f"Database {driver.name} torn down")
    elif subcommand == GENERATE_CSV:
        LOGGER.info("Generating csv file ...")
        index_filepath = _handle_generate_csv_parser(args)
        LOGGER.info(
            f"File generated at {args.outpath} and index file at {index_filepath}!"
        )
    elif subcommand == ANALYZE:
        _handle_analyze_parser(args)
    elif subcommand == PLOT:
        _handle_plot_parser(args)
    else:  # impossible
        # Raised explicitly rather than ``assert False`` so the check still
        # fires when Python runs with -O.
        raise AssertionError(f"Unhandled subcommand: {subcommand!r}")