def _classify_all(sources, dests, classifiers):
    """Classify all files in sources and place outputs in dests using the
    classifier scripts in classifiers.

    The files are processed in rounds. Each round starts at most one
    classification task per classifier script and waits until every task of
    the round has finished before the next round begins.

    sources: paths of the files to classify.
    dests: output paths, paired positionally with sources.
    classifiers: paths to the classifier scripts.

    If a task raised an exception, it is logged and re-raised after its
    round has completed. Raises ValueError if a round cannot schedule any
    task although files remain. The event loop is closed on exit, whether
    the classification succeeded or not.
    """
    src_dst_pairs = zip(sources, dests)
    num_classifiers = len(classifiers)
    num_files = len(sources)

    loop = asyncio.get_event_loop()
    classified = 0
    LOGGER.info(
        f"Classifying {num_files} subfiles with {num_classifiers} classifiers."
    )
    try:
        while classified < num_files:
            # zip draws from classifiers before src_dst_pairs, so running
            # out of classifiers never consumes (and drops) a pending pair.
            tasks = [
                loop.create_task(
                    _async_classify_sentiment(classifier, src, dst))
                for classifier, (src, dst) in zip(classifiers, src_dst_pairs)
            ]
            if not tasks:
                raise ValueError(
                    "No classification task could be scheduled although "
                    f"{num_files - classified} subfiles remain.")
            loop.run_until_complete(asyncio.wait(tasks))
            for task in tasks:
                error = task.exception()
                if error is not None:
                    log_exception(f"Task {task} raised an exception", error)
                    raise error
            # the last round may run fewer tasks than there are classifiers
            classified += len(tasks)
            LOGGER.info(f"Subfiles classified: {classified}/{num_files}")
    finally:
        loop.close()
Exemple #2
0
def construct_stats_dataframe_from_predictions_csv(inpath: str,
                                                   alpha_level: float,
                                                   dfname: str,
                                                   population=math.inf):
    """Construct a pandas dataframe from a prediction csv file.

    The returned dataframe is indexed by predicted sentiment and has two
    columns: 'sentiment_prob', the share of documents in the prediction file
    that received that sentiment, and 'margin_of_error', the value returned
    by calculate_margin_of_error for that share.

    inpath: path to the prediction csv file.
    alpha_level: significance level; the critical value handed to
        calculate_margin_of_error is norm.ppf(1 - alpha_level / 2), i.e. a
        two-tailed quantile.
    dfname: stored as the ``name`` attribute of the returned dataframe.
    population: passed through to calculate_margin_of_error.

    A one-line summary of the negative, neutral and positive values is
    logged in LaTeX math notation.
    """
    predictions = read_predictions(inpath)
    n = predictions.shape[0]
    critical_value = norm.ppf(1 - alpha_level / 2)  # for two-tail test

    # Dividing the value_counts Series directly avoids relying on the column
    # name produced by to_frame(), which differs between pandas versions.
    sentiment_prob = predictions.Predicted.value_counts() / n
    margin_of_error = calculate_margin_of_error(sentiment_prob, n, population,
                                                critical_value)

    stats_data = {
        'sentiment_prob': sentiment_prob,
        'margin_of_error': margin_of_error
    }
    stats = pd.DataFrame(data=stats_data).sort_index(axis='index')
    stats.name = dfname
    # '\\pm' emits a literal backslash for LaTeX; a bare '\p' is an invalid
    # escape sequence in a non-raw string literal.
    cells = [
        f'${sentiment_prob[i]:.4f}\\pm{margin_of_error[i]:.4f}$   '
        for i in (NEGATIVE, NEUTRAL, POSITIVE)
    ]
    LOGGER.info(f'{dfname:23}' + ''.join(cells))
    return stats
Exemple #3
0
def _migrate_questions_from_xml_to_db(questions_xml, creation_date_start):
    """Migrate the questions in questions_xml into the database.

    Rows are converted by _post_xml_row_to_model with the target post type
    fixed to PostType.QUESTION. creation_date_start is handed on unchanged
    to _xml_to_database.
    """
    LOGGER.info(
        f"Migrating questions from {questions_xml} into the database ...")
    question_row_to_model = partial(_post_xml_row_to_model,
                                    target_post_type=PostType.QUESTION)
    _xml_to_database(questions_xml, question_row_to_model,
                     creation_date_start)
Exemple #4
0
def concatenate_predictions(filepaths: list, outpath: str):
    """Concatenate partial prediction files into a single file at outpath.

    The first file is copied verbatim, header row included. For every
    remaining file the first line is treated as a header row and skipped,
    and the rest is appended. A remaining file that is completely empty
    contributes nothing.

    filepaths: paths of the partial files, in output order.
    outpath: path of the concatenated file; overwritten if it exists.

    Raises ValueError if filepaths is empty.
    """
    if not filepaths:
        raise ValueError("At least one file is required for concatenation.")
    first_path, *remaining_paths = filepaths
    LOGGER.info(
        f"Concatenating {len(filepaths)} partial documents into {outpath}")
    shutil.copyfile(first_path, outpath)  # copy first file to get header row
    with open(outpath, 'a') as out_file:
        for path in remaining_paths:
            with open(path, 'r') as partial_res:
                # skip header row; the default keeps an empty file from
                # raising StopIteration
                next(partial_res, None)
                out_file.writelines(partial_res)
Exemple #5
0
def _handle_analyze_parser(args):
    """Validate the analyze arguments and start the sentiment classification.

    Logs an error and exits with status 1 if args.rows_per_file is not
    positive or if the input file does not exist. Otherwise
    classify.classify_sentiment is called with absolute versions of the
    input, output and senti4sd pool root paths.
    """
    source_path = os.path.abspath(args.input)

    if args.rows_per_file <= 0:
        LOGGER.error("At least 1 row per file is required.")
        sys.exit(1)
    if not os.path.isfile(source_path):
        LOGGER.error(f"File {source_path} does not exist!")
        sys.exit(1)

    target_path = os.path.abspath(args.output)
    pool_root = os.path.abspath(args.senti4sd_pool_root)
    classify.classify_sentiment(args.rows_per_file, pool_root, source_path,
                                target_path)
Exemple #6
0
def _handle_plot_parser(args):
    """Validate the plot arguments and plot the predictions.

    Logs an error and exits with status 1 if any path in args.input, or
    args.population when one is given, is not an existing file. A given
    population path is replaced on args by the result of
    _parse_population before stats.plot_predictions is called.
    """
    for path in args.input:
        if not os.path.isfile(os.path.abspath(path)):
            LOGGER.error(f"File {path} does not exist.")
            sys.exit(1)
    if args.population:
        if not os.path.isfile(os.path.abspath(args.population)):
            # report the population file itself, not the last input path
            LOGGER.error(f"File {args.population} does not exist.")
            sys.exit(1)
        args.population = _parse_population(args.population)

    stats.plot_predictions(args.input, args.alpha_level, args.output,
                           args.width, args.fill, args.patterns,
                           args.population)
def _try_commit_and_flush():
    """Commit and then flush the private session.

    If either step raises, the exception is logged and the session is
    rolled back.

    Return True on success, False if a rollback was performed.
    """
    try:
        _session.commit()
        _session.flush()
    except Exception as error:
        error_name = type(error).__name__
        LOGGER.error(
            f"Unexpected exception:\n{error_name}: {str(error)}\nRolling back"
        )
        _session.rollback()
        return False
    else:
        return True
async def _async_classify_sentiment(path_to_classifier: str, inpath: str,
                                    outpath: str):
    """Run one classifier script on inpath as an asynchronous subprocess.

    The script is started through /bin/bash from its own directory and is
    given the absolute input path and the base name of outpath. On success
    the file with that base name is moved from the script's directory to
    outpath.

    Raises ClassificationError if the script exits with a non-zero return
    code; the content of inpath is logged beforehand.
    """
    classifier_dir, script = os.path.split(path_to_classifier)
    result_name = os.path.basename(outpath)
    # the classification script must be run from the senti4SD directory,
    # hence cwd is set to the directory that holds the script
    process = await asyncio.create_subprocess_exec('/bin/bash',
                                                   script,
                                                   os.path.abspath(inpath),
                                                   result_name,
                                                   cwd=classifier_dir)
    await process.communicate()
    if process.returncode == 0:
        shutil.move(os.path.join(classifier_dir, result_name), outpath)
        return

    with open(inpath, 'r') as failed_input:
        content = failed_input.read()
    LOGGER.error(f"Failed to classify {inpath} containing: {content}")
    raise ClassificationError(f"Classifying {inpath} failed.")
Exemple #9
0
def find_classifiers(senti4sd_pool_root):
    """Return a list of classifier script paths for the Senti4SD directories
    located directly inside senti4sd_pool_root.

    Every immediate subdirectory whose name contains 'senti4sd'
    (case-insensitive) contributes the path
    <subdirectory>/ClassificationTask/classificationTask.sh. Only the
    directory is checked for existence, not the script itself, and nested
    directories are not searched.

    Raises ValueError if no matching directory is found.
    """
    script_path = 'ClassificationTask/classificationTask.sh'
    classifiers = [
        os.path.join(senti4sd_pool_root, senti_root, script_path)
        for senti_root in os.listdir(senti4sd_pool_root)
        if 'senti4sd' in senti_root.lower()
        and os.path.isdir(os.path.join(senti4sd_pool_root, senti_root))
    ]
    if not classifiers:
        raise ValueError(f"No classifiers found in {senti4sd_pool_root}.")
    LOGGER.info(
        f"Found {len(classifiers)} classifiers at: {' | '.join(classifiers)}")
    return classifiers
Exemple #10
0
def _post_xml_row_to_model(elem,
                           question_ids: Set[int] = None,
                           target_post_type: PostType = PostType.QUESTION):
    """Build a Post model from one row of the Posts.xml file.

    The post body is sanitized before the model is created.

    question_ids is only consulted when target_post_type is
    PostType.ANSWER: an answer is converted only if its ParentId is
    contained in question_ids.

    Return None if the row's PostTypeId is not a valid PostType, if the
    row's type differs from target_post_type, if an answer's parent is not
    in question_ids, or if sanitization raises ValueError.
    """
    attrs = elem.attrib
    try:
        post_type = PostType(int(attrs['PostTypeId']))
    except ValueError:  # neither a question nor an answer
        return None

    # rows that are not of the requested kind are skipped
    if post_type != target_post_type:
        return None
    wants_answer = target_post_type == PostType.ANSWER
    if wants_answer and int(attrs['ParentId']) not in question_ids:
        return None

    try:
        sanitized = sanitize_post(attrs['Body'])
    except ValueError:
        LOGGER.error(
            f"Sanitization failed for Post with Id={elem.attrib['Id']}")
        return None

    date = MayaDT.from_rfc3339(attrs['CreationDate']).date
    if post_type == PostType.ANSWER:
        # answers carry a parent reference but no title or tags
        title, tags, parent_id = None, None, attrs['ParentId']
    else:
        # questions carry a title and tags but no parent
        title, tags, parent_id = attrs['Title'], attrs['Tags'], None
    return Post(id=attrs['Id'],
                creation_date=date,
                post_type_id=post_type.value,
                title=title,
                text=sanitized,
                tags=tags,
                parent_id=parent_id)
Exemple #11
0
def chi2_test_independence(prediction_files: list, confidence_level: float):
    """Given a list of prediction files and a required confidence level,
    return whether the sentiment probability is independent of which
    prediction file it comes from.

    The table returned by generate_sentiment_counts_multiple_files is read
    as a contingency table whose last row ('col_sum') and last column
    ('row_sum') hold the marginal totals. A chi-square test of independence
    with (rows - 1) * (columns - 1) degrees of freedom is run on the
    remaining cells.

    Returns True if the sentiment probability is independent of source,
    i.e. if the p-value exceeds 1 - confidence_level. With fewer than two
    rows or columns there is nothing to compare and True is returned.
    """
    df = generate_sentiment_counts_multiple_files(prediction_files)
    observed = df[:-1].drop(columns='row_sum')
    # expected count per cell = row total * column total / grand total
    expected = np.outer(df['row_sum'][:-1],
                        df.loc['col_sum'][:-1]) / df.loc['col_sum']['row_sum']
    expected = pd.DataFrame(expected)
    expected.columns = df.columns[:-1]
    expected.index = df.index[:-1]
    chi2_stats = ((observed - expected)**2 / expected).sum().sum()

    num_rows, num_cols = observed.shape
    degs_of_freedom = (num_rows - 1) * (num_cols - 1)
    if degs_of_freedom < 1:
        # the chi-square distribution is undefined for zero degrees of
        # freedom; a single row or column cannot show any dependence
        LOGGER.info(
            "Fewer than two rows or columns, independence is trivially true")
        return True

    critical_value = chi2.ppf(q=confidence_level, df=degs_of_freedom)
    # the survival function is 1 - cdf without the cancellation error
    p_value = chi2.sf(x=chi2_stats, df=degs_of_freedom)
    LOGGER.info(
        f"chi2_stats = {chi2_stats}, critical_value = {critical_value}, p_value = {p_value:.10f}"
    )
    return p_value > (1 - confidence_level)
Exemple #12
0
def _xml_to_database(xml_path: str,
                     model_function: Callable[[ElementTree.Element], Base],
                     creation_date_start,
                     post_ids: Set[int] = None):
    """Parse an xml file and add its rows to the database in batches.

    Each row obtained from _get_rows_from_xml is converted by calling
    model_function(row, post_ids); rows for which it returns None are
    dropped. Every batch is first committed as a whole and, if that fails,
    committed one model at a time. The running total of added models is
    logged after each batch.

    post_ids are only applicable for answers and comments, and are ignored
    for questions. An answer or comment is only added to the database if
    its post_id/parent_id is contained within the post_ids set.
    """
    total_added = 0
    all_rows = _get_rows_from_xml(xml_path, creation_date_start)
    for row_batch in yield_batches(all_rows, BATCH_SIZE):
        models = []
        for row in row_batch:
            model = model_function(row, post_ids)
            if model is not None:
                models.append(model)

        batch_size = len(models)
        if batch_commit(models):
            total_added += batch_size
        else:
            # fall back to separate commits and count what got through
            total_added += commit_all_separately(models)
        LOGGER.info(f"Added: {total_added}")
Exemple #13
0
def _migrate_comments_from_xml_to_db(comments_xml, creation_date_start):
    """Migrate the comments in comments_xml into the database.

    The ids produced by query_ids_by_model(Post) are collected into a set
    and handed to _xml_to_database together with
    _comment_xml_row_to_model, which uses them to filter the comments.
    """
    LOGGER.info("Retrieving post ids ...")
    id_query = query_ids_by_model(Post)
    known_post_ids = set(EXTRACT_FIRSTS_FROM_QUERY(id_query))
    LOGGER.info(f"Found {len(known_post_ids)} post ids")
    LOGGER.info(
        f"Migrating comments from {comments_xml} into the database ...")
    _xml_to_database(comments_xml,
                     _comment_xml_row_to_model,
                     creation_date_start,
                     known_post_ids)
Exemple #14
0
def _comment_xml_row_to_model(elem, post_ids: Set[int]):
    """Build a Comment model from one row of the Comments.xml file.

    The comment text is sanitized before the model is created.

    Return None if the row's PostId is not contained in post_ids, or if
    sanitization raises; in the latter case the failure is logged.
    """
    attrs = elem.attrib
    post_id = int(attrs['PostId'])
    if post_id not in post_ids:
        return None

    try:
        clean_text = sanitize_comment(attrs['Text'])
    except Exception as e:
        LOGGER.error(
            f"Sanitization failed for Comment with Id={elem.attrib['Id']}\n"
            f"{type(e).__name__}\n{str(e)}")
        return None

    created = MayaDT.from_rfc3339(attrs['CreationDate']).date
    return Comment(id=attrs['Id'],
                   creation_date=created,
                   text=clean_text,
                   post_id=post_id)
Exemple #15
0
def _migrate_answers_from_xml_to_db(answers_xml, creation_date_start):
    """Migrate the answers in answers_xml into the database.

    The ids produced by query_ids_by_model(Post, PostType.QUESTION) are
    collected into a set and handed to _xml_to_database. Rows are converted
    by _post_xml_row_to_model with the target post type fixed to
    PostType.ANSWER.
    """
    LOGGER.info("Retrieving question ids ...")
    id_query = query_ids_by_model(Post, PostType.QUESTION)
    known_question_ids = set(EXTRACT_FIRSTS_FROM_QUERY(id_query))
    LOGGER.info(f"Found {len(known_question_ids)} question ids")
    LOGGER.info(f"Migrating answers from {answers_xml} into the database ...")
    answer_row_to_model = partial(_post_xml_row_to_model,
                                  target_post_type=PostType.ANSWER)
    _xml_to_database(answers_xml, answer_row_to_model, creation_date_start,
                     known_question_ids)
Exemple #16
0
def handle_parsed_args(args):
    """Dispatch the parsed command line arguments to the matching handler.

    _init_database(args) is called first; the subcommand is then read from
    the attribute of args named by SUB and compared against FILL, TEARDOWN,
    GENERATE_CSV, ANALYZE and PLOT.

    Raises AssertionError if the subcommand matches none of them.
    """
    driver = _init_database(args)
    subcommand = getattr(args, SUB)

    if subcommand == FILL:
        _handle_fill_parser(args)
        LOGGER.info(f"Database {driver.name} filled")
    elif subcommand == TEARDOWN:
        database.teardown_database(driver)
        LOGGER.info(f"Database {driver.name} torn down")
    elif subcommand == GENERATE_CSV:
        LOGGER.info("Generating csv file ...")
        index_filepath = _handle_generate_csv_parser(args)
        LOGGER.info(
            f"File generated at {args.outpath} and index file at {index_filepath}!"
        )
    elif subcommand == ANALYZE:
        _handle_analyze_parser(args)
    elif subcommand == PLOT:
        _handle_plot_parser(args)
    else:
        # Raised explicitly so the check is not stripped when Python runs
        # with -O, unlike a bare assert statement.
        raise AssertionError(f"Unhandled subcommand: {subcommand!r}")
Exemple #17
0
def log_exception(pre_msg, e):
    """Log pre_msg at error level, followed on a new line by the type name
    and message of the exception e."""
    exception_summary = f"{type(e).__name__}: {str(e)}"
    LOGGER.error(f"{pre_msg}\n{exception_summary}")