Example #1
0
    def pad_at_timestamp(self, timestamp_threshold):
        """
        Rebuild this pad as it was at a given timestamp.

        Only the elementary operations whose timestamp is lower than or equal to the threshold are kept, and the
        operations of the pad are rebuilt from them.

        :param timestamp_threshold: timestamp up to which (inclusive) the elementary operations are kept
        :type timestamp_threshold: int
        :return: the rebuilt pad and the elementary operations treated for this pad, as returned by
            ``operation_builder.build_operations_from_elem_ops``
        :rtype: tuple
        """
        # get_elem_ops(True): the positional flag is presumably ``sorted_`` -- TODO confirm.
        elem_ops = [
            elem_op for elem_op in self.get_elem_ops(True)
            if elem_op.timestamp <= timestamp_threshold
        ]
        # The pad is rebuilt from the filtered elementary operations only; no intermediate Pad object is needed
        # because the result comes entirely from the operation builder.
        pads, _, elem_ops_treated = operation_builder.build_operations_from_elem_ops(
            {self.pad_name: elem_ops}, config.maximum_time_between_elem_ops)
        return pads[self.pad_name], elem_ops_treated[self.pad_name]
Example #2
0
    def run(self):
        """
        Poll the FROG database and publish the metrics of the pads that changed.

        While the module-level flag ``analytics_started`` is truthy, each iteration:

        1. fetches the new elementary operations (starting from the revisions stored in ``revs_mongo``),
        2. rebuilds the operations, paragraphs, classification and context of the pads that received new ops,
        3. recomputes the metrics and texts of those pads and stores them in ``answer``,
        4. sleeps ``self.update_delay`` seconds,
        5. pushes ``answer`` onto ``self.workQueue`` while holding ``self.queueLock``.

        ``answer`` is never cleared, so it accumulates the latest result of every pad seen so far.
        """
        print("Starting", self.name)
        # At first we want the pads from the beginning.
        revs_mongo = {pad_name: 0 for pad_name in self.pad_names}
        answer = dict()

        dic_author_current_operations_per_pad = dict()
        pads = dict()
        op_types = ('write', 'paste', 'delete', 'edit')
        while analytics_started:
            # Parse the elementary operations from the FROG database
            new_list_of_elem_ops_per_pad, revs_mongo = parser.get_elem_ops_per_pad_from_db(
                None, 'FROG', revs_mongo=revs_mongo, regex=self.regex)
            if len(new_list_of_elem_ops_per_pad) != 0:
                # If we have new ops
                new_list_of_elem_ops_per_pad_sorted = operation_builder.sort_elem_ops_per_pad(
                    new_list_of_elem_ops_per_pad)
                pads, dic_author_current_operations_per_pad, elem_ops_treated = \
                    operation_builder.build_operations_from_elem_ops(
                        new_list_of_elem_ops_per_pad_sorted,
                        config.maximum_time_between_elem_ops,
                        dic_author_current_operations_per_pad, pads)
                # For each pad, create the paragraphs, classify the operations and create the context
                for pad_name in elem_ops_treated:
                    pad = pads[pad_name]
                    # create the paragraphs
                    pad.create_paragraphs_from_ops(elem_ops_treated[pad_name])
                    # classify the operations of the pad
                    pad.classify_operations(length_edit=config.length_edit,
                                            length_delete=config.length_delete)
                    # find the context of the operation of the pad
                    pad.build_operation_context(config.delay_sync,
                                                config.time_to_reset_day,
                                                config.time_to_reset_break)
                # We then calculate the metrics for each pad that changed
                for pad_name in elem_ops_treated:
                    print(pad_name)
                    pad = pads[pad_name]
                    # The metrics are computed in the same order as before, in case some of them cache state.
                    answer_per_pad = dict()
                    answer_per_pad['User proportion per paragraph score'] = \
                        pad.user_participation_paragraph_score()
                    answer_per_pad['Proportion score'] = pad.prop_score()
                    answer_per_pad['Synchronous score'] = pad.sync_score()[0]
                    answer_per_pad['Alternating score'] = pad.alternating_score()
                    answer_per_pad['Break score day'] = pad.break_score('day')
                    answer_per_pad['Break score short'] = pad.break_score('short')
                    for op_type in op_types:
                        answer_per_pad['Overall {} type score'.format(op_type)] = \
                            pad.type_overall_score(op_type)
                    for op_type in op_types:
                        answer_per_pad['User {} score'.format(op_type)] = \
                            pad.user_type_score(op_type)
                    pprint(answer_per_pad)
                    # The texts are added after the pretty-print so that only the scores are dumped above.
                    answer_per_pad['text'] = pad.get_text()
                    answer_per_pad['text_colored_by_authors'] = \
                        pad.display_text_colored_by_authors()
                    answer_per_pad['text_colored_by_ops'] = \
                        pad.display_text_colored_by_ops()
                    print(answer_per_pad['text'])
                    answer[pad_name] = answer_per_pad
            time.sleep(self.update_delay)
            self.queueLock.acquire()
            try:
                if self.workQueue.full():
                    # The queue is full: merge our results into the pending item instead of adding a new one.
                    queuer = self.workQueue.get()
                    for pad_name in answer:
                        queuer[pad_name] = answer[pad_name]
                    self.workQueue.put(queuer)
                else:
                    self.workQueue.put(answer)
            finally:
                # Always release the lock, even if a queue operation raises.
                self.queueLock.release()
        print('exiting', self.name)
# Polling loop: repeatedly fetch the new elementary operations and update the pads.
# NOTE(review): `index_from`, `pads` and `dic_author_current_operations_per_pad` are read below but are not
# initialised in this fragment -- presumably they are defined earlier in the script; confirm.
revs_mongo = None
while True:
    if config.editor == 'etherpad':
        # Etherpad: read from the database at config.path_to_db, resuming from `index_from`
        new_list_of_elem_ops_per_pad, index_from = parser.get_elem_ops_per_pad_from_db(
            config.path_to_db, 'etherpad', index_from_lines=index_from)
    else:
        # Other editors: no path is given; progress is tracked through `revs_mongo`
        new_list_of_elem_ops_per_pad, revs_mongo = parser.get_elem_ops_per_pad_from_db(
            None, editor=config.editor, revs_mongo=revs_mongo, regex='^editor')

    # Only do the processing when new elementary operations were fetched
    if len(new_list_of_elem_ops_per_pad) != 0:
        # sort them by their timestamps, even though they should already be sorted
        new_list_of_elem_ops_per_pad_sorted = operation_builder.sort_elem_ops_per_pad(
            new_list_of_elem_ops_per_pad)
        # Create the operations from the elementary operations
        # (the previous `pads` and per-author state are passed back in so that they are carried across iterations)
        pads, dic_author_current_operations_per_pad, elem_ops_treated = operation_builder.build_operations_from_elem_ops(
            new_list_of_elem_ops_per_pad_sorted,
            config.maximum_time_between_elem_ops,
            dic_author_current_operations_per_pad, pads)
        # For each pad, create the paragraphs, classify the operations and create the context
        for pad_name in elem_ops_treated:
            pad = pads[pad_name]
            # create the paragraphs
            pad.create_paragraphs_from_ops(elem_ops_treated[pad_name])
            # classify the operations of the pad
            pad.classify_operations(length_edit=config.length_edit,
                                    length_delete=config.length_delete)
            # find the context of the operation of the pad
            pad.build_operation_context(config.delay_sync,
                                        config.time_to_reset_day,
                                        config.time_to_reset_break)

        # For each pad, calculate the metrics
# Walk through all the directories under `root_of_dbs` that contain the pads.
# NOTE(review): `root_of_dbs` and `list_of_elem_ops_per_pad` are not defined in this fragment -- presumably
# they are initialised earlier in the script; confirm.
for (dirpath, dirnames, filenames) in os.walk(root_of_dbs):
    for filename in filenames:
        # Substring test: any file whose name contains ".db" is processed, not only names ending with it
        if ".db" in filename:
            path_to_db = os.path.join(dirpath, filename)
            # Fetching the new operations
            list_of_elem_ops_per_main, _ = get_elem_ops_per_pad_from_db(path_to_db=path_to_db, editor='etherpadSQLite3')
            # The pad extracted from each file always has the same name, so we give it a new name based on its
            # path: the part after the `root_of_dbs` prefix, up to one character before the first "data".
            # NOTE(review): if "data" is not in the path, find() returns -1 and the slice ends at index -2.
            pad_name = path_to_db[len(root_of_dbs):path_to_db.find("data") - 1]
            # We check that there is only one pad per file as there should be (one pad per session)
            assert len(list_of_elem_ops_per_main.keys()) == 1
            # We rename it: the single pad is read under the key 'main' and stored under its path-based name
            list_of_elem_ops_per_pad[pad_name] = list_of_elem_ops_per_main['main']

# We create the operations from the list of elementary operations
pads, _, elem_ops_treated = operation_builder.build_operations_from_elem_ops(list_of_elem_ops_per_pad,
                                                                             config.maximum_time_between_elem_ops)


def find_start(pad):
    """
    Find the index of the Operation where writers start the pad.

    As visible here, the function only gathers the delays between consecutive elementary operations of the
    pad (taken in sorted order) and does not return a value.

    :param pad: the pad whose elementary operations are inspected
    :return: None
    """
    ordered_ops = pad.get_elem_ops(sorted_=True)
    # Time elapsed between each elementary operation and the one right before it
    delays = [
        ordered_ops[idx].timestamp - ordered_ops[idx - 1].timestamp
        for idx in range(1, len(ordered_ops))
    ]
Example #5
0
def run(list_of_elem_ops_per_pad,
        verbosity=0,
        texts=False,
        texts_save_location=None,
        show_visualization=False,
        generate_csv=False,
        generate_csv_summary=False,
        start_time=0,
        figs_save_location=config.figs_save_location,
        maximum_time_between_elem_ops=config.maximum_time_between_elem_ops,
        length_edit=config.length_edit,
        length_delete=config.length_delete,
        delay_sync=config.delay_sync,
        time_to_reset_day=config.time_to_reset_day,
        time_to_reset_break=config.time_to_reset_break,
        print_pad_name=True,
        print_text=False,
        print_text_colored_by_authors=False,
        print_text_colored_by_ops=False,
        print_metrics_text=True,
        pads=None,
        elem_ops_treated=None):
    """
    Build (if needed) and analyse the pads, then output CSV rows, texts and/or figures.

    Exactly one output mode is used per call: ``generate_csv`` takes precedence over ``generate_csv_summary``,
    which takes precedence over the text/visualization mode. Processing stops at the first pad that raises
    an exception; the traceback is printed.

    :param list_of_elem_ops_per_pad: elementary operations per pad; only used when ``pads`` is None
    :param verbosity: if truthy, print the texts and the final summary; if > 1, also display the operations
        and paragraphs
    :param texts: whether to generate the text description of each pad (text/visualization mode only)
    :param texts_save_location: directory where ``<pad_name>.txt`` files are written, or None to write nothing
    :param show_visualization: whether to call the plotting functions of ``visualization``
    :param generate_csv: print one CSV header and let each pad display its CSV rows
    :param generate_csv_summary: print one line of metrics per pad
    :param start_time: start time passed to ``pad.compute_metrics``; either a single value or a dict
        mapping pad names to their own start time
    :param figs_save_location: location passed to the plotting functions
    :param maximum_time_between_elem_ops: passed to the operation builder when ``pads`` is None
    :param length_edit: passed to ``pad.classify_operations``
    :param length_delete: passed to ``pad.classify_operations``
    :param delay_sync: passed to ``pad.build_operation_context``
    :param time_to_reset_day: passed to ``pad.build_operation_context``
    :param time_to_reset_break: passed to ``pad.build_operation_context``
    :param print_pad_name: forwarded to ``pad.to_print``
    :param print_text: forwarded to ``pad.to_print``
    :param print_text_colored_by_authors: forwarded to ``pad.to_print``
    :param print_text_colored_by_ops: forwarded to ``pad.to_print``
    :param print_metrics_text: forwarded to ``pad.to_print``
    :param pads: already built pads (dict of pad name to pad); when given, the operations are not rebuilt
        and the paragraphs are not recreated
    :param elem_ops_treated: elementary operations treated per pad, only used for the final summary
    :return: None
    """

    pads_is_None = pads is None

    if pads_is_None:
        # Build the operations from the ElementaryOperation
        # (i.e. for each pad, create a Pad object containing the (non-elem) ops)
        pads, _, elem_ops_treated = operation_builder.build_operations_from_elem_ops(
            list_of_elem_ops_per_pad, maximum_time_between_elem_ops)

    separator_char = '\t'  # For the csv files

    # I define this array here so that it is easier to comment out
    # what we're not interested in
    metric_names = [
        # From the beginning
        "user_participation_paragraph_score",
        "prop_score",
        "sync_score",
        "alternating_score",
        "break_score_day",
        "break_score_short",
        "type_overall_score_write",
        "type_overall_score_paste",
        "type_overall_score_delete",
        "type_overall_score_edit",
        "user_type_score_write",
        "user_type_score_paste",
        "user_type_score_delete",
        "user_type_score_edit",
        # From starting timestamp
        "window_sync_score",
        "window_break_score_day",
        "window_break_score_short",
        "window_type_overall_score_write",
        "window_type_overall_score_paste",
        "window_type_overall_score_delete",
        "window_type_overall_score_edit",
        "window_user_type_score_write",
        "window_user_type_score_paste",
        "window_user_type_score_delete",
        "window_user_type_score_edit",
        # Other measures
        # "added_chars",
        # "length", "length_all", "length_all_write", "length_all_paste",
        # "deleted_chars",
        # "paragraph_average_length",
        # "superparagraph_average_length",
        # "average_paragraphs_per_superparagraph",
    ]

    if generate_csv:
        header = separator_char.join([
            "docID", "author", "posStart", "posEnd", "timeStart", "timeEnd",
            "atomicOpCount", "type", "textAdded", "deletionLength",
            "paragraph", "paragraphHistory", "paragraphOriginal",
            "superparagraph", "coauthorNumber", "proportionPad",
            "proportionParagraph"
        ])
        print(header)
    elif generate_csv_summary:
        header = separator_char.join(["docID"] + metric_names)
        print(header)

    # Create content for each Pad
    pad_id = 0  # This counter is used to generate IDs for the pads

    for pad_name, pad in pads.items():
        try:
            # create the paragraphs
            # (only when the pads were built here; pads given by the caller are left as they are)
            if pads_is_None:
                pad.create_paragraphs_from_ops(pad.get_elem_ops(True))
            # classify the operations of the pad
            pad.classify_operations(length_edit=length_edit,
                                    length_delete=length_delete)
            # find the context of the operation of the pad
            pad.build_operation_context(delay_sync, time_to_reset_day,
                                        time_to_reset_break)
            pad_id += 1

            if generate_csv:
                # Option 1: Use the name of the pad as the pad ID
                pad.display_csv(separator_char=separator_char)

                # Option 2: Use a custom string as the pad ID
                # pad.display_csv(separator_char=separator_char,
                #                 pad_id='id{}'.format(pad_id))

            elif generate_csv_summary:
                # A dict gives one start time per pad; anything else is shared by all the pads.
                # isinstance also accepts dict subclasses (defaultdict, OrderedDict, ...).
                if isinstance(start_time, dict):
                    st_t = start_time[pad_name]
                else:
                    st_t = start_time
                pad_metrics = pad.compute_metrics(start_time=st_t)
                pad_metrics_string = separator_char.join(
                    [format(pad_metrics[metric]) for metric in metric_names])

                # Option 1: Use the name of the pad as the pad ID
                print(pad_name + separator_char + pad_metrics_string)

                # Option 2: Use a custom string as the pad ID
                # print('id{}'.format(pad_id) + separator_char + pad_metrics_string)

            else:
                # For each Pad, add the visualization
                if texts:
                    to_print = pad.to_print(
                        print_pad_name=print_pad_name,
                        print_text=print_text,
                        print_text_colored_by_authors=print_text_colored_by_authors,
                        print_text_colored_by_ops=print_text_colored_by_ops,
                        print_metrics_text=print_metrics_text)
                    if verbosity:
                        print("PRINT")
                        print(to_print)
                    if texts_save_location is not None:
                        # exist_ok avoids the check-then-create race of isdir() + makedirs()
                        os.makedirs(texts_save_location, exist_ok=True)
                        with open("{}/{}.txt".format(texts_save_location,
                                                     pad_name),
                                  "w+",
                                  encoding="utf-8") as f:
                            f.write(to_print)

                if show_visualization:
                    # plot the participation proportion per user per paragraphs
                    visualization.display_user_participation(
                        pad, figs_save_location)

                    visualization.display_user_participation_paragraphs(
                        pad, figs_save_location)

                    visualization.display_user_participation_paragraphs_with_del(
                        pad, figs_save_location)

                    # plot the proportion of synchronous writing per paragraphs
                    visualization.display_proportion_sync_in_paragraphs(
                        pad, figs_save_location)

                    visualization.display_proportion_sync_in_pad(
                        pad, figs_save_location)

                    # plot the overall type counts
                    visualization.display_overall_op_type(
                        pad, figs_save_location)

                    # plot the counts of type per users
                    visualization.display_types_per_user(
                        pad, figs_save_location)

                if verbosity > 1:
                    print("OPERATIONS")
                    pad.display_operations()

                    print("PARAGRAPHS:")
                    pad.display_paragraphs(verbose=1)

        except Exception:
            # Report the failing pad and stop processing the remaining ones.
            # KeyboardInterrupt and SystemExit are deliberately not caught so the run can be interrupted.
            print("Error at {}:".format(pad_name))
            print(traceback.format_exc())
            break

    if verbosity and elem_ops_treated is not None:
        print("{} pad(s) contain a total of {} elementary operations".format(
            len(pads),
            sum(
                len(pad_elem_ops_treated)
                for pad_elem_ops_treated in elem_ops_treated.values())))