Exemple #1
0
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr):
    """Pair the paragraphs of ``text_curr`` with paragraphs of earlier revisions.

    Every paragraph of ``text_curr`` is stripped and hashed.  A paragraph
    with the same hash whose ``matched`` flag is still unset is looked for
    first in ``revision_prev`` and, failing that, in the module-level
    ``paragraphs_ht`` table (defined outside this function; presumably it
    holds paragraphs of older revisions -- confirm where it is filled).

    * On a hit, the old paragraph, all of its sentences and all of their
      words are flagged as matched, and the old paragraph object is
      attached to ``revision_curr``.
    * On a miss, a new ``Paragraph`` is created and registered in
      ``revision_curr``.

    Returns a 3-tuple ``(unmatched_curr, unmatched_prev, matched_prev)``:
    the paragraphs newly created for ``revision_curr``, the paragraphs of
    ``revision_prev`` that are still unmatched afterwards, and the old
    paragraphs that were reused.
    """
    fresh_paragraphs = []
    reused_paragraphs = []

    def first_unmatched(candidates):
        # Earliest candidate whose ``matched`` flag is still falsy, else None.
        for candidate in candidates:
            if not candidate.matched:
                return candidate
        return None

    for raw_paragraph in Text.splitIntoParagraphs(text_curr):
        paragraph = raw_paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)

        # Prefer the previous revision; consult the global table only
        # when the previous revision offers no unmatched candidate.
        reused = None
        if hash_curr in revision_prev.ordered_paragraphs:
            reused = first_unmatched(revision_prev.paragraphs[hash_curr])
        if reused is None and hash_curr in paragraphs_ht:
            reused = first_unmatched(paragraphs_ht[hash_curr])

        if reused is None:
            # Nothing to reuse: register a brand-new paragraph.
            created = Paragraph()
            created.hash_value = Text.calculateHash(paragraph)
            created.value = paragraph

            revision_curr.ordered_paragraphs.append(created.hash_value)
            if created.hash_value in revision_curr.paragraphs:
                revision_curr.paragraphs[created.hash_value].append(created)
            else:
                revision_curr.paragraphs[created.hash_value] = [created]

            fresh_paragraphs.append(created)
            continue

        # Flag the reused paragraph and everything below it as matched.
        reused.matched = True
        reused_paragraphs.append(reused)
        for same_hash_sentences in reused.sentences.values():
            for old_sentence in same_hash_sentences:
                old_sentence.matched = True
                for old_word in old_sentence.words:
                    old_word.matched = True

        # Attach the reused paragraph object to the current revision.
        reused_key = reused.hash_value
        if hash_curr in revision_curr.paragraphs:
            revision_curr.paragraphs[reused_key].append(reused)
        else:
            revision_curr.paragraphs[reused_key] = [reused]
        revision_curr.ordered_paragraphs.append(reused_key)

    # Whatever is still unflagged in the previous revision needs further analysis.
    leftover_prev = [
        old_paragraph
        for old_hash in revision_prev.ordered_paragraphs
        for old_paragraph in revision_prev.paragraphs[old_hash]
        if not old_paragraph.matched
    ]

    return (fresh_paragraphs, leftover_prev, reused_paragraphs)
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr, relation):
    """Match paragraphs of ``text_curr`` by hash and update ``relation`` counters.

    Every paragraph of ``text_curr`` is stripped and hashed, then resolved
    in this order:

    1. An unmatched paragraph with the same hash in ``revision_prev``:
       the paragraph, its sentences and its words are flagged as matched,
       and ``revision_curr.wikipedia_id`` is appended to each word's
       ``used`` list.
    2. Otherwise an unmatched paragraph with the same hash in the
       module-level ``paragraphs_ht`` table: same flagging as above, and
       additionally, for every word whose ``used`` list does not contain
       ``revision_prev.wikipedia_id``:

       * ``revision_curr.wikipedia_id`` is appended to ``word.freq``;
       * for every id in ``word.deleted`` that is a key of the
         module-level ``revisions`` mapping, ``relation.revert`` (other
         contributor) or ``relation.self_revert`` (same contributor as
         ``revision_curr``) is incremented for that id;
       * if ``word.revision`` is a key of ``revisions``,
         ``relation.reintroduced`` or ``relation.self_reintroduced`` is
         incremented for ``word.revision`` by the same contributor rule.
    3. Otherwise a new ``Paragraph`` is created and registered in
       ``revision_curr``.

    Args:
        revision_curr: revision being analysed; its ``paragraphs`` dict and
            ``ordered_paragraphs`` list are filled in place.
        revision_prev: revision searched first for reusable paragraphs.
        text_curr: text of ``revision_curr``.
        relation: object whose ``revert``, ``self_revert``, ``reintroduced``
            and ``self_reintroduced`` dicts are updated in place.

    Returns:
        A 3-tuple ``(unmatched_paragraphs_curr, unmatched_paragraphs_prev,
        matched_paragraphs_prev)``.

    Relies on ``Text``, ``Paragraph``, ``paragraphs_ht`` and ``revisions``
    being defined at module level.
    """
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    curr_id = revision_curr.wikipedia_id
    prev_id = revision_prev.wikipedia_id
    curr_author = revision_curr.contributor_name

    # ordered_paragraphs is probed once per paragraph; a set keeps each
    # probe constant-time instead of scanning the list.
    prev_hashes = set(revision_prev.ordered_paragraphs)

    def bump(counter, key):
        # Increment counter[key], starting from zero for an unseen key.
        counter[key] = counter.get(key, 0) + 1

    def attach_to_current(paragraph_prev):
        # Register the reused paragraph object under its own hash.
        revision_curr.paragraphs.setdefault(paragraph_prev.hash_value, []).append(paragraph_prev)
        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)

    def record_reintroduction(word_prev):
        # Bookkeeping for a word that comes back from an older revision.
        if prev_id in word_prev.used:
            # The word was present in the previous revision: nothing came back.
            return

        word_prev.freq.append(curr_id)

        # Revert: reintroducing something that was deleted in a known revision.
        for elem in word_prev.deleted:
            if elem in revisions:
                if revisions[elem].contributor_name != curr_author:
                    bump(relation.revert, elem)
                else:
                    bump(relation.self_revert, elem)

        # Guard on the key that is actually indexed.  The guard used to
        # test the leftover loop variable of the loop above, which is
        # stale (or unbound when ``deleted`` is empty) and did not
        # protect the ``revisions[origin]`` lookup.
        origin = word_prev.revision
        if origin in revisions:
            if revisions[origin].contributor_name != curr_author:
                bump(relation.reintroduced, origin)
            else:
                bump(relation.self_reintroduced, origin)

    # Iterate over the paragraphs of the current version.
    for paragraph in Text.splitIntoParagraphs(text_curr):
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # Paragraph is in the previous revision: reuse the first unmatched copy.
        if hash_curr in prev_hashes:
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if paragraph_prev.matched:
                    continue

                matched_curr = True
                paragraph_prev.matched = True
                matched_paragraphs_prev.append(paragraph_prev)

                for sentences_prev in paragraph_prev.sentences.values():
                    for sentence_prev in sentences_prev:
                        sentence_prev.matched = True
                        for word_prev in sentence_prev.words:
                            word_prev.matched = True
                            word_prev.used.append(curr_id)

                attach_to_current(paragraph_prev)
                break

        # Paragraph is not in the previous revision but is in the global table.
        if not matched_curr and hash_curr in paragraphs_ht:
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if paragraph_prev.matched:
                    continue

                matched_curr = True
                paragraph_prev.matched = True
                matched_paragraphs_prev.append(paragraph_prev)

                for sentences_prev in paragraph_prev.sentences.values():
                    for sentence_prev in sentences_prev:
                        sentence_prev.matched = True
                        for word_prev in sentence_prev.words:
                            word_prev.matched = True
                            word_prev.used.append(curr_id)
                            record_reintroduction(word_prev)

                attach_to_current(paragraph_prev)
                break

        # No match anywhere: register a new paragraph for further analysis.
        if not matched_curr:
            paragraph_curr = Paragraph()
            # Matching above already relies on equal text giving an equal
            # hash, so the hash computed for the lookup is reused here.
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph

            revision_curr.ordered_paragraphs.append(hash_curr)
            revision_curr.paragraphs.setdefault(hash_curr, []).append(paragraph_curr)
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Identify unmatched paragraphs in the previous revision.
    # NOTE: a hash listed k times in ordered_paragraphs makes each of its
    # unmatched paragraphs appear k times in the result (unchanged behaviour).
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if not paragraph_prev.matched:
                unmatched_paragraphs_prev.append(paragraph_prev)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
Exemple #3
0
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr,
                                relation):
    """Match paragraphs of ``text_curr`` by hash and update ``relation``.

    Each paragraph of ``text_curr`` is stripped, hashed and resolved
    against, in order, ``revision_prev`` and the module-level
    ``paragraphs_ht`` table; only paragraphs whose ``matched`` flag is
    still unset are eligible.  A reused paragraph, its sentences and its
    words are flagged as matched, ``revision_curr.wikipedia_id`` is
    appended to every word's ``used`` list, and the paragraph object is
    attached to ``revision_curr``.  A paragraph with no eligible match
    becomes a new ``Paragraph`` in ``revision_curr``.

    For paragraphs taken from ``paragraphs_ht``, every word whose ``used``
    list lacks ``revision_prev.wikipedia_id`` additionally

    * gets ``revision_curr.wikipedia_id`` appended to ``freq``;
    * increments ``relation.revert`` / ``relation.self_revert`` for each
      id in ``word.deleted`` that is a key of the module-level
      ``revisions`` mapping (``self_*`` when that revision has the same
      ``contributor_name`` as ``revision_curr``);
    * increments ``relation.reintroduced`` /
      ``relation.self_reintroduced`` for ``word.revision`` when that id
      is a key of ``revisions``, using the same contributor rule.

    Args:
        revision_curr: revision being analysed; ``paragraphs`` and
            ``ordered_paragraphs`` are filled in place.
        revision_prev: revision searched first for reusable paragraphs.
        text_curr: text of ``revision_curr``.
        relation: holder of the ``revert``, ``self_revert``,
            ``reintroduced`` and ``self_reintroduced`` dicts.

    Returns:
        ``(unmatched_paragraphs_curr, unmatched_paragraphs_prev,
        matched_paragraphs_prev)``.
    """
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    id_curr = revision_curr.wikipedia_id
    id_prev = revision_prev.wikipedia_id
    author_curr = revision_curr.contributor_name

    # One set instead of one list scan per paragraph.
    hashes_prev = set(revision_prev.ordered_paragraphs)

    def count(table, key):
        # Add one to table[key]; a missing key counts from zero.
        table[key] = table.get(key, 0) + 1

    def count_by_author(revision_id, table_other, table_self):
        # Pick the counter depending on who authored ``revision_id``.
        # The caller must have checked that the id is in ``revisions``.
        if revisions[revision_id].contributor_name != author_curr:
            count(table_other, revision_id)
        else:
            count(table_self, revision_id)

    def reuse(paragraph_prev, from_older_revision):
        # Flag the paragraph tree as matched and attach it to revision_curr.
        paragraph_prev.matched = True
        matched_paragraphs_prev.append(paragraph_prev)

        for sentences_prev in paragraph_prev.sentences.values():
            for sentence_prev in sentences_prev:
                sentence_prev.matched = True
                for word_prev in sentence_prev.words:
                    word_prev.matched = True
                    word_prev.used.append(id_curr)

                    if not from_older_revision:
                        continue
                    if id_prev in word_prev.used:
                        # Present in the previous revision: not a comeback.
                        continue

                    word_prev.freq.append(id_curr)

                    # Revert: reintroducing something deleted in a known
                    # revision.
                    for elem in word_prev.deleted:
                        if elem in revisions:
                            count_by_author(elem, relation.revert,
                                            relation.self_revert)

                    # The guard must test the id that is looked up.  It
                    # used to test ``elem``, the leftover variable of
                    # the loop above: stale, or unbound when ``deleted``
                    # is empty, and no protection against a KeyError on
                    # ``word_prev.revision``.
                    if word_prev.revision in revisions:
                        count_by_author(word_prev.revision,
                                        relation.reintroduced,
                                        relation.self_reintroduced)

        revision_curr.paragraphs.setdefault(paragraph_prev.hash_value,
                                            []).append(paragraph_prev)
        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)

    # Iterate over the paragraphs of the current version.
    for paragraph in Text.splitIntoParagraphs(text_curr):
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # If the paragraph is in the previous revision, reuse the first
        # copy that has not been matched yet.
        if hash_curr in hashes_prev:
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if not paragraph_prev.matched:
                    matched_curr = True
                    reuse(paragraph_prev, False)
                    break

        # Otherwise fall back to the global paragraph table.
        if not matched_curr and hash_curr in paragraphs_ht:
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if not paragraph_prev.matched:
                    matched_curr = True
                    reuse(paragraph_prev, True)
                    break

        # If the paragraph did not match with previous revisions,
        # add it to the container of unmatched paragraphs.
        if not matched_curr:
            paragraph_curr = Paragraph()
            # Equal text must hash equally for the matching above to work,
            # so the hash already computed is reused.
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph

            revision_curr.ordered_paragraphs.append(hash_curr)
            revision_curr.paragraphs.setdefault(hash_curr,
                                                []).append(paragraph_curr)
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Identify unmatched paragraphs in previous revision for further
    # analysis.  NOTE: a hash occurring several times in
    # ordered_paragraphs lists its unmatched paragraphs once per
    # occurrence (unchanged behaviour).
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if not paragraph_prev.matched:
                unmatched_paragraphs_prev.append(paragraph_prev)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev,
            matched_paragraphs_prev)
Exemple #4
0
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr):
    """Resolve each paragraph of ``text_curr`` against earlier revisions.

    Paragraphs are stripped and hashed.  For every hash the function
    takes the first paragraph with an unset ``matched`` flag from
    ``revision_prev`` or, if there is none, from the module-level
    ``paragraphs_ht`` table (defined elsewhere; presumably paragraphs of
    older revisions -- verify against the code that fills it).  The
    paragraph taken is flagged as matched along with its sentences and
    words and is attached to ``revision_curr``.  When nothing can be
    taken, a new ``Paragraph`` is built and stored in ``revision_curr``.

    Returns the tuple ``(new paragraphs of revision_curr, paragraphs of
    revision_prev still unmatched, old paragraphs that were reused)``.
    """
    added_now = []
    carried_over = []

    text_paragraphs = Text.splitIntoParagraphs(text_curr)

    for chunk in text_paragraphs:
        paragraph = chunk.strip()
        hash_curr = Text.calculateHash(paragraph)

        # Look in the previous revision first, then in the global table;
        # the table is only consulted when the first lookup found nothing.
        donor = None
        if hash_curr in revision_prev.ordered_paragraphs:
            donor = next((cand
                          for cand in revision_prev.paragraphs[hash_curr]
                          if not cand.matched), None)
        if donor is None and hash_curr in paragraphs_ht:
            donor = next((cand for cand in paragraphs_ht[hash_curr]
                          if not cand.matched), None)

        if donor is not None:
            donor.matched = True
            carried_over.append(donor)

            # Everything inside a reused paragraph counts as matched too.
            for sentence_hash in donor.sentences:
                for sentence in donor.sentences[sentence_hash]:
                    sentence.matched = True
                    for word in sentence.words:
                        word.matched = True

            # Attach the reused paragraph object to the current revision.
            if hash_curr in revision_curr.paragraphs:
                revision_curr.paragraphs[donor.hash_value].append(donor)
            else:
                revision_curr.paragraphs[donor.hash_value] = [donor]
            revision_curr.ordered_paragraphs.append(donor.hash_value)
        else:
            # Unknown paragraph: create it and keep it for further analysis.
            newcomer = Paragraph()
            newcomer.hash_value = Text.calculateHash(paragraph)
            newcomer.value = paragraph

            revision_curr.ordered_paragraphs.append(newcomer.hash_value)
            revision_curr.paragraphs.setdefault(newcomer.hash_value,
                                                []).append(newcomer)
            added_now.append(newcomer)

    # Collect what is still unflagged in the previous revision.
    still_open = []
    for hash_prev in revision_prev.ordered_paragraphs:
        still_open.extend(candidate
                          for candidate in revision_prev.paragraphs[hash_prev]
                          if not candidate.matched)

    return (added_now, still_open, carried_over)