def analyseArticle(file_name):
    # Container of revisions.
    revisions = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.getComment() != None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contributor_name = 'Not Available'

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    # Update the fake revision id.
                    i = i + 1
                else:
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return revisions
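
# A minimal usage sketch (illustrative, not part of the original module):
# "article.xml" is a hypothetical dump file name, and the loop relies only
# on the attributes that analyseArticle sets on each Revision above.
def _exampleUsage():
    revs = analyseArticle("article.xml")
    for wikipedia_id in sorted(revs.keys()):
        rev = revs[wikipedia_id]
        print rev.wikipedia_id, rev.contributor_name, rev.length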
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr):
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    # Split the text of the current revision into paragraphs.
    paragraphs = Text.splitIntoParagraphs(text_curr)

    # Iterate over the paragraphs of the current version.
    for paragraph in paragraphs:
        # Build Paragraph structure and calculate hash value.
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # If the paragraph is in the previous revision,
        # update the authorship information and mark both paragraphs as matched (also in HT).
        if (hash_curr in revision_prev.ordered_paragraphs):
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph is not in the previous revision, but it is in an
        # older revision, update the authorship information and mark both
        # paragraphs as matched.
        if ((not matched_curr) and (hash_curr in paragraphs_ht)):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph did not match with previous revisions,
        # add to container of unmatched paragraphs for further analysis.
        if (not matched_curr):
            paragraph_curr = Paragraph()
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph

            revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)
            if (paragraph_curr.hash_value in revision_curr.paragraphs.keys()):
                revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
            else:
                revision_curr.paragraphs.update({paragraph_curr.hash_value: [paragraph_curr]})
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Identify unmatched paragraphs in previous revision for further analysis.
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if (not paragraph_prev.matched):
                unmatched_paragraphs_prev.append(paragraph_prev)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
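
# The paragraph matching above compares only hash values, so hash equality is
# treated as equality of the (lower-cased, stripped) text. A sketch of a
# calculateHash-style helper, assuming an MD5 digest; the real Text helper is
# defined elsewhere and may use a different digest.
import hashlib

def _calculateHashSketch(text):
    # Identical strings hash identically on purpose: that collision is what
    # lets a paragraph or sentence "match" across revisions.
    return hashlib.md5(text).hexdigest()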
def analyseSentencesInParagraphs(unmatched_paragraphs_curr, unmatched_paragraphs_prev, revision_curr):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)

        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False

                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)

                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True

                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False

                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)

                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True

                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include in the container of
            # unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr

                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev, matched_sentences_prev, total_sentences)
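
# The matched_one/matched_all flags above are a hand-rolled any()/all() over
# the words of a candidate sentence: the sentence is claimed only when none
# of its words were matched yet, and it is retired when all of them already
# were. An equivalent, more idiomatic formulation (sketch only; the helper
# name is ours):
def _sentenceMatchState(sentence_prev):
    matched_one = any(word.matched for word in sentence_prev.words)
    matched_all = all(word.matched for word in sentence_prev.words)
    return (matched_one, matched_all)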
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.getComment() != None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername().encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.getId()
                    revision_curr.contributor_name = 'Not Available ' + revision.getId()
                    relation.author = 'Not Available ' + revision.getId()

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return (revisions, revision_order, relations)
def analyseSentencesInParagraphs(unmatched_paragraphs_curr, unmatched_paragraphs_prev, revision_curr, revision_prev, relation):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)

        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False

                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)

                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                                    word_prev.used.append(revision_curr.wikipedia_id)

                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False

                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)

                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)

                                # Revert: reintroducing something that somebody
                                # else deleted (and that was not used in the
                                # previous revision).
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    for elem in word_prev.deleted:
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name != revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem: relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem: 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem: relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem: 1})

                                # Reintroduction: credit the revision that
                                # originally authored the word.
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    if (word_prev.revision in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision: relation.reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.reintroduced.update({word_prev.revision: 1})
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision: relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision: 1})

                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include in the container of
            # unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr

                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev, matched_sentences_prev, total_sentences)
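
# The relation counters above (revert, self_revert, reintroduced,
# self_reintroduced) all repeat the same "increment or initialise" pattern.
# A sketch of the equivalent one-liner using dict.get, which behaves
# identically (the helper name is ours):
def _bump(counter, key):
    counter[key] = counter.get(key, 0) + 1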
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr, relation):
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    # Split the text of the current revision into paragraphs.
    paragraphs = Text.splitIntoParagraphs(text_curr)

    # Iterate over the paragraphs of the current version.
    for paragraph in paragraphs:
        # Build Paragraph structure and calculate hash value.
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # If the paragraph is in the previous revision,
        # update the authorship information and mark both paragraphs as matched (also in HT).
        if (hash_curr in revision_prev.ordered_paragraphs):
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph is not in the previous revision, but it is in an
        # older revision, update the authorship information and mark both
        # paragraphs as matched.
        if ((not matched_curr) and (hash_curr in paragraphs_ht)):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)

                                # Revert: reintroducing something that somebody
                                # else deleted (and that was not used in the
                                # previous revision).
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    for elem in word_prev.deleted:
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name != revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem: relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem: 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem: relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem: 1})

                                # Reintroduction: credit the revision that
                                # originally authored the word.
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    if (word_prev.revision in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision: relation.reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.reintroduced.update({word_prev.revision: 1})
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision: relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision: 1})

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph did not match with previous revisions,
        # add to container of unmatched paragraphs for further analysis.
        if (not matched_curr):
            paragraph_curr = Paragraph()
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph

            revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)
            if (paragraph_curr.hash_value in revision_curr.paragraphs.keys()):
                revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
            else:
                revision_curr.paragraphs.update({paragraph_curr.hash_value: [paragraph_curr]})
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Identify unmatched paragraphs in previous revision for further analysis.
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if (not paragraph_prev.matched):
                unmatched_paragraphs_prev.append(paragraph_prev)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
def analyseArticle(file_name):
    # Container of revisions.
    revisions = {}
    revision_order = []

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.getComment() != None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    print "VANDALISM: CHANGE PERCENTAGE"
                    vandalism = True

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contributor_name = 'Not Available'

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    # Update the fake revision id.
                    i = i + 1
                    # Update the index of processed revisions.
                    revision_order.append((revision_curr.wikipedia_id, False))
                else:
                    spam.append(revision.getSha1())
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
            else:
                spam.append(revision.getSha1())
                revision_order.append((revision_curr.wikipedia_id, True))
                revision_curr = revision_prev

    return (revisions, revision_order)
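
# The deletion heuristic shared by every analyseArticle variant, isolated as
# a predicate (sketch; PREVIOUS_LENGTH, CURR_LENGTH and CHANGE_PERCENTAGE are
# the module-level thresholds used above): a revision is treated as suspect
# vandalism when a long previous revision shrinks to a short current one by a
# large enough relative margin.
def _looksLikeMassDeletion(prev_length, curr_length):
    return (prev_length > PREVIOUS_LENGTH) and \
           (curr_length < CURR_LENGTH) and \
           ((curr_length - prev_length) / float(prev_length) <= CHANGE_PERCENTAGE)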
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)

            if (revision.sha1 in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.comment != None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.text) < CURR_LENGTH) and \
                   (((len(revision.text) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.id
                    revision_curr.contributor_name = 'Not Available ' + revision.id
                    relation.author = 'Not Available ' + revision.id

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)
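
# The total_tokens bookkeeping above walks revision -> paragraphs ->
# sentences -> words. A sketch of that traversal as a standalone helper,
# assuming only the container attributes used in the loop above (the helper
# name is ours):
def _countTokens(revision):
    total = 0
    for p in revision.ordered_paragraphs:
        for paragraph in revision.paragraphs[p]:
            for hash_sentence in paragraph.sentences.keys():
                for sentence in paragraph.sentences[hash_sentence]:
                    total = total + len(sentence.words)
    return total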