def analyseArticle(self, revisions):
        # Container of revisions (accumulated in self.revisions).

        # Revisions to compare.
        revision_curr = self.revision_curr
        revision_prev = self.revision_prev
        text_curr = None

        i = 1

        # Iterate over revisions of the article.
        for revision in revisions:

            if 'texthidden' in revision:
                continue
            if 'textmissing' in revision:
                continue
            #revid = revision.getId()
            timestamp = revision['timestamp']

            #timestamp_iso = dateutil.parser.parse(datetime.datetime.utcfromtimestamp(timestamp).isoformat())

            # if timestamp_iso > self.lastrev_date:
            # #print timestamp_iso, self.lastrev_date
            #     revid = revision.getId()
            #     self.lastrev_date = timestamp_iso
            #     self.lastrev = revid

            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr
            #print "----"
            #print revision
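            # The MediaWiki API returns the revision text under the '*' key.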
            text = revision['*']

            # if text == None:
            #     text = ''

            if (revision['sha1'] == ""):
                revision['sha1'] = Text.calculateHash(text.encode("utf-8"))

            if (revision['sha1'] in self.spam):
                vandalism = True

            # TODO: spam detection: DELETION
            text_len = len(text)

            try:
                if (revision['comment'] != '' and 'minor' in revision):
                    pass
                else:
                    if (revision_prev.length > PREVIOUS_LENGTH) and \
                       (text_len < CURR_LENGTH) and \
                       (((text_len - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                        vandalism = True
                        revision_curr = revision_prev
            except KeyError:
                # Some API revisions carry no 'comment' field.
                pass

            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print revision.getId()
                #print revision.getText()
                #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision['revid'])
                revision_curr.length = text_len
                revision_curr.time = revision['timestamp']

                #added
                #print "revision_curr.time ", revision_curr.time

                    #datetime.datetime.utcfromtimestamp(revision['timestamp']).isoformat()

                # Some revisions don't have a contributor.
                try:
                    revision_curr.contributor_id = revision['userid']
                except KeyError:
                    revision_curr.contributor_id = ""
                try:
                    revision_curr.contributor_name = revision['user']
                except KeyError:
                    revision_curr.contributor_name = ""

                # Content within the revision.
                text_curr = text.encode('utf-8')
                text_curr = text_curr.lower()
                #revision_curr.content = text_curr

                # Perform comparison.
                vandalism = self.determineAuthorship(revision_curr, revision_prev, text_curr)


                if (not vandalism):
                    # Add the current revision with all the information.
                    self.revisions.update({revision_curr.wikipedia_id : revision_curr})
                    # Update the fake revision id.
                    i = i+1

                else:
                    #print "---------------------------- FLAG 2"
                    #print revision.getId()
                    #print revision.getText()
                    #print
                    revision_curr = revision_prev
                    self.spam.append(revision['sha1'])

        self.revision_prev = revision_prev
        self.revision_curr = revision_curr
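
# --- Illustration (not part of the original source) ----------------------
# A minimal, standalone sketch of the size-drop vandalism heuristic used in
# analyseArticle above. The constant values below are assumptions made for
# the example; the real PREVIOUS_LENGTH, CURR_LENGTH and CHANGE_PERCENTAGE
# are defined elsewhere in the project.

PREVIOUS_LENGTH_EXAMPLE = 1000      # assumed: previous revision was "large"
CURR_LENGTH_EXAMPLE = 1000          # assumed: current revision is "small"
CHANGE_PERCENTAGE_EXAMPLE = -0.7    # assumed: text shrank by 70% or more

def looks_like_blanking(prev_len, curr_len):
    """Flag a revision that deletes most of a previously large article."""
    if prev_len <= PREVIOUS_LENGTH_EXAMPLE or curr_len >= CURR_LENGTH_EXAMPLE:
        return False
    # The relative change is negative when text was removed.
    return (curr_len - prev_len) / float(prev_len) <= CHANGE_PERCENTAGE_EXAMPLE

# e.g. looks_like_blanking(5000, 200) -> True (likely blanking vandalism)
# --------------------------------------------------------------------------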
    def analyseSentencesInParagraphs(self,unmatched_paragraphs_curr, unmatched_paragraphs_prev, revision_curr):

        # Containers for unmatched and matched sentences.
        unmatched_sentences_curr = []
        unmatched_sentences_prev = []
        matched_sentences_prev = []
        total_sentences = 0


        # Iterate over the unmatched paragraphs of the current revision.
        for paragraph_curr in unmatched_paragraphs_curr:

            # Split the current paragraph into sentences.
            sentences = Text.splitIntoSentences(paragraph_curr.value)

            # Iterate over the sentences of the current paragraph
            for sentence in sentences:

                # Create the Sentence structure.
                sentence = sentence.strip()
                sentence = ' '.join(Text.splitIntoWords(sentence))
                hash_curr = Text.calculateHash(sentence)
                matched_curr = False
                total_sentences = total_sentences + 1


                # Iterate over the unmatched paragraphs from the previous revision.
                for paragraph_prev in unmatched_paragraphs_prev:
                    if (hash_curr in paragraph_prev.sentences.keys()):
                        for sentence_prev in paragraph_prev.sentences[hash_curr]:

                            if (not sentence_prev.matched):

                                matched_one = False
                                matched_all = True
                                for word_prev in sentence_prev.words:
                                    if (word_prev.matched):
                                        matched_one = True
                                    else:
                                        matched_all = False

                                if not(matched_one):
                                    sentence_prev.matched = True
                                    matched_curr = True
                                    matched_sentences_prev.append(sentence_prev)

                                    # TODO: CHECK this
                                    for word_prev in sentence_prev.words:
                                        #word_prev.freq = word_prev.freq + 1
                                        #word_prev.freq.append(revision_curr.wikipedia_id)
                                        word_prev.matched = True

                                    # Add the sentence information to the paragraph.
                                    if (hash_curr in paragraph_curr.sentences.keys()):
                                        paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                        paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                    else:
                                        paragraph_curr.sentences.update({sentence_prev.hash_value : [sentence_prev]})
                                        paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                    break
                                elif (matched_all):
                                    sentence_prev.matched = True
                                    matched_sentences_prev.append(sentence_prev)

                        if (matched_curr):
                            break


                # Iterate over the hash table of sentences from old revisions.
                if ((not matched_curr) and (hash_curr in self.sentences_ht.keys())):
                    for sentence_prev in self.sentences_ht[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False

                            if not(matched_one):

                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)

                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    #word_prev.freq.append(revision_curr.wikipedia_id)
                                    #word_prev.freq = word_prev.freq + 1
                                    word_prev.matched = True

                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value : [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)


                # If the sentence did not match, then include in the container of unmatched sentences for further analysis.
                if (not matched_curr):
                    sentence_curr = Sentence()
                    sentence_curr.value = sentence
                    sentence_curr.hash_value = hash_curr

                    paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                    if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                        paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                    else:
                        paragraph_curr.sentences.update({sentence_curr.hash_value : [sentence_curr]})

                    unmatched_sentences_curr.append(sentence_curr)


        # Identify the unmatched sentences in the previous paragraph revision.
        for paragraph_prev in unmatched_paragraphs_prev:
            for sentence_prev_hash in paragraph_prev.ordered_sentences:
                for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                    if (not sentence_prev.matched):
                        unmatched_sentences_prev.append(sentence_prev)
                        sentence_prev.matched = True
                        matched_sentences_prev.append(sentence_prev)


        return (unmatched_sentences_curr, unmatched_sentences_prev, matched_sentences_prev, total_sentences)
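
# --- Illustration (not part of the original source) ----------------------
# Sketch of the hash-bucket structure the matching above relies on: each
# paragraph keeps a dict from a sentence hash to the list of Sentence
# objects sharing that hash (duplicates share a bucket), plus an ordered
# list of hashes preserving sentence order. The md5-based hash here is an
# assumption; the project's Text.calculateHash may differ.

import hashlib

def calculate_hash_example(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()

sentences = {}            # hash value -> [sentence, ...]
ordered_sentences = []    # hash values in document order

for s in ["the quick fox", "jumped", "the quick fox"]:
    h = calculate_hash_example(s)
    ordered_sentences.append(h)
    if h in sentences:
        sentences[h].append(s)   # repeated sentence: same bucket
    else:
        sentences[h] = [s]
# Lookup during matching is then O(1): `hash_curr in sentences`.
# --------------------------------------------------------------------------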
    def analyseWordsInSentences(self, unmatched_sentences_curr, unmatched_sentences_prev, revision_curr, possible_vandalism):

        global GLOBAL_ID

        matched_words_prev = []
        unmatched_words_prev = []

        # Split sentences into words.
        text_prev = []
        for sentence_prev in unmatched_sentences_prev:
            for word_prev in sentence_prev.words:
                if (not word_prev.matched):
                    text_prev.append(word_prev.value)
                    unmatched_words_prev.append(word_prev)

        text_curr = []
        for sentence_curr in unmatched_sentences_curr:
            splitted = Text.splitIntoWords(sentence_curr.value)
            text_curr.extend(splitted)
            sentence_curr.splitted.extend(splitted)

        # Edit consists of removing sentences, not adding new content.
        if (len(text_curr) == 0):
            return (matched_words_prev, False)

        # Spam detection.
        if (possible_vandalism):

            density = Text.computeAvgWordFreq(text_curr, revision_curr.wikipedia_id)

            if (density > WORD_DENSITY):
                return (matched_words_prev, possible_vandalism)
            else:
                possible_vandalism = False

        if (len(text_prev) == 0):
            for sentence_curr in unmatched_sentences_curr:
                for word in sentence_curr.splitted:
                    word_curr = Word()
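                    # NOTE: this variant stores the contributor *name* in
                    # author_id; the module-level variant below uses
                    # contributor_id instead.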
                    word_curr.author_id = revision_curr.contributor_name
                    word_curr.author_name = revision_curr.contributor_name
                    word_curr.revision = revision_curr.wikipedia_id
                    word_curr.value = word

                    #added
                    word_curr.time = revision_curr.time


                    #word_curr.freq.append(revision_curr.wikipedia_id)
                    word_curr.internal_id = GLOBAL_ID
                    sentence_curr.words.append(word_curr)
                    GLOBAL_ID = GLOBAL_ID + 1


            return (matched_words_prev, possible_vandalism)

        d = Differ()
        diff = list(d.compare(text_prev, text_curr))


        for sentence_curr in unmatched_sentences_curr:

            for word in sentence_curr.splitted:
                curr_matched = False
                pos = 0

                while (pos < len(diff)):

                    word_diff = diff[pos]

                    if (word == word_diff[2:]):

                        if (word_diff[0] == ' '):
                            for word_prev in unmatched_words_prev:
                                if ((not word_prev.matched) and (word_prev.value == word)):
                                    #word_prev.freq = word_prev.freq + 1
                                    #word_prev.freq.append(revision_curr.wikipedia_id)
                                    word_prev.matched = True
                                    curr_matched = True
                                    sentence_curr.words.append(word_prev)
                                    matched_words_prev.append(word_prev)
                                    diff[pos] = ''
                                    pos = len(diff)+1
                                    break

                        elif (word_diff[0] == '-'):
                            for word_prev in unmatched_words_prev:
                                if ((not word_prev.matched) and (word_prev.value == word)):
                                    word_prev.matched = True
                                    #word_prev.deleted.append(revision_curr.wikipedia_id)
                                    matched_words_prev.append(word_prev)
                                    diff[pos] = ''
                                    break

                        elif (word_diff[0] == '+'):
                            curr_matched = True
                            word_curr = Word()
                            word_curr.value = word
                            word_curr.author_id = revision_curr.contributor_name
                            word_curr.author_name = revision_curr.contributor_name
                            word_curr.revision = revision_curr.wikipedia_id
                            word_curr.internal_id = GLOBAL_ID
                            #word_curr.freq.append(revision_curr.wikipedia_id)

                            #added
                            word_curr.time = revision_curr.time
                    
                            sentence_curr.words.append(word_curr)
                            GLOBAL_ID = GLOBAL_ID + 1

                            diff[pos] = ''
                            pos = len(diff)+1

                    pos = pos + 1

                if not(curr_matched):
                    word_curr = Word()
                    word_curr.value = word
                    word_curr.author_id = revision_curr.contributor_name
                    word_curr.author_name = revision_curr.contributor_name
                    word_curr.revision = revision_curr.wikipedia_id
                    #word_curr.freq.append(revision_curr.wikipedia_id)

                    #added
                    word_curr.time = revision_curr.time
                    

                    sentence_curr.words.append(word_curr)
                    word_curr.internal_id = GLOBAL_ID
                    GLOBAL_ID = GLOBAL_ID + 1

        return (matched_words_prev, possible_vandalism)
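
# --- Illustration (not part of the original source) ----------------------
# analyseWordsInSentences feeds two word lists to difflib.Differ. Each entry
# of the delta starts with a two-character code -- '  ' unchanged, '- ' only
# in the previous text, '+ ' only in the current text -- which is why the
# code above tests word_diff[0] and recovers the token with word_diff[2:].

from difflib import Differ

text_prev_example = ['the', 'quick', 'fox']
text_curr_example = ['the', 'slow', 'fox']

for entry in Differ().compare(text_prev_example, text_curr_example):
    print(repr(entry[0]), entry[2:])
# ' ' the      (kept: matched to a previous word)
# '-' quick    (deleted from the previous revision)
# '+' slow     (added by the current revision)
# ' ' fox
# --------------------------------------------------------------------------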
def analyseWordsInSentences(unmatched_sentences_curr, unmatched_sentences_prev, revision_curr, possible_vandalism, relation):
    """DESCRIPTION
    
    Args:
        param1: The first parameter.
        param2: The second parameter.
    
    Returns:
        True if successful, False otherwise.
    """
    matched_words_prev = []
    unmatched_words_prev = []
    global WORD_ID
    
    # Split sentences into words.
    text_prev = []
    for sentence_prev in unmatched_sentences_prev:
        for word_prev in sentence_prev.words:
            if (not word_prev.matched):
                text_prev.append(word_prev.value)
                unmatched_words_prev.append(word_prev)
        
    text_curr = []
    for sentence_curr in unmatched_sentences_curr:
        splitted = Text.splitIntoWords(sentence_curr.value)
        text_curr.extend(splitted)
        sentence_curr.splitted.extend(splitted)
    
    # Edit consists of removing sentences, not adding new content. 
    if (len(text_curr) == 0):
        return (matched_words_prev, False)
        
    # SPAM detection.
    if (possible_vandalism):

        density = Text.computeAvgWordFreq(text_curr, revision_curr.wikipedia_id)

        if (density > WORD_DENSITY):
            return (matched_words_prev, possible_vandalism)
        else:
            possible_vandalism = False

    if (len(text_prev) == 0):        
        for sentence_curr in unmatched_sentences_curr:
            for word in sentence_curr.splitted:
                word_curr = Word()
                word_curr.internal_id = WORD_ID
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.value = word
                sentence_curr.words.append(word_curr)
                word_curr.used.append(revision_curr.wikipedia_id)
                relation.added = relation.added + 1
                WORD_ID = WORD_ID + 1
                
        return (matched_words_prev, possible_vandalism)
    
    d = Differ()
    diff = list(d.compare(text_prev, text_curr))
    
    
    for sentence_curr in unmatched_sentences_curr:

        for word in sentence_curr.splitted:
            curr_matched = False
            pos = 0
                
            while (pos < len(diff)):
                
                word_diff = diff[pos]
                
                if (word == word_diff[2:]): 
                    
                    if (word_diff[0] == ' '):
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and (word_prev.value == word)):
                                word_prev.used.append(revision_curr.wikipedia_id)
                                word_prev.matched = True
                                curr_matched = True
                                sentence_curr.words.append(word_prev)
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                pos = len(diff)+1
                                #if (word_prev.revision in relation.reintroduced.keys()):
                                #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                #else:
                                #    relation.reintroduced.update({word_prev.revision : 1 })
                                    
                                break
                                
                    elif (word_diff[0] == '-'):
                        for word_prev in unmatched_words_prev:
                            if ((not word_prev.matched) and (word_prev.value == word)):
                                word_prev.matched = True
                                matched_words_prev.append(word_prev)
                                diff[pos] = ''
                                word_prev.deleted.append(revision_curr.wikipedia_id)
                                if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                    if (word_prev.revision in relation.deleted.keys()):
                                        relation.deleted.update({word_prev.revision : relation.deleted[word_prev.revision] + 1 })
                                    else:
                                        relation.deleted.update({word_prev.revision : 1 })
                                else:
                                    if (word_prev.revision in relation.self_deleted.keys()):
                                        relation.self_deleted.update({word_prev.revision : relation.self_deleted[word_prev.revision] + 1 })
                                    else:
                                        relation.self_deleted.update({word_prev.revision : 1 })
                                break
                                
                    elif (word_diff[0] == '+'):
                        curr_matched = True
                        word_curr = Word()
                        word_curr.internal_id = WORD_ID
                        word_curr.value = word
                        word_curr.author_id = revision_curr.contributor_id
                        word_curr.author_name = revision_curr.contributor_name
                        word_curr.revision = revision_curr.wikipedia_id
                        word_curr.used.append(revision_curr.wikipedia_id)
                        sentence_curr.words.append(word_curr)
                        relation.added = relation.added + 1
                        WORD_ID = WORD_ID + 1

                        diff[pos] = ''
                        pos = len(diff)+1  
                        
                pos = pos + 1
                
            if not(curr_matched):
                word_curr = Word()
                word_curr.internal_id = WORD_ID
                word_curr.value = word
                word_curr.author_id = revision_curr.contributor_id
                word_curr.author_name = revision_curr.contributor_name
                word_curr.revision = revision_curr.wikipedia_id
                word_curr.used.append(revision_curr.wikipedia_id)
                sentence_curr.words.append(word_curr)
                relation.added = relation.added + 1
                WORD_ID = WORD_ID + 1

    return (matched_words_prev, possible_vandalism)
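
# --- Illustration (not part of the original source) ----------------------
# The WORD_DENSITY spam check above calls Text.computeAvgWordFreq, defined
# elsewhere in the project. A plausible reading, sketched here purely as an
# assumption, is the average number of occurrences per distinct token:
# copy-paste spam repeats the same tokens, driving this average up.

from collections import Counter

def compute_avg_word_freq_example(tokens):
    counts = Counter(tokens)
    return sum(counts.values()) / float(len(counts)) if counts else 0.0

# Normal prose: every word appears roughly once.
print(compute_avg_word_freq_example(['a', 'quick', 'brown', 'fox']))   # 1.0
# Repetitive spam: the same words over and over.
print(compute_avg_word_freq_example(['buy', 'now'] * 50))              # 50.0
# --------------------------------------------------------------------------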
    def analyseParagraphsInRevision(self, revision_curr, revision_prev, text_curr):

        # Containers for unmatched and matched paragraphs.
        unmatched_paragraphs_curr = []
        unmatched_paragraphs_prev = []
        matched_paragraphs_prev = []

        # Split the text of the current into paragraphs.
        paragraphs = Text.splitIntoParagraphs(text_curr)

        # Iterate over the paragraphs of the current version.
        for paragraph in paragraphs:

            # Build Paragraph structure and calculate hash value.
            paragraph = paragraph.strip()
            hash_curr = Text.calculateHash(paragraph)
            matched_curr = False

            # If the paragraph is in the previous revision,
            # update the authorship information and mark both paragraphs as matched (also in HT).
            if (hash_curr in revision_prev.ordered_paragraphs):

                for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                    if (not paragraph_prev.matched):
                        matched_curr = True
                        paragraph_prev.matched = True
                        matched_paragraphs_prev.append(paragraph_prev)

                        # TODO: added this (CHECK).
                        for hash_sentence_prev in paragraph_prev.sentences.keys():
                            for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                                sentence_prev.matched = True
                                for word_prev in sentence_prev.words:
                                    #word_prev.freq = word_prev.freq + 1
                                    #word_prev.freq.append(revision_curr.wikipedia_id)
                                    word_prev.matched = True

                        # Add paragraph to current revision.
                        if (hash_curr in revision_curr.paragraphs.keys()):
                            revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                            revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                        else:
                            revision_curr.paragraphs.update({paragraph_prev.hash_value : [paragraph_prev]})
                            revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)

                        break


            # If the paragraph is not in the previous revision, but it is in an older revision
            # update the authorship information and mark both paragraphs as matched.
            if ((not matched_curr) and (hash_curr in self.paragraphs_ht)):
                for paragraph_prev in self.paragraphs_ht[hash_curr]:
                    if (not paragraph_prev.matched):
                        matched_curr = True
                        paragraph_prev.matched = True
                        matched_paragraphs_prev.append(paragraph_prev)

                        # TODO: added this (CHECK).
                        for hash_sentence_prev in paragraph_prev.sentences.keys():
                            for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                                sentence_prev.matched = True
                                for word_prev in sentence_prev.words:
                                    #word_prev.freq = word_prev.freq + 1
                                    #word_prev.freq.append(revision_curr.wikipedia_id)
                                    word_prev.matched = True


                        # Add paragraph to current revision.
                        if (hash_curr in revision_curr.paragraphs.keys()):
                            revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                            revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                        else:
                            revision_curr.paragraphs.update({paragraph_prev.hash_value : [paragraph_prev]})
                            revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)

                        break

            # If the paragraph did not match with previous revisions,
            # add to container of unmatched paragraphs for further analysis.
            if (not matched_curr):
                paragraph_curr = Paragraph()
                paragraph_curr.hash_value = Text.calculateHash(paragraph)
                paragraph_curr.value = paragraph

                revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)

                if (paragraph_curr.hash_value in revision_curr.paragraphs.keys()):
                    revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
                else:
                    revision_curr.paragraphs.update({paragraph_curr.hash_value : [paragraph_curr]})

                unmatched_paragraphs_curr.append(paragraph_curr)


        # Identify unmatched paragraphs in previous revision for further analysis.
        for paragraph_prev_hash in revision_prev.ordered_paragraphs:
            for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
                if (not paragraph_prev.matched):
                    unmatched_paragraphs_prev.append(paragraph_prev)

        return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
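
# --- Illustration (not part of the original source) ----------------------
# Minimal sketch of the data structures these routines assume, reconstructed
# from the attributes used above; the real classes live elsewhere in the
# project and may carry more fields.

class Paragraph(object):
    def __init__(self):
        self.hash_value = ''
        self.value = ''
        self.matched = False
        self.sentences = {}           # sentence hash -> [Sentence, ...]
        self.ordered_sentences = []   # sentence hashes in document order

class Sentence(object):
    def __init__(self):
        self.hash_value = ''
        self.value = ''
        self.matched = False
        self.splitted = []            # raw word tokens
        self.words = []               # Word objects carrying authorship

class Word(object):
    def __init__(self):
        self.internal_id = 0
        self.value = ''
        self.author_id = None
        self.author_name = None
        self.revision = None          # wikipedia_id that introduced the word
        self.matched = False
        self.used = []                # revisions in which the word appears
        self.deleted = []             # revisions that deleted the word
# --------------------------------------------------------------------------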
def analyseSentencesInParagraphs(unmatched_paragraphs_curr, unmatched_paragraphs_prev, revision_curr, revision_prev, relation):
    """DESCRIPTION
    
    Args:
        param1: The first parameter.
        param2: The second parameter.
    
    Returns:
        True if successful, False otherwise.
    """
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0
    

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)

        # Iterate over the sentences of the current paragraph
        for sentence in sentences:
            
            # Create the Sentence structure.                
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1
            
            
            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        
                        if (not sentence_prev.matched): 
                            
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False
                                    
                            if not(matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)
                                
                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                                    word_prev.used.append(revision_curr.wikipedia_id)
                                    
                                    #if (word_prev.revision in relation.reintroduced.keys()):
                                    #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                    #else:
                                    #    relation.reintroduced.update({word_prev.revision : 1 })
                                    
                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value : [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value) 
                                break
                            elif (matched_all):
                                
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                                   
                    if (matched_curr):
                        break
                    
                        
            # Iterate over the hash table of sentences from old revisions.    
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False
                            
                        if not(matched_one):
                                    
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)
                        
                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)
                                
                                # Revert: reintroducing something that somebody else deleted
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    for elem in word_prev.deleted:
                                        #if (revision_curr.wikipedia_id == 11):
                                        #    print "Revert in 11", word_prev.value, word_prev.deleted, relation.revert
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name != revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem : relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem : 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem : relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem : 1})
                                #print "relation.revert", word_prev.value, word_prev.deleted, relation.revert, revision_curr.wikipedia_id
                                        
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    if (word_prev.revision in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                            else:
                                                relation.reintroduced.update({word_prev.revision : 1 })
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision : relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision : 1})
                                            
                                    
                                
                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value : [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value) 
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)
                            
            
            # If the sentence did not match, then include in the container of unmatched sentences for further analysis.    
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr
                
                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value : [sentence_curr]})
                
                unmatched_sentences_curr.append(sentence_curr)
    
    # Identify the unmatched sentences in the previous paragraph revision.            
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)
                    
    return (unmatched_sentences_curr, unmatched_sentences_prev, matched_sentences_prev, total_sentences)
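
# --- Illustration (not part of the original source) ----------------------
# Minimal sketch of the revert bookkeeping above: when a word that somebody
# deleted reappears, the event is credited against the deleting revision --
# as a "revert" if deleter and re-adder are different editors, and as a
# "self_revert" otherwise. defaultdict stands in for the dict.update pattern.

from collections import defaultdict

revert = defaultdict(int)
self_revert = defaultdict(int)

def record_revert_example(deleting_revisions, deleter_names, current_author):
    # deleting_revisions: revision ids that deleted this word at some point
    # deleter_names: assumed map of revision id -> contributor name
    for rev_id in deleting_revisions:
        if deleter_names.get(rev_id) != current_author:
            revert[rev_id] += 1
        else:
            self_revert[rev_id] += 1

record_revert_example([11, 12], {11: 'alice', 12: 'bob'}, 'bob')
print(dict(revert), dict(self_revert))   # {11: 1} {12: 1}
# --------------------------------------------------------------------------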
def analyseArticle(file_name):
"""
    DESCRIPTION
        
    :Parameters:
        NAME : TYPE
            DESCRIPTIOIN
            
    :Return:
        DESCRIPTION
    """    
    # Container of relationships.
    relations = {}
    
    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)
    
    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0
        
        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False
            
            #print "processing rev", revision.getId()
            
            # Update the information about the previous revision.
            revision_prev = revision_curr
            
            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))
            
            if (revision.getSha1() in spam):
                vandalism = True
            
            #TODO: SPAM detection: DELETION
            if (revision.getComment() != None and revision.getComment().find(FLAG) >= 0):  # >= 0: a match at position 0 counts too
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev
            
            #if (vandalism):
                #print "---------------------------- FLAG 1"
                #print "SPAM", revision.getId()
                #print revision.getText()           
                #print
            
            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()
                
                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())
                
                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername().encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.getId()
                    revision_curr.contributor_name = 'Not Available ' + revision.getId()
                    relation.author = 'Not Available ' + revision.getId()
                
                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr 
                             
                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)
                
            
                if (not vandalism):
                    #print "NOT SPAM", revision.getId()
                    
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id : revision_curr})
                    relations.update({revision_curr.wikipedia_id : relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i+1
                    
                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                    
                        
                        
                else:
                    #print "---------------------------- FLAG 2"
                    #print "SPAM", revision.getId()
                    #print revision.getText()
                    #print
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())
                    
           
    
    return (revisions, revision_order, relations)
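
# --- Illustration (not part of the original source) ----------------------
# A hedged sketch of how analyseArticle(file_name) might be driven. The dump
# path below is a placeholder; the function returns the module-level
# containers (revisions, revision_order, relations) built during analysis.

if __name__ == '__main__':
    revs, order, rels = analyseArticle('article_dump.xml')  # placeholder path
    for wikipedia_id, was_vandalism in order:
        if not was_vandalism:
            rev = revs[wikipedia_id]
            print(wikipedia_id, rev.contributor_name, rev.total_tokens)
# --------------------------------------------------------------------------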
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr, relation):
    """DESCRIPTION
    
    Args:
        param1: The first parameter.
        param2: The second parameter.
    
    Returns:
        True if successful, False otherwise.
    """
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []
    
    # Split the text of the current into paragraphs.
    paragraphs = Text.splitIntoParagraphs(text_curr)
    
    # Iterate over the paragraphs of the current version.
    for paragraph in paragraphs:
        
        # Build Paragraph structure and calculate hash value.
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False
                    
        # If the paragraph is in the previous revision, 
        # update the authorship information and mark both paragraphs as matched (also in HT).
        if (hash_curr in revision_prev.ordered_paragraphs):

            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True 
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
                    
                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                
                                #if (word_prev.revision in relation.reintroduced.keys()):
                                #    relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                #else:
                                #    relation.reintroduced.update({word_prev.revision : 1 })
                    
                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value : [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)

                    break

                    
        # If the paragraph is not in the previous revision, but it is in an older revision
        # update the authorship information and mark both paragraphs as matched. 
        if ((not matched_curr) and (hash_curr in paragraphs_ht)):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
                    
                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)
                                
                                # Revert: reintroducing something that somebody else deleted, 
                                # (and was not used in the previous revision)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    #if (revision_curr.wikipedia_id == 11):
                                    #    print "Revert in 11", word_prev.value, word_prev.deleted, relation.revert
                                    
                                    for elem in word_prev.deleted:
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name != revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem : relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem : 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem : relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem : 1})
                                
                                 
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    if (word_prev.revision in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision : relation.reintroduced[word_prev.revision] + 1 })
                                            else:
                                                relation.reintroduced.update({word_prev.revision : 1 })
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision : relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision : 1})
                    
                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value : [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    
                    break
            
        # If the paragraph did not match with previous revisions,
        # add to container of unmatched paragraphs for further analysis.
        if (not matched_curr):
            paragraph_curr = Paragraph()
            paragraph_curr.hash_value = Text.calculateHash(paragraph)
            paragraph_curr.value = paragraph

            revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)
            
            if (paragraph_curr.hash_value in revision_curr.paragraphs.keys()):
                revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
            else:
                revision_curr.paragraphs.update({paragraph_curr.hash_value : [paragraph_curr]})
            
            unmatched_paragraphs_curr.append(paragraph_curr)  
                  
     
    # Identify unmatched paragraphs in previous revision for further analysis.        
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if (not paragraph_prev.matched):
                unmatched_paragraphs_prev.append(paragraph_prev)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
Exemple #9
0
    def analyseArticle(self, revisions):
        # Container of revisions.

        # Revisions to compare.
        revision_curr = self.revision_curr
        revision_prev = self.revision_prev
        text_curr = None

        i = 1

        # Iterate over revisions of the article.
        for revision in revisions:

            if 'texthidden' in revision:
                continue
            if 'textmissing' in revision:
                continue
            #revid = revision.getId()
            timestamp = revision['timestamp']

            #timestamp_iso = dateutil.parser.parse(datetime.datetime.utcfromtimestamp(timestamp).isoformat())

            # if timestamp_iso > self.lastrev_date:
            # #print timestamp_iso, self.lastrev_date
            #     revid = revision.getId()
            #     self.lastrev_date = timestamp_iso
            #     self.lastrev = revid

            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr
            #print "----"
            #print revision
            text = revision['*']

            # if text == None:
            #     text = ''

            if (revision['sha1'] == ""):
                revision['sha1'] = Text.calculateHash(text.encode("utf-8"))

            if (revision['sha1'] in self.spam):
                vandalism = True

            #TODO: self.spam detection: DELETION
            text_len = len(text)

            try:
                if (revision['comment'] != '' and 'minor' in revision):
                    pass
                else:
                    if (revision_prev.length > PREVIOUS_LENGTH
                        ) and (text_len < CURR_LENGTH) and ((
                            (text_len - revision_prev.length) /
                            float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                        vandalism = True
                        revision_curr = revision_prev
            except:
                pass

            #if (vandalism):
            #print "---------------------------- FLAG 1"
            #print revision.getId()
            #print revision.getText()
            #print

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision['revid'])
                revision_curr.length = text_len
                revision_curr.time = revision['timestamp']

                #added
                #print "revision_curr.time ", revision_curr.time

                #datetime.datetime.utcfromtimestamp(revision['timestamp']).isoformat()

                # Some revisions don't have contributor.
                #if (revision.getContributor() != None):
                try:
                    revision_curr.contributor_id = revision['userid']
                except:
                    revision_curr.contributor_id = ""
                try:
                    revision_curr.contributor_name = revision['user']
                except:
                    revision_curr.contributor_name = ""
                #else:
                #revision_curr.contributor_id = 'Not Available'
                #revision_curr.contribur_name = 'Not Available'

                # Content within the revision.
                text_curr = text.encode('utf-8')
                text_curr = text_curr.lower()
                #revision_curr.content = text_curr

                # Perform comparison.
                vandalism = self.determineAuthorship(revision_curr,
                                                     revision_prev, text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    self.revisions.update(
                        {revision_curr.wikipedia_id: revision_curr})
                    # Update the fake revision id.
                    i = i + 1

                else:
                    # Reject the revision and remember its hash as spam.
                    revision_curr = revision_prev
                    self.spam.append(revision['sha1'])

        self.revision_prev = revision_prev
        self.revision_curr = revision_curr
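
A minimal sketch of fetching revision dicts in the shape analyseArticle
consumes (the legacy MediaWiki API JSON, where the wikitext sits under the
'*' key); the endpoint and article title below are illustrative only:

import requests

def fetch_revisions(title, api='https://en.wikipedia.org/w/api.php'):
    # Oldest-first revisions carrying the fields used above:
    # revid, timestamp, user, userid, sha1, comment and '*' (content).
    params = {
        'action': 'query',
        'prop': 'revisions',
        'titles': title,
        'rvprop': 'ids|timestamp|user|userid|sha1|comment|content',
        'rvlimit': 'max',
        'rvdir': 'newer',
        'format': 'json',
    }
    data = requests.get(api, params=params).json()
    page = next(iter(data['query']['pages'].values()))
    return page.get('revisions', [])

# e.g. analyser.analyseArticle(fetch_revisions('Coffee'))
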
Example #10
    def analyseWordsInSentences(self, unmatched_sentences_curr,
                                unmatched_sentences_prev, revision_curr,
                                possible_vandalism):

        global GLOBAL_ID

        matched_words_prev = []
        unmatched_words_prev = []

        # Split sentences into words.
        text_prev = []
        for sentence_prev in unmatched_sentences_prev:
            for word_prev in sentence_prev.words:
                if (not word_prev.matched):
                    text_prev.append(word_prev.value)
                    unmatched_words_prev.append(word_prev)

        text_curr = []
        for sentence_curr in unmatched_sentences_curr:
            splitted = Text.splitIntoWords(sentence_curr.value)
            text_curr.extend(splitted)
            sentence_curr.splitted.extend(splitted)

        # Edit consists of removing sentences, not adding new content.
        if (len(text_curr) == 0):
            return (matched_words_prev, False)

        # Spam detection.
        if (possible_vandalism):

            density = Text.computeAvgWordFreq(text_curr,
                                              revision_curr.wikipedia_id)

            if (density > WORD_DENSITY):
                return (matched_words_prev, possible_vandalism)
            else:
                possible_vandalism = False

        if (len(text_prev) == 0):
            for sentence_curr in unmatched_sentences_curr:
                for word in sentence_curr.splitted:
                    word_curr = Word()
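                    # Note: throughout this listing the contributor name is
                    # stored in both author_id and author_name.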
                    word_curr.author_id = revision_curr.contributor_name
                    word_curr.author_name = revision_curr.contributor_name
                    word_curr.revision = revision_curr.wikipedia_id
                    word_curr.value = word

                    # Record the revision timestamp on the word.
                    word_curr.time = revision_curr.time
                    word_curr.internal_id = GLOBAL_ID
                    sentence_curr.words.append(word_curr)
                    GLOBAL_ID = GLOBAL_ID + 1

            return (matched_words_prev, possible_vandalism)

        d = Differ()
        diff = list(d.compare(text_prev, text_curr))
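        # difflib.Differ yields one tagged entry per token, e.g. (a minimal
        # illustration):
        #     list(Differ().compare(['a', 'b'], ['a', 'c']))
        #     => ['  a', '- b', '+ c']
        # so word_diff[0] is the tag (' ', '-' or '+') and word_diff[2:] is
        # the token; occasional '?' hint entries are not expected to match
        # any real word below.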

        for sentence_curr in unmatched_sentences_curr:

            for word in sentence_curr.splitted:
                curr_matched = False
                pos = 0
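                # Scan the diff entries for this word; consumed entries are
                # blanked out, and pos is pushed past len(diff) to end the
                # scan early once the word has been placed.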

                while (pos < len(diff)):

                    word_diff = diff[pos]

                    if (word == word_diff[2:]):

                        if (word_diff[0] == ' '):
                            for word_prev in unmatched_words_prev:
                                if ((not word_prev.matched)
                                        and (word_prev.value == word)):
                                    word_prev.matched = True
                                    curr_matched = True
                                    sentence_curr.words.append(word_prev)
                                    matched_words_prev.append(word_prev)
                                    diff[pos] = ''
                                    pos = len(diff) + 1
                                    break

                        elif (word_diff[0] == '-'):
                            for word_prev in unmatched_words_prev:
                                if ((not word_prev.matched)
                                        and (word_prev.value == word)):
                                    word_prev.matched = True
                                    matched_words_prev.append(word_prev)
                                    diff[pos] = ''
                                    break

                        elif (word_diff[0] == '+'):
                            curr_matched = True
                            word_curr = Word()
                            word_curr.value = word
                            word_curr.author_id = revision_curr.contributor_name
                            word_curr.author_name = revision_curr.contributor_name
                            word_curr.revision = revision_curr.wikipedia_id
                            word_curr.internal_id = GLOBAL_ID
                            # Record the revision timestamp on the word.
                            word_curr.time = revision_curr.time

                            sentence_curr.words.append(word_curr)
                            GLOBAL_ID = GLOBAL_ID + 1

                            diff[pos] = ''
                            pos = len(diff) + 1

                    pos = pos + 1

                if not curr_matched:
                    # The word was not found in the diff: treat it as new.
                    word_curr = Word()
                    word_curr.value = word
                    word_curr.author_id = revision_curr.contributor_name
                    word_curr.author_name = revision_curr.contributor_name
                    word_curr.revision = revision_curr.wikipedia_id
                    # Record the revision timestamp on the word.
                    word_curr.time = revision_curr.time
                    sentence_curr.words.append(word_curr)
                    word_curr.internal_id = GLOBAL_ID
                    GLOBAL_ID = GLOBAL_ID + 1

        return (matched_words_prev, possible_vandalism)
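
A minimal sketch of the word-density spam test used above, assuming
Text.computeAvgWordFreq returns the mean occurrences per distinct token
(the real helper is not part of this listing):

from collections import Counter

def compute_avg_word_freq(tokens):
    # Mean number of occurrences per distinct token.
    counts = Counter(tokens)
    return float(sum(counts.values())) / len(counts) if counts else 0.0

# compute_avg_word_freq(['spam'] * 40 + ['ham']) -> 20.5, which would
# trip a WORD_DENSITY threshold of, say, 10.
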
Example #11
    def analyseSentencesInParagraphs(self, unmatched_paragraphs_curr,
                                     unmatched_paragraphs_prev, revision_curr):

        # Containers for unmatched and matched sentences.
        unmatched_sentences_curr = []
        unmatched_sentences_prev = []
        matched_sentences_prev = []
        total_sentences = 0

        # Iterate over the unmatched paragraphs of the current revision.
        for paragraph_curr in unmatched_paragraphs_curr:

            # Split the current paragraph into sentences.
            sentences = Text.splitIntoSentences(paragraph_curr.value)

            # Iterate over the sentences of the current paragraph
            for sentence in sentences:

                # Create the Sentence structure.
                sentence = sentence.strip()
                sentence = ' '.join(Text.splitIntoWords(sentence))
                hash_curr = Text.calculateHash(sentence)
                matched_curr = False
                total_sentences = total_sentences + 1

                # Iterate over the unmatched paragraphs from the previous revision.
                for paragraph_prev in unmatched_paragraphs_prev:
                    if (hash_curr in paragraph_prev.sentences):
                        for sentence_prev in paragraph_prev.sentences[hash_curr]:

                            if (not sentence_prev.matched):

                                matched_one = False
                                matched_all = True
                                for word_prev in sentence_prev.words:
                                    if (word_prev.matched):
                                        matched_one = True
                                    else:
                                        matched_all = False

                                if not matched_one:
                                    sentence_prev.matched = True
                                    matched_curr = True
                                    matched_sentences_prev.append(sentence_prev)

                                    # TODO: CHECK this
                                    for word_prev in sentence_prev.words:
                                        word_prev.matched = True

                                    # Add the sentence information to the paragraph.
                                    if (hash_curr in paragraph_curr.sentences):
                                        paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    else:
                                        paragraph_curr.sentences[sentence_prev.hash_value] = [sentence_prev]
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                    break
                                elif (matched_all):
                                    sentence_prev.matched = True
                                    matched_sentences_prev.append(sentence_prev)

                        if (matched_curr):
                            break

                # Iterate over the hash table of sentences from old revisions.
                if (not matched_curr) and (hash_curr in self.sentences_ht):
                    for sentence_prev in self.sentences_ht[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False

                            if not matched_one:

                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)

                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True

                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                else:
                                    paragraph_curr.sentences[sentence_prev.hash_value] = [sentence_prev]
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)

                # If the sentence did not match, add it to the container of unmatched sentences for further analysis.
                if (not matched_curr):
                    sentence_curr = Sentence()
                    sentence_curr.value = sentence
                    sentence_curr.hash_value = hash_curr

                    paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                    if (sentence_curr.hash_value in paragraph_curr.sentences):
                        paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                    else:
                        paragraph_curr.sentences[sentence_curr.hash_value] = [sentence_curr]

                    unmatched_sentences_curr.append(sentence_curr)

        # Identify the unmatched sentences in the previous paragraph revision.
        for paragraph_prev in unmatched_paragraphs_prev:
            for sentence_prev_hash in paragraph_prev.ordered_sentences:
                for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                    if (not sentence_prev.matched):
                        unmatched_sentences_prev.append(sentence_prev)
                        sentence_prev.matched = True
                        matched_sentences_prev.append(sentence_prev)

        return (unmatched_sentences_curr, unmatched_sentences_prev,
                matched_sentences_prev, total_sentences)
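
A sketch of the sentence normalisation and hashing performed above, assuming
Text.calculateHash is a plain content hash; the whitespace split here is a
simplified stand-in for Text.splitIntoWords:

import hashlib

def normalise_and_hash(sentence):
    # Collapse whitespace the way strip + splitIntoWords + join does above.
    normalised = ' '.join(sentence.strip().split())
    return hashlib.sha1(normalised.encode('utf-8')).hexdigest()

# Sentences differing only in spacing hash identically:
# normalise_and_hash('a  b') == normalise_and_hash('a b')  -> True
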
Example #12
    def analyseParagraphsInRevision(self, revision_curr, revision_prev,
                                    text_curr):

        # Containers for unmatched and matched paragraphs.
        unmatched_paragraphs_curr = []
        unmatched_paragraphs_prev = []
        matched_paragraphs_prev = []

        # Split the text of the current revision into paragraphs.
        paragraphs = Text.splitIntoParagraphs(text_curr)

        # Iterate over the paragraphs of the current version.
        for paragraph in paragraphs:

            # Build Paragraph structure and calculate hash value.
            paragraph = paragraph.strip()
            hash_curr = Text.calculateHash(paragraph)
            matched_curr = False

            # If the paragraph is in the previous revision,
            # update the authorship information and mark both paragraphs as matched (also in HT).
            if (hash_curr in revision_prev.ordered_paragraphs):

                for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                    if (not paragraph_prev.matched):
                        matched_curr = True
                        paragraph_prev.matched = True
                        matched_paragraphs_prev.append(paragraph_prev)

                        # TODO: added this (CHECK).
                        for hash_sentence_prev in paragraph_prev.sentences:
                            for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                                sentence_prev.matched = True
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True

                        # Add paragraph to current revision.
                        if (hash_curr in revision_curr.paragraphs):
                            revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        else:
                            revision_curr.paragraphs[paragraph_prev.hash_value] = [paragraph_prev]
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)

                        break

            # If the paragraph is not in the previous revision, but it is in an older revision
            # update the authorship information and mark both paragraphs as matched.
            if ((not matched_curr) and (hash_curr in self.paragraphs_ht)):
                for paragraph_prev in self.paragraphs_ht[hash_curr]:
                    if (not paragraph_prev.matched):
                        matched_curr = True
                        paragraph_prev.matched = True
                        matched_paragraphs_prev.append(paragraph_prev)

                        # TODO: added this (CHECK).
                        for hash_sentence_prev in paragraph_prev.sentences:
                            for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                                sentence_prev.matched = True
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True

                        # Add paragraph to current revision.
                        if (hash_curr in revision_curr.paragraphs):
                            revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        else:
                            revision_curr.paragraphs[paragraph_prev.hash_value] = [paragraph_prev]
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)

                        break

            # If the paragraph did not match with previous revisions,
            # add to container of unmatched paragraphs for further analysis.
            if (not matched_curr):
                paragraph_curr = Paragraph()
                paragraph_curr.hash_value = hash_curr
                paragraph_curr.value = paragraph

                revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)

                if (paragraph_curr.hash_value in revision_curr.paragraphs):
                    revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
                else:
                    revision_curr.paragraphs[paragraph_curr.hash_value] = [paragraph_curr]

                unmatched_paragraphs_curr.append(paragraph_curr)

        # Identify unmatched paragraphs in previous revision for further analysis.
        for paragraph_prev_hash in revision_prev.ordered_paragraphs:
            for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
                if (not paragraph_prev.matched):
                    unmatched_paragraphs_prev.append(paragraph_prev)

        return (unmatched_paragraphs_curr, unmatched_paragraphs_prev,
                matched_paragraphs_prev)
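
A minimal sketch of the two-level matching used above: a paragraph hash is
looked up first in the immediately previous revision, then in the global hash
table of all earlier paragraphs (the function and its names are illustrative
only):

def match_paragraph(hash_curr, revision_prev, paragraphs_ht):
    # Prefer the previous revision's paragraphs; fall back to history.
    for pool in (revision_prev.paragraphs, paragraphs_ht):
        for candidate in pool.get(hash_curr, []):
            if not candidate.matched:
                candidate.matched = True
                return candidate
    return None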