def analyseArticle(file_name):
    # Container of revisions.
    revisions = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.getComment() != None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contributor_name = 'Not Available'

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    # Update the fake revision id.
                    i = i + 1
                else:
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return revisions
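
# A minimal usage sketch (illustrative, not part of the original module):
# "article.xml" is a hypothetical dump file name, and the loop relies only
# on the attributes that analyseArticle sets on each Revision above.
def _exampleUsage():
    revs = analyseArticle("article.xml")
    for wikipedia_id in sorted(revs.keys()):
        rev = revs[wikipedia_id]
        print rev.wikipedia_id, rev.contributor_name, rev.length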
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr):
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    # Split the text of the current revision into paragraphs.
    paragraphs = Text.splitIntoParagraphs(text_curr)

    # Iterate over the paragraphs of the current version.
    for paragraph in paragraphs:
        # Build Paragraph structure and calculate hash value.
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # If the paragraph is in the previous revision,
        # update the authorship information and mark both paragraphs as matched (also in HT).
        if (hash_curr in revision_prev.ordered_paragraphs):
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph is not in the previous revision, but it is in an
        # older revision, update the authorship information and mark both
        # paragraphs as matched.
        if ((not matched_curr) and (hash_curr in paragraphs_ht)):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph did not match with previous revisions,
        # add to container of unmatched paragraphs for further analysis.
        if (not matched_curr):
            paragraph_curr = Paragraph()
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph

            revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)
            if (paragraph_curr.hash_value in revision_curr.paragraphs.keys()):
                revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
            else:
                revision_curr.paragraphs.update({paragraph_curr.hash_value: [paragraph_curr]})
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Identify unmatched paragraphs in previous revision for further analysis.
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if (not paragraph_prev.matched):
                unmatched_paragraphs_prev.append(paragraph_prev)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
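
# The paragraph matching above compares only hash values, so hash equality is
# treated as equality of the (lower-cased, stripped) text. A sketch of a
# calculateHash-style helper, assuming an MD5 digest; the real Text helper is
# defined elsewhere and may use a different digest.
import hashlib

def _calculateHashSketch(text):
    # Identical strings hash identically on purpose: that collision is what
    # lets a paragraph or sentence "match" across revisions.
    return hashlib.md5(text).hexdigest()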
def analyseSentencesInParagraphs(unmatched_paragraphs_curr, unmatched_paragraphs_prev, revision_curr):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)

        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False

                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)

                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True

                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False

                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)

                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True

                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include in the container of
            # unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr

                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev, matched_sentences_prev, total_sentences)
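
# The matched_one/matched_all flags above are a hand-rolled any()/all() over
# the words of a candidate sentence: the sentence is claimed only when none
# of its words were matched yet, and it is retired when all of them already
# were. An equivalent, more idiomatic formulation (sketch only; the helper
# name is ours):
def _sentenceMatchState(sentence_prev):
    matched_one = any(word.matched for word in sentence_prev.words)
    matched_all = all(word.matched for word in sentence_prev.words)
    return (matched_one, matched_all)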
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.getComment() != None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername().encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.getId()
                    revision_curr.contributor_name = 'Not Available ' + revision.getId()
                    relation.author = 'Not Available ' + revision.getId()

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return (revisions, revision_order, relations)
def analyseSentencesInParagraphs(unmatched_paragraphs_curr, unmatched_paragraphs_prev, revision_curr, revision_prev, relation):
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0

    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = Text.splitIntoSentences(paragraph_curr.value)

        # Iterate over the sentences of the current paragraph.
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            sentence = ' '.join(Text.splitIntoWords(sentence))
            hash_curr = Text.calculateHash(sentence)
            matched_curr = False
            total_sentences = total_sentences + 1

            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                if (hash_curr in paragraph_prev.sentences.keys()):
                    for sentence_prev in paragraph_prev.sentences[hash_curr]:
                        if (not sentence_prev.matched):
                            matched_one = False
                            matched_all = True
                            for word_prev in sentence_prev.words:
                                if (word_prev.matched):
                                    matched_one = True
                                else:
                                    matched_all = False

                            if (not matched_one):
                                sentence_prev.matched = True
                                matched_curr = True
                                matched_sentences_prev.append(sentence_prev)

                                # TODO: CHECK this
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                                    word_prev.used.append(revision_curr.wikipedia_id)

                                # Add the sentence information to the paragraph.
                                if (hash_curr in paragraph_curr.sentences.keys()):
                                    paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                else:
                                    paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                    paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                                break
                            elif (matched_all):
                                sentence_prev.matched = True
                                matched_sentences_prev.append(sentence_prev)
                if (matched_curr):
                    break

            # Iterate over the hash table of sentences from old revisions.
            if ((not matched_curr) and (hash_curr in sentences_ht.keys())):
                for sentence_prev in sentences_ht[hash_curr]:
                    if (not sentence_prev.matched):
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if (word_prev.matched):
                                matched_one = True
                            else:
                                matched_all = False

                        if (not matched_one):
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)

                            # TODO: CHECK this
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)

                                # Revert: reintroducing something that somebody
                                # else deleted (and that was not used in the
                                # previous revision).
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    for elem in word_prev.deleted:
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name != revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem: relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem: 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem: relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem: 1})

                                # Reintroduction: credit the revision that
                                # originally authored the word.
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    if (word_prev.revision in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision: relation.reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.reintroduced.update({word_prev.revision: 1})
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision: relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision: 1})

                            # Add the sentence information to the paragraph.
                            if (hash_curr in paragraph_curr.sentences.keys()):
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                                paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif (matched_all):
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)

            # If the sentence did not match, then include in the container of
            # unmatched sentences for further analysis.
            if (not matched_curr):
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr

                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                if (sentence_curr.hash_value in paragraph_curr.sentences.keys()):
                    paragraph_curr.sentences[sentence_curr.hash_value].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                unmatched_sentences_curr.append(sentence_curr)

    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            for sentence_prev in paragraph_prev.sentences[sentence_prev_hash]:
                if (not sentence_prev.matched):
                    unmatched_sentences_prev.append(sentence_prev)
                    sentence_prev.matched = True
                    matched_sentences_prev.append(sentence_prev)

    return (unmatched_sentences_curr, unmatched_sentences_prev, matched_sentences_prev, total_sentences)
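
# The relation counters above (revert, self_revert, reintroduced,
# self_reintroduced) all repeat the same "increment or initialise" pattern.
# A sketch of the equivalent one-liner using dict.get, which behaves
# identically (the helper name is ours):
def _bump(counter, key):
    counter[key] = counter.get(key, 0) + 1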
def analyseParagraphsInRevision(revision_curr, revision_prev, text_curr, relation):
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []

    # Split the text of the current revision into paragraphs.
    paragraphs = Text.splitIntoParagraphs(text_curr)

    # Iterate over the paragraphs of the current version.
    for paragraph in paragraphs:
        # Build Paragraph structure and calculate hash value.
        paragraph = paragraph.strip()
        hash_curr = Text.calculateHash(paragraph)
        matched_curr = False

        # If the paragraph is in the previous revision,
        # update the authorship information and mark both paragraphs as matched (also in HT).
        if (hash_curr in revision_prev.ordered_paragraphs):
            for paragraph_prev in revision_prev.paragraphs[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph is not in the previous revision, but it is in an
        # older revision, update the authorship information and mark both
        # paragraphs as matched.
        if ((not matched_curr) and (hash_curr in paragraphs_ht)):
            for paragraph_prev in paragraphs_ht[hash_curr]:
                if (not paragraph_prev.matched):
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)

                    # TODO: added this (CHECK).
                    for hash_sentence_prev in paragraph_prev.sentences.keys():
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                                word_prev.used.append(revision_curr.wikipedia_id)
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    word_prev.freq.append(revision_curr.wikipedia_id)

                                # Revert: reintroducing something that somebody
                                # else deleted (and that was not used in the
                                # previous revision).
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    for elem in word_prev.deleted:
                                        if (elem in revisions.keys()):
                                            if (revisions[elem].contributor_name != revision_curr.contributor_name):
                                                if (elem in relation.revert.keys()):
                                                    relation.revert.update({elem: relation.revert[elem] + 1})
                                                else:
                                                    relation.revert.update({elem: 1})
                                            else:
                                                if (elem in relation.self_revert.keys()):
                                                    relation.self_revert.update({elem: relation.self_revert[elem] + 1})
                                                else:
                                                    relation.self_revert.update({elem: 1})

                                # Reintroduction: credit the revision that
                                # originally authored the word.
                                if (revision_prev.wikipedia_id not in word_prev.used):
                                    if (word_prev.revision in revisions.keys()):
                                        if (revisions[word_prev.revision].contributor_name != revision_curr.contributor_name):
                                            if (word_prev.revision in relation.reintroduced.keys()):
                                                relation.reintroduced.update({word_prev.revision: relation.reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.reintroduced.update({word_prev.revision: 1})
                                        else:
                                            if (word_prev.revision in relation.self_reintroduced.keys()):
                                                relation.self_reintroduced.update({word_prev.revision: relation.self_reintroduced[word_prev.revision] + 1})
                                            else:
                                                relation.self_reintroduced.update({word_prev.revision: 1})

                    # Add paragraph to current revision.
                    if (hash_curr in revision_curr.paragraphs.keys()):
                        revision_curr.paragraphs[paragraph_prev.hash_value].append(paragraph_prev)
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    else:
                        revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break

        # If the paragraph did not match with previous revisions,
        # add to container of unmatched paragraphs for further analysis.
        if (not matched_curr):
            paragraph_curr = Paragraph()
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph

            revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)
            if (paragraph_curr.hash_value in revision_curr.paragraphs.keys()):
                revision_curr.paragraphs[paragraph_curr.hash_value].append(paragraph_curr)
            else:
                revision_curr.paragraphs.update({paragraph_curr.hash_value: [paragraph_curr]})
            unmatched_paragraphs_curr.append(paragraph_curr)

    # Identify unmatched paragraphs in previous revision for further analysis.
    for paragraph_prev_hash in revision_prev.ordered_paragraphs:
        for paragraph_prev in revision_prev.paragraphs[paragraph_prev_hash]:
            if (not paragraph_prev.matched):
                unmatched_paragraphs_prev.append(paragraph_prev)

    return (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
def analyseArticle(file_name):
    # Container of revisions.
    revisions = {}
    revision_order = []

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.getSha1() == None):
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))

            if (revision.getSha1() in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.getComment() != None and revision.getComment().find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.getText()) < CURR_LENGTH) and \
                   (((len(revision.getText()) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    print "VANDALISM: CHANGE PERCENTAGE"
                    vandalism = True

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())

                # Some revisions don't have contributor.
                if (revision.getContributor() != None):
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername()
                else:
                    revision_curr.contributor_id = 'Not Available'
                    revision_curr.contributor_name = 'Not Available'

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    # Update the fake revision id.
                    i = i + 1
                    # Update the index of processed revisions.
                    revision_order.append((revision_curr.wikipedia_id, False))
                else:
                    spam.append(revision.getSha1())
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
            else:
                spam.append(revision.getSha1())
                revision_order.append((revision_curr.wikipedia_id, True))
                revision_curr = revision_prev

    return (revisions, revision_order)
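
# The deletion heuristic shared by every analyseArticle variant, isolated as
# a predicate (sketch; PREVIOUS_LENGTH, CURR_LENGTH and CHANGE_PERCENTAGE are
# the module-level thresholds used above): a revision is treated as suspect
# vandalism when a long previous revision shrinks to a short current one by a
# large enough relative margin.
def _looksLikeMassDeletion(prev_length, curr_length):
    return (prev_length > PREVIOUS_LENGTH) and \
           (curr_length < CURR_LENGTH) and \
           ((curr_length - prev_length) / float(prev_length) <= CHANGE_PERCENTAGE)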
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)

            if (revision.sha1 in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.comment != None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.text) < CURR_LENGTH) and \
                   (((len(revision.text) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.id
                    revision_curr.contributor_name = 'Not Available ' + revision.id
                    relation.author = 'Not Available ' + revision.id

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)
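
# The total_tokens bookkeeping above walks revision -> paragraphs ->
# sentences -> words. A sketch of that traversal as a standalone helper,
# assuming only the container attributes used in the loop above (the helper
# name is ours):
def _countTokens(revision):
    total = 0
    for p in revision.ordered_paragraphs:
        for paragraph in revision.paragraphs[p]:
            for hash_sentence in paragraph.sentences.keys():
                for sentence in paragraph.sentences[hash_sentence]:
                    total = total + len(sentence.words)
    return total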