def analyseArticle(file_name):
    """Analyse the revision history of an article dump; returns (revisions, revision_order, relations)."""
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = dump.Iterator(file_name)

    # Iterate over the pages.
    for page in dumpIterator.readPages():
        i = 0

        # Iterate over the revisions of the article.
        for revision in page.readRevisions():
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if revision.getSha1() is None:
                revision.setSha1(Text.calculateHash(revision.getText().encode("utf-8")))

            if revision.getSha1() in spam:
                vandalism = True

            # TODO: SPAM detection: DELETION
            if revision.getComment() is not None and revision.getComment().find(FLAG) > 0:
                pass
            else:
                if ((revision_prev.length > PREVIOUS_LENGTH) and
                        (len(revision.getText()) < CURR_LENGTH) and
                        (((len(revision.getText()) - revision_prev.length) /
                          float(revision_prev.length)) <= CHANGE_PERCENTAGE)):
                    vandalism = True
                    revision_curr = revision_prev

            if not vandalism:
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.getId())
                revision_curr.length = len(revision.getText())
                revision_curr.timestamp = revision.getTimestamp()
                revision_curr.comment = revision.getComment()

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.getId())
                relation.length = len(revision.getText())

                # Some revisions don't have a contributor.
                if revision.getContributor() is not None:
                    revision_curr.contributor_id = revision.getContributor().getId()
                    revision_curr.contributor_name = revision.getContributor().getUsername().encode('utf-8')
                    relation.author = revision.getContributor().getUsername().encode('utf-8')
                else:
                    revision_curr.contributor_id = 'Not Available ' + str(revision.getId())
                    revision_curr.contributor_name = 'Not Available ' + str(revision.getId())
                    relation.author = 'Not Available ' + str(revision.getId())

                # Content within the revision.
                text_curr = revision.getText().encode('utf-8')
                text_curr = text_curr.lower()
                revision_curr.content = text_curr

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

                if not vandalism:
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))

                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    # Mark the revision as spam and roll back to the previous state.
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.getSha1())

    return (revisions, revision_order, relations)
def analyseArticle(file_name):
    """Analyse the revision history of an article dump; returns (revisions, revision_order, relations)."""
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over the revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if revision.sha1 is None:
                revision.sha1 = Text.calculateHash(revision.text)

            if revision.sha1 in spam:
                vandalism = True

            # TODO: SPAM detection: DELETION
            if revision.comment is not None and revision.comment.find(FLAG) > 0:
                pass
            else:
                if ((revision_prev.length > PREVIOUS_LENGTH) and
                        (len(revision.text) < CURR_LENGTH) and
                        (((len(revision.text) - revision_prev.length) /
                          float(revision_prev.length)) <= CHANGE_PERCENTAGE)):
                    vandalism = True
                    revision_curr = revision_prev

            if not vandalism:
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have a contributor.
                if revision.contributor is not None:
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + str(revision.id)
                    revision_curr.contributor_name = 'Not Available ' + str(revision.id)
                    relation.author = 'Not Available ' + str(revision.id)

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

                if not vandalism:
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))

                    # Update the fake revision id.
                    i = i + 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total
                else:
                    # Mark the revision as spam and roll back to the previous state.
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)
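# Minimal driver sketch (an illustrative assumption, not part of the original
# module): it presumes the module-level containers (revisions, revision_order,
# spam) and the thresholds (PREVIOUS_LENGTH, CURR_LENGTH, CHANGE_PERCENTAGE,
# FLAG) are initialised elsewhere in the module, and "article.xml" is a
# hypothetical dump file name.
if __name__ == '__main__':
    revisions, revision_order, relations = analyseArticle("article.xml")
    # Report the author and token count of every revision not flagged as vandalism.
    for wikipedia_id, is_vandalism in revision_order:
        if not is_vandalism:
            relation = relations[wikipedia_id]
            print("revision %d by %s: %d tokens" %
                  (wikipedia_id, relation.author, relation.total_tokens))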