Exemple #1
0
 def __init__(self, page):
     '''
         Collect simple statistics about one article and log the article's
         title when no ref tag is found in its text.

         Only a single revision is examined (the one returned by
         page.find); multiple revisions are not handled yet.  Callers are
         expected not to pass redirect pages, since these statistics would
         be meaningless for them.

         page -- element exposing .tag and .find(), presumably an
                 ElementTree element whose tag has the "{namespace}name"
                 form (TODO confirm against the caller).

         Sets self.length, self.numlinks, self.numtemplates,
         self.numextlinks and self.imagecount.

         Raises ValueError when page.tag contains no '}' to split on.
     '''

     # str.split replaces the Python-2-only string.split() helper; the
     # local-name half of the tag is not needed here.
     ns, _ = page.tag.split('}', 1)

     title = page.find(ns + "}title")
     revision = page.find(ns + "}revision")
     textNode = revision.find(ns + "}text")

     # An empty text element yields None for .text; treat that as an empty
     # article so that len() and the regex counts below do not crash.
     t = textNode.text or ''

     # Parses the text and logs the article if it's missing a reftag or template
     hasRef = self.hasRefTag(t)
     hasRefSection = self.hasRefSection(t)

     self.length = len(t)
     self.numlinks = len(re.findall(r'\[\[', t))
     self.numtemplates = len(re.findall(r'\{\{', t))

     # TODO: the extlinks regex should be fixed to look for any single [
     # not preceded by another one and followed by alphanumerics.
     self.numextlinks = len(re.findall(r'\[\ ?(http|https|www)', t))
     self.imagecount = len(re.findall(r'\[\[(File|Image)\:', t))

     # NOTE(review): the explicit "== False" comparisons are kept on
     # purpose.  The return types of hasRefTag/hasRefSection are not
     # visible here, and "not x" would also match None or empty values.
     if hasRef == False:
         if hasRefSection == False:
             logPath = "/tmp/wp_missing_norefsection.txt"
         else:
             logPath = "/tmp/wp_missing_refsection.txt"
         DataLogger.l(logPath, "Missing: [[" + title.text + "]]")
Exemple #2
0
 def log(self):
     '''
         Write the collected counters and per-article averages to
         /tmp/wplog.txt through DataLogger.l, one line per statistic.

         Averages are the running totals divided by the number of entries
         in self.articlestats.  When there are no entries, each average is
         logged as 0 instead of raising ZeroDivisionError.
     '''
     logPath = "/tmp/wplog.txt"
     articleCount = len(self.articlestats)

     def average(total):
         # Guard against an empty article list; the "/" operator is kept
         # so results for non-empty lists are unchanged.
         return total / articleCount if articleCount else 0

     DataLogger.l(logPath, "Num redirects: " + str(self.numRedirects))
     DataLogger.l(logPath, "Num dabs: " + str(self.numDabs))
     DataLogger.l(logPath, "Num lists: " + str(self.numLists))
     DataLogger.l(logPath, "Num pages per ns: " + pprint.pformat(self.nscounts))

     # (label, running total) pairs, in the order they appear in the log.
     averages = (
         ("Avg length: ", self.lengthTotal),
         ("Avg images per article: ", self.numImgsTotal),
         ("Avg extlinks: ", self.numExtLinksTotal),
         ("Avg wplinks: ", self.numLinksTotal),
         ("Avg templates: ", self.numTemplatesTotal),
     )
     for label, total in averages:
         DataLogger.l(logPath, label + pprint.pformat(average(total)))
Exemple #3
0
 def log(self):
     '''
         Log the average number of templates per article to /tmp/wplog.txt
         through DataLogger.l.

         The average is self.numTemplatesTotal divided by the number of
         entries in self.articlestats; it is logged as 0 when there are no
         entries, instead of raising ZeroDivisionError.
     '''
     articleCount = len(self.articlestats)
     # Guard against an empty article list; the "/" operator is kept so
     # results for non-empty lists are unchanged.
     avgTemplates = self.numTemplatesTotal / articleCount if articleCount else 0
     DataLogger.l("/tmp/wplog.txt", "Avg templates: " + pprint.pformat(avgTemplates))