def write_articles(self):
    """Write every article of every section to its own ``.htm`` page.

    For each entry of ``self.contents`` that carries an ``'articles'``
    mapping, ``create_dir`` is called with the entry's key, the process
    changes into that directory and each article is handed to
    ``self.write_page`` as ``<title>.htm`` (article text plus a trailing
    newline).

    The working directory that was current before a section was entered is
    restored afterwards, including when writing a page raises.
    """
    for section, info in self.contents.iteritems():
        if 'articles' not in info:
            continue
        create_dir(section)
        cwd = os.getcwdu()
        os.chdir(section)
        try:
            # Only the article records are used; the URL keys are ignored.
            for _url, article in info['articles'].iteritems():
                words = article['article'] + u'\n'
                self.write_page(words, article['title'] + ".htm")
        finally:
            # Always go back, so a failed page does not leave the process
            # stranded inside the section directory.
            os.chdir(cwd)
def write(self):
    """Write the complete issue from inside the ``self.date`` directory.

    Calls ``create_dir(self.date)``, changes into that directory and runs
    the individual writers in a fixed order: ``write_cover``,
    ``write_topnews_s``, ``write_contents``, ``write_topnews`` and
    ``write_articles``.

    The previous working directory is restored when the writers finish,
    including when one of them raises.
    """
    create_dir(self.date)
    cwd = os.getcwdu()
    os.chdir(self.date)
    try:
        self.write_cover()
        self.write_topnews_s()
        self.write_contents()
        self.write_topnews()
        self.write_articles()
    finally:
        # Restore the caller's directory even if a writer failed part-way.
        os.chdir(cwd)
def setup_epub_root(self):
    """Populate ``self.epub_root`` with the static EPUB template files.

    Template files are taken from ``$PY_ROOT/epub_templates`` (``PY_ROOT``
    is read from the environment). The header logo, mimetype file, style
    sheets and title page are copied into the root, the ``META-INF`` tree
    is copied only when it is not already present, and the cover image is
    fetched from ``self.book.cover.url`` into the root.

    The ``path`` of both the master head image and the cover is set to the
    respective ``name``.
    """
    root = self.epub_root
    create_dir(root)
    templates = os.sep.join((os.environ['PY_ROOT'], "epub_templates"))

    def template(name):
        # Path of a template file, built by plain separator concatenation.
        return templates + os.sep + name

    copy(template("headerLogo.png"), root)
    head_image = self.book.master_head_image
    head_image.path = head_image.name

    for static_name in ("mimetype", "page_styles.css",
                        "stylesheet.css", "titlepage.xhtml"):
        copy(template(static_name), root)

    meta_inf = root + os.sep + "META-INF"
    if not os.path.exists(meta_inf):
        copytree(template("META-INF"), meta_inf)

    cover = self.book.cover
    cover_target = root + os.sep + cover.name
    cover.path = cover.name
    download_image(cover.url, cover_target)
def edit_ratio_histogram(self):
    """Write, per revert column, a histogram of article/talk value ratios.

    The table at ``self.db_path`` is loaded, ``page_id`` and ``linked_id``
    are cast to float, ``self.drop_dups`` is applied and every row whose
    ``page_id`` occurs more than once is removed. When ``self.drop1`` is
    set, only rows with ``len > 1`` are kept.

    For each column in ``self.revert`` the namespace-0 rows (indexed by
    ``page_id``) are divided by the namespace-1 rows (indexed by
    ``linked_id``); a missing counterpart is filled with -1. Ratios are
    cast to int, negative ratios are discarded and the number of pages per
    ratio is written to ``<f_out>/<lang>_<column>.csv``.
    """
    basic.log('creating edit histogram %s' % self.lang)
    f_out = basic.create_dir('results/ratio_histograms')

    pages = pd.read_csv(self.db_path)
    pages.page_id = pages.page_id.astype(float)
    # NOTE(review): a pandas comparison against None is elementwise and
    # presumably keeps every row here -- confirm whether a notnull()
    # filter was intended.
    has_link = pages['linked_id'] != None
    pages = pages.loc[has_link]
    pages.linked_id = pages.linked_id.astype(float)
    pages = self.drop_dups(pages)

    by_page = pages.set_index('page_id', drop=False)
    basic.log('dropped %s duplicates' % len(by_page.index.get_duplicates()))
    # keep=False removes every member of a duplicated page_id group.
    pages = pages.drop_duplicates(subset='page_id', keep=False)

    if self.drop1:
        pages = pages.loc[(pages['len'] > 1)]

    for column in self.revert:
        basic.log('%s %s' % (self.lang, column))
        basic.log('%s pages' % len(pages))

        is_article = pages['namespace'] == 0
        is_talk = pages['namespace'] == 1
        articles = pages.loc[is_article].set_index('page_id', drop=False)
        talk = pages.loc[is_talk].set_index('linked_id', drop=False)
        basic.log('%s articles' % len(articles))
        basic.log('%s talk' % len(talk))

        # Index-aligned division; -1 stands in for a missing counterpart.
        quotient = articles[column].divide(talk[column], axis='index',
                                           fill_value=-1)
        ratio = quotient.to_frame('ratio')
        ratio['ratio'] = ratio['ratio'].astype(int)

        joined = articles.join(ratio).set_index('page_id')
        joined = joined.loc[joined['ratio'] >= 0]
        basic.log('%s ratios' % len(joined))

        counts = joined['ratio'].value_counts().to_frame()
        counts = counts.sort_index(ascending=True)
        counts.columns = ['pages']
        out_path = '%s/%s_%s.csv' % (f_out, self.lang, column)
        counts.to_csv(out_path, encoding='utf-8', index_label='edit_ratio')
def edit_quantiles(self, q=.01, quantile_range=False, v=False, write=True):
    """Compute quantiles and the mean of each revert column per namespace.

    Args:
        q: Step between quantile levels; levels are
            ``np.arange(q, 1 + q, q)``.
        quantile_range: Not used by this method.
        v: Not used by this method.
        write: When true, each quantile table is also written to
            ``<f_out>/<lang>_<namespace>_<column>.csv`` with two extra rows,
            ``mean_quantile`` and ``mean_value``.

    Returns:
        Nested mapping ``results[namespace][column]`` holding
        ``{'quantiles': DataFrame, 'mean': value}``.
    """
    basic.log('creating edit quantiles %s' % self.lang)
    f_out = basic.create_dir('results/quantiles')

    df = pd.read_csv(self.db_path)
    df = self.drop_dups(df)
    df.page_id = df.page_id.astype(int)
    if self.drop1:
        df = df.loc[(df['len'] > 1)]

    levels = np.arange(q, 1 + q, q)
    results = defaultdict(dict)

    for n in self.namespace:
        results[n] = defaultdict(dict)
        for r in self.revert:
            basic.log('%s %s %s' % (self.lang, n, r))

            # 'at' covers all rows; any other name selects the rows whose
            # namespace number equals its position in self.namespace.
            if n == 'at':
                values = df[r]
            else:
                values = df.loc[(df['namespace'] == self.namespace.index(n)), r]

            quantiles = values.quantile(q=levels).to_frame()
            mean = values.mean()
            column = '%s_%s_%s' % (self.lang, n, r)
            quantiles.columns = [column]
            results[n][r] = {'quantiles': quantiles, 'mean': mean}

            if not write:
                continue

            # Last quantile level whose value is still below int(mean + 1).
            below_mean = quantiles.loc[(quantiles[column] < int(mean + 1))]
            mean_level = below_mean.tail(1).index.values
            # append() returns new frames, so the table stored in results
            # stays free of the two summary rows.
            table = quantiles.append(
                DataFrame({column: mean_level}, index=['mean_quantile']))
            table = table.append(
                DataFrame({column: mean}, index=['mean_value']))
            # NOTE(review): 'qauntiles' looks like a typo for 'quantiles';
            # kept because it is the label emitted into the CSV.
            table.to_csv('%s/%s_%s_%s.csv' % (f_out, self.lang, n, r),
                         encoding='utf-8', index_label='qauntiles')
    return results
def edit_histogram(self, plot=True, v=False):
    """Write a value-count histogram for each namespace / revert column.

    The table at ``self.db_path`` is loaded and passed through
    ``self.drop_dups``; when ``self.drop1`` is set only rows with
    ``len > 1`` are kept. For every namespace name and every column in
    ``self.revert`` the occurrences of each distinct value are counted,
    sorted by value and written to
    ``<f_out>/<lang>_<namespace>_<column>.csv``.

    Args:
        plot: Not used by this method.
        v: Not used by this method.
    """
    basic.log('creating edit histogram %s' % self.lang)
    f_out = basic.create_dir('results/histograms')

    df = pd.read_csv(self.db_path)
    df = self.drop_dups(df)
    if self.drop1:
        df = df.loc[(df['len'] > 1)]

    for n in self.namespace:
        for r in self.revert:
            basic.log('%s %s %s' % (self.lang, n, r))

            # 'at' covers all rows; any other name selects the rows whose
            # namespace number equals its position in self.namespace.
            if n == 'at':
                values = df[r]
            else:
                values = df.loc[(df['namespace'] == self.namespace.index(n)), r]

            # value_counts() yields a Series, on which assigning .columns
            # has no effect on the written file. Converting to a frame
            # first makes the 'articles' label reach the CSV, as
            # edit_ratio_histogram does for its 'pages' column.
            result = values.value_counts().to_frame()
            result = result.sort_index(ascending=True)
            result.columns = ['articles']
            result.to_csv('%s/%s_%s_%s.csv' % (f_out, self.lang, n, r),
                          encoding='utf-8', index_label='edits')
def edit_statistics(self, statistics, v=False):
    """Compute summary statistics per namespace / revert column and save them.

    One header row and one data row are written to
    ``<f_out>/edits_<lang>.csv`` (``edits_drop1_<lang>.csv`` when
    ``self.drop1`` is set). Columns are ordered namespace, then revert
    column, then statistic, and are named ``<namespace>_<statistic>_<column>``.

    Supported statistic names:
        ``total``, ``var``, ``std``, ``mean``, ``median``: the matching
            pandas aggregate of the column.
        ``total_ratio``: 'a' total divided by 't' total; only set for
            namespace ``'t'`` and requires ``total`` to be computed first.
        ``mean_ratio``: 'a' mean divided by 't' mean; only set for the last
            entry of ``self.namespace``.
        ``missing_talk``: number of rows whose ``linked_id`` equals
            ``'NONE'``; only set for the last entry of ``self.namespace``.

    A statistic that is not set for a cell is written as ``{}`` (the
    default value of the nested ``defaultdict``).

    Args:
        statistics: Iterable of statistic names, in output order.
        v: Not used by this method.

    Returns:
        Nested mapping ``result[lang][namespace][column][statistic]``.
    """
    f_out = basic.create_dir('results/basic_stats')
    if self.drop1:
        out_path = '%s/edits_drop1_%s.csv' % (f_out, self.lang)
    else:
        out_path = '%s/edits_%s.csv' % (f_out, self.lang)

    # Statistic name -> name of the pandas Series method that computes it.
    aggregates = {
        'total': 'sum',
        'var': 'var',
        'std': 'std',
        'mean': 'mean',
        'median': 'median',
    }

    result = defaultdict(dict)
    # The context manager closes the file even if reading or aggregating
    # the data raises part-way through.
    with open(out_path, 'w') as f:
        header = '"lang"'
        for n in self.namespace:
            for r in self.revert:
                for s in statistics:
                    header = header + ((',"%s_%s_%s"') % (n, s, r))
        header = header + '\n'
        f.write(header)
        f.write('"%s"' % self.lang)

        result[self.lang] = defaultdict(dict)
        df = pd.read_csv(self.db_path)
        df = self.drop_dups(df)
        if self.drop1:
            df = df.loc[(df['len'] > 1)]

        def select(n, r):
            # 'at' covers all rows; any other name selects the rows whose
            # namespace number equals its position in self.namespace.
            if n == 'at':
                return df[r]
            return df.loc[(df['namespace'] == self.namespace.index(n)), r]

        by_namespace = result[self.lang]
        for n in self.namespace:
            by_namespace[n] = defaultdict(dict)
            is_last = self.namespace.index(n) == (len(self.namespace) - 1)
            for r in self.revert:
                stats = by_namespace[n][r] = defaultdict(dict)
                basic.log('%s %s %s' % (self.lang, n, r))
                for s in statistics:
                    if s in aggregates:
                        stats[s] = getattr(select(n, r), aggregates[s])()
                    elif s == 'total_ratio':
                        if n == 't':
                            stats[s] = (float(by_namespace['a'][r]['total'])
                                        / by_namespace['t'][r]['total'])
                    elif s == 'mean_ratio':
                        if is_last:
                            stats[s] = (float(by_namespace['a'][r]['mean'])
                                        / by_namespace['t'][r]['mean'])
                    elif s == 'missing_talk':
                        if is_last:
                            stats[s] = len(df.loc[(df['linked_id'] == 'NONE')])
                    # Unset statistics fall back to the defaultdict's empty
                    # dict, keeping the row aligned with the header.
                    f.write(',%s' % stats[s])
        f.write('\n')
    return result