def test_base_36_conversion(self):
    self.assertEqual(helpers.convert_numeric_id_to_id36(295), '87')
    self.assertEqual(helpers.convert_id36_to_numeric_id('87'), 295)
    self.assertEqual(helpers.convert_numeric_id_to_id36(275492), '5wkk')
    self.assertEqual(helpers.convert_id36_to_numeric_id('5wkk'), 275492)
    self.assertRaises(TypeError, helpers.convert_numeric_id_to_id36)
    self.assertRaises(ValueError, helpers.convert_numeric_id_to_id36, '1')
    self.assertRaises(ValueError, helpers.convert_numeric_id_to_id36, -1)
    self.assertRaises(TypeError, helpers.convert_id36_to_numeric_id)
    self.assertRaises(ValueError, helpers.convert_id36_to_numeric_id, 't3_87')
    self.assertRaises(ValueError, helpers.convert_id36_to_numeric_id, 87)
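
# For reference, a minimal sketch of the two helpers the test above exercises.
# This is NOT the project's actual implementation in its `helpers` module; it
# only illustrates the base-36 encoding reddit uses for ids (295 <-> '87',
# 275492 <-> '5wkk') and the error cases the test checks.
def convert_numeric_id_to_id36(numeric_id):
    """Convert a non-negative integer to a lowercase base-36 string."""
    if not isinstance(numeric_id, int) or numeric_id < 0:
        raise ValueError('must supply a non-negative integer')
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    id36 = ''
    while True:
        numeric_id, remainder = divmod(numeric_id, 36)
        id36 = digits[remainder] + id36
        if numeric_id == 0:
            return id36


def convert_id36_to_numeric_id(id36):
    """Convert a base-36 string such as '5wkk' back to an integer."""
    if not isinstance(id36, str) or not id36.isalnum():
        raise ValueError('must supply a base-36 string')
    return int(id36, 36)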
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd

# Assumed import path: the base-36 helpers live in the `helpers` module
# exercised by the test above.
from helpers import convert_id36_to_numeric_id

df.dropna(inplace=True)
df["created_utc"] = df.created.apply(
    lambda x: datetime.utcfromtimestamp(x))
df["crawled_utc"] = df.crawled.apply(
    lambda x: datetime.utcfromtimestamp(x))

print "crawl started:", df.crawled_utc.min()
print "crawl ended:", df.crawled_utc.max()
print "crawl lasted for:", df.crawled_utc.max() - df.crawled_utc.min()
print "comment ids collected:", len(df)
print "avg comment rate: %2.4f comments per second" % (
    len(df) / (df.crawled_utc.max() - df.crawled_utc.min()).total_seconds())
# plt.show()

# Convert the base-36 part of each fullname (e.g. "t1_5wkk") to an integer id.
# print df.id.apply(lambda x: convert_id36_to_numeric_id(str(x[3:])))
df["num_ids"] = df.id.apply(
    lambda x: convert_id36_to_numeric_id(str(x[3:])))
num_ids = df["num_ids"]
# print df.describe()
df.set_index("num_ids", inplace=True)

# Lag (in seconds) between a comment being created and being crawled.
df["crawl_lag"] = df.crawled - df.created
ax = df.crawl_lag.plot(kind="area", zorder=300, alpha=.8, lw=0)
# plt.savefig("crawl lag in seconds.pdf")

# Numeric ids inside the crawled range that were never collected.
# print "missing ids:", num_ids.max() - num_ids.min() + 1 - len(num_ids), "/", len(num_ids)
missing_ids = pd.Series(
    sorted(set(range(num_ids.min(), num_ids.max() + 1)) - set(num_ids))
)  # .apply(lambda x: u"t1_" + convert_numeric_id_to_id36(int(x)))
# print df.index.min()
for i in missing_ids:
    plt.axvline(i, color='k', alpha=.2, zorder=0)
# pd.DataFrame(data=zip(missing_ids.values, np.ones_like(missing_ids.values))).plot(ls="^")
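
# A hedged follow-up sketch: turn the missing numeric ids back into "t1_"
# fullnames (as the commented-out .apply() above hints at) and report the size
# of the gap. Assumes `num_ids` and `missing_ids` from the script above are in
# scope, and that convert_numeric_id_to_id36 sits next to
# convert_id36_to_numeric_id in the same helpers module.
from helpers import convert_numeric_id_to_id36  # assumed import path

expected = num_ids.max() - num_ids.min() + 1
print "missing ids: %d / %d (%.2f%%)" % (
    len(missing_ids), expected, 100.0 * len(missing_ids) / expected)
missing_fullnames = missing_ids.apply(
    lambda x: "t1_" + convert_numeric_id_to_id36(int(x)))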