def train(db_path, out_path, **kwargs):
    """Build a corpus at *out_path* from the SQLite database at *db_path*.

    Registers the one-argument SQL function ``LC`` (backed by ``linecount``)
    on the connection, sets ``kwargs['gh']`` from ``dbutil.is_github`` and
    forwards everything to ``create_corpus``.

    Args:
        db_path: Path of the SQLite database to read.
        out_path: Output location, passed through to ``create_corpus``.
        **kwargs: Extra options forwarded to ``create_corpus``. Any
            caller-supplied ``gh`` entry is overwritten by the auto-detected
            value.

    Raises:
        SystemExit: If ``create_corpus`` returns a truthy value; that value
            is used as the exit status.
    """
    db = sqlite3.connect(db_path)
    try:
        db.create_function("LC", 1, linecount)

        # auto-detect whether it's a GitHub repo
        kwargs['gh'] = dbutil.is_github(db)

        ret = create_corpus(db, out_path, **kwargs)
    finally:
        # Release the connection deterministically, including when
        # create_corpus raises, rather than relying on garbage collection.
        db.close()

    if ret:
        sys.exit(ret)
def explore(db_path, graph=False):
    """Print summary statistics for the database at *db_path*.

    If ``dbutil.is_github`` reports a GitHub database, the work is delegated
    to ``explore_gh`` and nothing else is done here. Otherwise the stats are
    computed by ``stats_worker`` in a worker pool and printed as an aligned
    ``key: value`` table.

    Args:
        db_path: Path of the SQLite database to inspect.
        graph: When true, make sure ``img_dir`` exists and also run
            ``graph_ocl_lc`` in the worker pool.
    """
    locale.setlocale(locale.LC_ALL, 'en_GB.utf-8')

    # The connection is only needed for the GitHub check, so close it on
    # every path (previously it was left open for non-GitHub databases).
    db = sqlite3.connect(db_path)
    try:
        is_gh = dbutil.is_github(db)
    finally:
        db.close()

    if is_gh:
        explore_gh(db_path)
        return

    if graph:
        # exist_ok avoids the race between checking for and creating the dir.
        os.makedirs(img_dir, exist_ok=True)

    # Worker process pool
    # NOTE(review): Pool is presumably multiprocessing.Pool - confirm import.
    pool = Pool(processes=4)
    try:
        jobs = []
        if graph:
            jobs.append(pool.apply_async(graph_ocl_lc, (db_path,)))
        # TODO: If GH dataset:
        #     jobs.append(pool.apply_async(graph_ocl_stars, (db_path,)))
        future_stats = pool.apply_async(stats_worker, (db_path,))

        # Wait for jobs to finish
        for job in jobs:
            job.wait()

        # Print stats
        print()
        stats = future_stats.get()
    finally:
        # Shut the pool down so worker processes do not outlive this call.
        pool.close()
        pool.join()

    # Each stat is unpacked as a (key, value) pair; keys are padded to the
    # longest key. default=0 keeps an empty stats list from raising.
    maxlen = max((len(x[0]) for x in stats), default=0)
    for k, v in stats:
        if k:
            print(k, ':', ' ' * (maxlen - len(k) + 2), v, sep='')
        elif v == '':
            print(k)
        else:
            print()
def test_is_github(self):
    """Check ``dbutil.is_github`` against the two empty fixture databases.

    The 'empty' fixture must not be reported as a GitHub database, while the
    'empty-gh' fixture must be.
    """
    plain_db = tests.db('empty')
    plain_result = dbutil.is_github(plain_db)
    self.assertFalse(plain_result)

    github_db = tests.db('empty-gh')
    github_result = dbutil.is_github(github_db)
    self.assertTrue(github_result)