from math import ceil

def splitfile(file_name, file_size):
    # Number of chunks needed to keep each piece under MAX_FILE_SIZE.
    num_files = int(ceil(float(file_size) / float(MAX_FILE_SIZE)))
    print(file_size, MAX_FILE_SIZE, num_files)
    fsp = FileSplitter()
    fsp.parseOptions(["-i", file_name, "-n", num_files, "-s"])
    fsp.do_work()
    return num_files
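# Hypothetical call site (the path and MAX_FILE_SIZE value below are
# assumptions for illustration, not from the original):
import os

MAX_FILE_SIZE = 1024 * 1024  # assumed 1 MiB per chunk
path = "big_dump.txt"        # hypothetical input file
num_parts = splitfile(path, os.path.getsize(path))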
import os
from unittest import TestCase

from FileSplitter import FileSplitter


class TestFileSplitter(TestCase):

    def setUp(self):
        self.valid_phrases = ("public", "admin")
        self.django_po_file = "%s/django.po" % os.path.dirname(__file__)
        self.file_splitter = FileSplitter(self.django_po_file, self.valid_phrases)

    def test_that_we_can_get_the_first_line(self):
        first_line = self.file_splitter.get_line(0)
        self.assertEqual(first_line, "#: first public line\n")

    def test_that_first_line_is_a_comment(self):
        first_line = self.file_splitter.get_line(0)
        self.assertTrue(self.file_splitter.is_comment(first_line))

    def test_that_the_third_line_is_not_a_comment(self):
        third_line = self.file_splitter.get_line(2)
        self.assertFalse(self.file_splitter.is_comment(third_line))

    def test_that_comment_is_invalid(self):
        comment = "invalid comment"
        self.assertFalse(self.file_splitter.is_comment_valid(comment))

    def test_that_the_comment_is_valid(self):
        comment = "valid comment which contains the word public"
        self.assertTrue(self.file_splitter.is_comment_valid(comment))

    def test_that_we_can_pick_lines_below_a_given_comment(self):
        lines = self.file_splitter.get_lines_below_comment(0)
        self.assertEqual(len(lines), 5)

    def test_that_we_can_pick_lines_below_a_comment_that_contains_admin(self):
        pass
def trajectories_knn(self):
    start_pos, end_pos, paths = FileSplitter.points_old()
    knn_start = pysal.weights.KNN(start_pos, k=NUM_GROUPS)
    knn_end = pysal.weights.KNN(end_pos, k=NUM_GROUPS)
    start_groups = []
    end_groups = []
    for n in knn_start.neighbors:
        start_group = []
        for i in knn_start.neighbors[n]:
            start_group.append(start_pos[i])
        start_groups.append(start_group)
    for n in knn_end.neighbors:
        end_group = []
        for i in knn_end.neighbors[n]:
            end_group.append(end_pos[i])
        end_groups.append(end_group)
    bboxs_start = []
    bboxs_end = []
    for g in start_groups:
        c = Chain(g)
        bboxs_start.append(c.bounding_box)
    for g in end_groups:
        c = Chain(g)
        bboxs_end.append(c.bounding_box)
    self.plot_on_bokeh(start_pos, end_pos, bboxs_start, bboxs_end)
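# The bounding boxes above come from pysal's Chain geometry. A minimal sketch
# of that API (assuming classic pysal, where bounding_box is a Rectangle with
# left/lower/right/upper attributes; the points are illustrative only):
from pysal.cg import Chain

group = [(0.0, 0.0), (1.0, 2.0), (3.0, 1.0)]
bbox = Chain(group).bounding_box
print(bbox.left, bbox.lower, bbox.right, bbox.upper)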
def topic_model(url):
    page_id_and_access_token = get_access_token_page_id(url)
    page_id = page_id_and_access_token[0]
    create_directory_for_topic_model(page_id)
    FileSplitter(page_id).split()
    lda_train = runR(page_id)
    lda_train.convert_to_matrix()
    lda_train.train_lda()
def neighbors_plot(self):
    import gc
    from numpy import histogram
    import numpy as np
    from sklearn.neighbors import radius_neighbors_graph

    start_pos, end_pos, paths = FileSplitter.points()
    del start_pos, end_pos
    gc.collect()
    neighbors = radius_neighbors_graph(paths, radius=0.005)
    del paths
    gc.collect()
    neighbors = neighbors.toarray()
    x = np.matrix(neighbors)
    x = x.sum(axis=1)
    counts = [d[0, 0] for d in x]
    hist, edges = histogram(counts, bins=10, density=False)
    self.plot_on_bokeh_hist('neighbors_hist.html', '# of Neighbors',
                            '# of Occurrences', 'Neighbors Within Radius',
                            hist, edges)
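# Densifying the graph with toarray() above is memory-heavy for large path
# sets (hence the explicit gc.collect() calls). A sketch, not part of the
# original, of counting neighbors directly on the sparse matrix that
# radius_neighbors_graph returns:
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

def neighbor_histogram(paths, radius=0.005, bins=10):
    graph = radius_neighbors_graph(paths, radius=radius)  # sparse connectivity matrix
    # Row sums count each point's neighbors without building a dense copy.
    counts = np.asarray(graph.sum(axis=1)).ravel()
    return np.histogram(counts, bins=bins, density=False)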
def trajectories_hdbscan(self, min_cluster_size):
    def centroids(paths):
        # distances = euclidean_distances(paths)
        # distances = cdist(paths, paths, 'euclidean')
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
        cluster_labels = clusterer.fit_predict(paths)
        # Label -1 marks noise points, so it is excluded from the cluster count.
        num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
        unique_labels = set(cluster_labels)
        clusters = [[] for n in range(num_clusters)]
        logging.info('Number of clusters: %s', num_clusters)
        for i, v in enumerate(paths):
            if cluster_labels[i] != -1:
                clusters[cluster_labels[i]].append(v)
        return clusters

    start_pos, end_pos, paths = FileSplitter.points()
    clusters = centroids(paths)  # Array of [start_lat, start_lon, end_lat, end_lon]
    gc = self.createGeometry(clusters)
    self.createJsonFile(gc)
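# The inner helper is named centroids but returns the member points of each
# cluster. A hedged sketch (an assumption about intent, not the original code)
# of reducing those clusters to actual centroids with numpy:
import numpy as np

def cluster_centroids(clusters):
    # Each cluster is a list of [start_lat, start_lon, end_lat, end_lon] rows;
    # the column-wise mean gives one representative trajectory per cluster.
    return [np.asarray(cluster).mean(axis=0) for cluster in clusters]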
import sys

from FileSplitter import FileSplitter

if len(sys.argv) < 2:
    print("django.po file is missing")
    sys.exit(1)

django_po_file = sys.argv[1]
valid_phrases = ("admin", "noneadmin")
ACCEPTED_PAGES = ("ureport/home.html", "ureport_layout.html",
                  "ureport/partials/viz/", "ureport/partials/tag_cloud/",
                  "ureport/about.html", "ureport/how_to_join.html",
                  "ureport/national_pulse.html", "ureport/poll_summary.html")

splitter = FileSplitter(django_po_file, ACCEPTED_PAGES)
splitter.split()
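# Hypothetical invocation (the script name and catalogue path are assumptions
# for illustration only):
#   python split_po.py locale/en/LC_MESSAGES/django.po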
training = ConfigMap("Training") eps = float(training['eps']) grpsize = int(training['grpsize']) MAX_LINES = int(training['size']) sourcedir = training['sourcedir'] sourceregex = training['sourceregex'] secret = ConfigMap("Secrets") matric = 'euclidean' NUM_GROUPS = 70 API_KEY = secret['google_maps_api_key'] logging = Logging("trajectory") system = ConfigMap("System") cores = int(system['cores']) filesplitter = FileSplitter() class Trajectory: def __init__(self): logging.info("start") def get_tree(self, pts): tree = pysal.cg.kdtree.KDTree(pts, leafsize=10, distance_metric='Euclidean', radius=6371.0) return tree def plot_on_bokeh(self, starts, ends, bboxes_start, bboxes_end): from bokeh.io import output_file, show