예제 #1
0
def splitfile(file_name, file_size):
    """Run ``FileSplitter`` on ``file_name`` and return the requested part count.

    The part count is ``file_size / MAX_FILE_SIZE`` rounded up with ``ceil``.
    It is handed to ``FileSplitter`` through its ``-n`` option, together with
    ``-i file_name`` and the ``-s`` flag.

    Args:
        file_name: Value passed to ``FileSplitter`` after ``-i``.
        file_size: Size of the file; presumably in the same unit as
            ``MAX_FILE_SIZE`` (verify against the caller).

    Returns:
        The part count exactly as produced by ``ceil``.
    """
    num_files = ceil(float(file_size) / float(MAX_FILE_SIZE))
    # One pre-formatted argument prints the same text whether ``print`` is the
    # Python 2 statement or the Python 3 function.
    print("%s %s %s" % (file_size, MAX_FILE_SIZE, num_files))

    fsp = FileSplitter()
    fsp.parseOptions(["-i", file_name, "-n", num_files, "-s"])
    fsp.do_work()
    return num_files
class TestFileSplitter(TestCase):
    """Tests for ``FileSplitter`` run against the ``django.po`` file next to this module."""

    def setUp(self):
        """Create a fresh ``FileSplitter`` for every test."""
        self.valid_phrases = ("public", "admin")
        # os.path.join keeps the path relative when __file__ has no directory
        # part; plain "%s/django.po" formatting would yield "/django.po".
        self.django_po_file = os.path.join(os.path.dirname(__file__), "django.po")
        self.file_splitter = FileSplitter(self.django_po_file, self.valid_phrases)

    # Line 0 of the fixture is returned verbatim, including its newline.
    def test_that_we_can_get_the_first_line(self):
        first_line = self.file_splitter.get_line(0)
        self.assertEqual(first_line, "#: first public line\n")

    # Line 0 of the fixture is recognised as a comment.
    def test_that_first_line_is_a_comment(self):
        first_line = self.file_splitter.get_line(0)
        self.assertTrue(self.file_splitter.is_comment(first_line))

    # Line 2 of the fixture is not recognised as a comment.
    def test_that_the_third_line_is_not_a_comment(self):
        third_line = self.file_splitter.get_line(2)
        self.assertFalse(self.file_splitter.is_comment(third_line))

    # A comment containing none of the valid phrases is rejected.
    def test_that_comment_is_invalid(self):
        comment = "invalid comment"
        self.assertFalse(self.file_splitter.is_comment_valid(comment))

    # A comment containing the valid phrase "public" is accepted.
    def test_that_the_comment_is_valid(self):
        comment = "valid comment which contains the word public"
        self.assertTrue(self.file_splitter.is_comment_valid(comment))

    # Five lines are expected from get_lines_below_comment(0) on the fixture.
    def test_that_we_can_pick_lines_below_a_given_comment(self):
        lines = self.file_splitter.get_lines_below_comment(0)
        self.assertEqual(len(lines), 5)

    # TODO: not implemented yet; this test currently passes unconditionally.
    def test_that_we_can_pick_lines_below_a_comment_that_contains_admin(self):
        pass
예제 #3
0
    def trajectories_knn(self):
        """Plot start/end positions with the bounding boxes of their KNN groups.

        Positions come from ``FileSplitter.points_old()``. A ``pysal`` KNN
        weights object (``k=NUM_GROUPS``) is built for the start positions and
        another for the end positions. Every entry of each ``neighbors``
        mapping becomes one group of points, and each group's ``Chain``
        bounding box is passed to ``plot_on_bokeh``.
        """
        start_pos, end_pos, _paths = FileSplitter.points_old()
        knn_start = pysal.weights.KNN(start_pos, k=NUM_GROUPS)
        knn_end = pysal.weights.KNN(end_pos, k=NUM_GROUPS)

        def collect_groups(points, knn):
            # One list of points per key of the KNN neighbour mapping.
            return [[points[idx] for idx in knn.neighbors[key]]
                    for key in knn.neighbors]

        start_groups = collect_groups(start_pos, knn_start)
        end_groups = collect_groups(end_pos, knn_end)

        bboxs_start = [Chain(group).bounding_box for group in start_groups]
        bboxs_end = [Chain(group).bounding_box for group in end_groups]

        self.plot_on_bokeh(start_pos, end_pos, bboxs_start, bboxs_end)
예제 #4
0
def topic_model(url):
    """Run the topic-model steps for the page identified by ``url``.

    The page id is the first element returned by
    ``get_access_token_page_id(url)``. With it, the function creates the
    topic-model directory, runs ``FileSplitter(page_id).split()``, and then
    calls ``convert_to_matrix()`` and ``train_lda()`` on the object returned
    by ``runR(page_id)``.

    Args:
        url: Passed unchanged to ``get_access_token_page_id``.
    """
    page_id = get_access_token_page_id(url)[0]
    create_directory_for_topic_model(page_id)
    FileSplitter(page_id).split()

    trainer = runR(page_id)
    trainer.convert_to_matrix()
    trainer.train_lda()
예제 #5
0
    def neighbors_plot(self):
        """Plot a histogram of per-path neighbour counts within radius 0.005.

        Paths come from ``FileSplitter.points()``. A radius-neighbours graph
        is built over them, each row is summed to get that path's neighbour
        count, and the counts are binned into 10 buckets. The result is
        rendered through ``plot_on_bokeh_hist`` into ``neighbors_hist.html``.
        """
        import gc
        from numpy import histogram
        import numpy as np
        from sklearn.neighbors import radius_neighbors_graph

        start_pos, end_pos, paths = FileSplitter.points()
        # Only ``paths`` is needed; release the other results before the
        # neighbour graph is built to keep peak memory down.
        del start_pos, end_pos
        gc.collect()
        neighbors = radius_neighbors_graph(paths, radius=0.005)
        del paths
        gc.collect()
        # Sum the rows of the sparse graph directly. Converting it to a dense
        # n x n array first would defeat the memory savings made above.
        counts = np.asarray(neighbors.sum(axis=1)).ravel()
        hist, edges = histogram(counts, bins=10, density=False)
        self.plot_on_bokeh_hist('neighbors_hist.html', '# of Neighbors',
                                '# of Occurrance', 'Neighbors Within Radius',
                                hist, edges)
예제 #6
0
    def trajectories_hdbscan(self, min_cluster_size):
        """Cluster the paths with HDBSCAN and write the clusters out as JSON.

        Paths come from ``FileSplitter.points()``. The clusters are turned
        into a geometry object by ``createGeometry``, which is then handed to
        ``createJsonFile``.

        Args:
            min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        """
        def centroids(paths):
            # Returns one list of paths per cluster label; paths labelled -1
            # are left out.
            labels = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size).fit_predict(paths)
            num_clusters = len(set(labels))
            if -1 in labels:
                # -1 is not counted as a cluster.
                num_clusters -= 1
            logging.info('Number of clusters: %s', num_clusters)

            grouped = [[] for _ in range(num_clusters)]
            for label, path in zip(labels, paths):
                if label != -1:
                    grouped[label].append(path)
            return grouped

        _, _, paths = FileSplitter.points()
        # Per the original author's note, each path is
        # [start_lat, start_lon, end_lat, end_lon].
        clusters = centroids(paths)
        geometry = self.createGeometry(clusters)
        self.createJsonFile(geometry)
예제 #7
0
import sys
from FileSplitter import FileSplitter

# Command-line entry: split the django.po file given as the first argument,
# using ACCEPTED_PAGES as the second FileSplitter argument.
if len(sys.argv) < 2:
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("django.po file is missing")
    # A missing argument is a usage error, so exit with a non-zero status.
    # sys.exit is used because the bare exit() helper is only installed by
    # the site module.
    sys.exit(1)

django_po_file = sys.argv[1]
# NOTE(review): valid_phrases is not referenced again in this script; the
# splitter below is built with ACCEPTED_PAGES instead.
valid_phrases = ("admin", "noneadmin")

ACCEPTED_PAGES = ("ureport/home.html",
                  "ureport_layout.html",
                  "ureport/partials/viz/",
                  "ureport/partials/tag_cloud/",
                  "ureport/about.html",
                  "ureport/how_to_join.html",
                  "ureport/national_pulse.html",
                  "ureport/poll_summary.html")

splitter = FileSplitter(django_po_file, ACCEPTED_PAGES)
splitter.split()


 def setUp(self):
     """Create the ``FileSplitter`` under test from the ``django.po`` next to this module."""
     self.valid_phrases = ("public", "admin")
     # os.path.join keeps the path relative when __file__ has no directory
     # part; plain "%s/django.po" formatting would yield "/django.po".
     self.django_po_file = os.path.join(os.path.dirname(__file__), "django.po")
     self.file_splitter = FileSplitter(self.django_po_file, self.valid_phrases)
예제 #9
0
# Settings looked up in the "Training" ConfigMap; numeric ones are converted
# with float()/int() at load time.
training = ConfigMap("Training")
eps = float(training['eps'])
grpsize = int(training['grpsize'])
MAX_LINES = int(training['size'])  # note: read from the 'size' key
sourcedir = training['sourcedir']
sourceregex = training['sourceregex']

secret = ConfigMap("Secrets")
# NOTE(review): "matric" looks like a typo of "metric"; left unchanged because
# other code may refer to this name.
matric = 'euclidean'
NUM_GROUPS = 70
API_KEY = secret['google_maps_api_key']
# NOTE(review): this binds the module-level name ``logging`` to a ``Logging``
# instance (presumably a project class), which hides the stdlib ``logging``
# module if that is also imported in this file.
logging = Logging("trajectory")
system = ConfigMap("System")
cores = int(system['cores'])

# Module-level FileSplitter instance created at import time.
filesplitter = FileSplitter()


class Trajectory:
    def __init__(self):
        """Log "start" through the module-level ``logging`` object; no state is set."""
        logging.info("start")

    def get_tree(self, pts):
        """Return a ``pysal.cg.kdtree.KDTree`` built over ``pts``.

        The tree is created with ``leafsize=10``, the ``'Euclidean'`` distance
        metric and ``radius=6371.0``.

        NOTE(review): 6371.0 looks like the Earth's mean radius in km; confirm
        whether it has any effect with the Euclidean metric.
        """
        kdtree_options = dict(leafsize=10,
                              distance_metric='Euclidean',
                              radius=6371.0)
        return pysal.cg.kdtree.KDTree(pts, **kdtree_options)

    def plot_on_bokeh(self, starts, ends, bboxes_start, bboxes_end):
        from bokeh.io import output_file, show
예제 #10
0
import sys
from FileSplitter import FileSplitter

# Command-line entry: split the django.po file given as the first argument,
# using ACCEPTED_PAGES as the second FileSplitter argument.
if len(sys.argv) < 2:
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("django.po file is missing")
    # A missing argument is a usage error, so exit with a non-zero status.
    # sys.exit is used because the bare exit() helper is only installed by
    # the site module.
    sys.exit(1)

django_po_file = sys.argv[1]
# NOTE(review): valid_phrases is not referenced again in this script; the
# splitter below is built with ACCEPTED_PAGES instead.
valid_phrases = ("admin", "noneadmin")

ACCEPTED_PAGES = ("ureport/home.html", "ureport_layout.html",
                  "ureport/partials/viz/", "ureport/partials/tag_cloud/",
                  "ureport/about.html", "ureport/how_to_join.html",
                  "ureport/national_pulse.html", "ureport/poll_summary.html")

splitter = FileSplitter(django_po_file, ACCEPTED_PAGES)
splitter.split()