Example #1
    def kmenas(self, k):
        """Run k-means clustering with *k* clusters over ``self.vectors``.

        Centroids are seeded deterministically with the first ``k`` vectors
        and refined for a fixed number of iterations (5).

        :param k: number of clusters; must not exceed ``len(self.vectors)``.
        :return: accumulated sum of squared distances between each vector
            and its cluster centroid, summed over ALL iterations.
            NOTE(review): if callers expect the *final* clustering's
            inertia, the accumulator should be reset each iteration —
            the original accumulation across iterations is preserved here.
        """
        inertia = 0
        # Seed centroids with the first k vectors (deterministic init).
        centroids = list(self.vectors[:k])
        iterations = 5
        for iteration in range(iterations):
            update_progress(iteration, iterations)

            # Assignment step: put every vector into its nearest cluster.
            members = [[] for _ in centroids]
            for v in self.vectors:
                members[self.belongs_to_cluster(v, centroids)].append(v)

            # Update step: recompute centroids and accumulate squared error.
            # (The original also built an unused `labels` similarity list
            # per cluster — dead code, removed.)
            for i in range(k):
                centroids[i] = self.average_of_vectors(members[i])
                for vec in members[i]:
                    inertia += sum((a - b) ** 2 for a, b in zip(vec, centroids[i]))

        return inertia
Example #2
def index(elastic, dir='./resources/jsonFiles'):
    """Index every ``.json`` file in *dir* into Elasticsearch.

    Each file becomes one document in the ``researchgate`` index
    (doc_type ``articles``); the document id is the file's position in
    the directory listing.

    :param elastic: Elasticsearch client exposing ``.index(...)``.
    :param dir: directory containing the JSON files. (Name shadows the
        ``dir`` builtin — kept for backward compatibility with callers
        that pass it by keyword.)
    """
    json_files = [name for name in os.listdir(dir) if name.endswith('.json')]
    total = len(json_files)  # len() instead of the dunder __len__() call
    for i, filename in enumerate(json_files):
        with open(os.path.join(dir, filename)) as json_file:
            data = json.load(json_file)
            # The response of the index call was bound to an unused
            # variable in the original; the assignment is dropped.
            elastic.index(index='researchgate', doc_type='articles', id=i, body=data)
        update_progress(i, total)
Example #3
    def crawl(self):
        """Breadth-first crawl, alternating over in-link and out-link frontiers.

        Seeds the crawl from ``self.starting_urls``, dumping each crawled
        app to ``./resources/jsonFiles/item_pipeline_<n>_<uid>.json``, then
        repeatedly expands the frontier at ``self.current_depth`` until
        ``self.num_docs_to_be_crawled`` documents have been collected.

        BUGFIX: the seed-document filename concatenated the int ``0`` into
        a string (``'...' + 0 + '...'``), which raises ``TypeError``; it now
        uses ``'0'`` via ``str()``.
        BUGFIX: ``current_app is 0`` relied on small-int identity; replaced
        with ``current_app == 0``.
        """
        # Seed phase: fetch every starting URL and record its link frontier.
        for start_link in self.starting_urls:
            app = Downloader(start_link).get_app_from_link()
            self.links_visited.add(start_link)
            self.apps.append(app)
            self.depth_links.append(app.in_links)
            self.depth_links.append(app.out_links)
            self._dump_app(app, 0)
            # Preserved original behavior: assignment, not increment.
            self.num_docs_crawled = 1

        # Expansion phase: each call to _crawl_depth consumes one depth
        # level and appends two new ones (in-links, then out-links).
        while self.num_docs_crawled < self.num_docs_to_be_crawled:
            self._crawl_depth(self.in_degree)
            self._crawl_depth(self.out_degree)

    def _dump_app(self, app, doc_number):
        """Serialize *app* to its item-pipeline JSON file for *doc_number*."""
        path = ('./resources/jsonFiles/item_pipeline_'
                + str(doc_number) + '_' + app.uid + '.json')
        with open(path, 'w') as outfile:
            json.dump(app.__dict__, outfile)

    def _crawl_depth(self, degree):
        """Visit up to *degree* unvisited links at the current depth.

        Discovered in-/out-link lists are appended to ``self.depth_links``
        as the next two levels and ``self.current_depth`` is advanced.
        """
        current_in_links = []
        current_out_links = []
        count = 0
        for link in self.depth_links[self.current_depth]:
            # Guard clauses instead of the original nested condition.
            if link in self.links_visited or count >= degree:
                continue
            current_app = Downloader(link).get_app_from_link()
            if current_app == 0:  # downloader signals failure with 0
                continue
            current_in_links.extend(current_app.in_links)
            current_out_links.extend(current_app.out_links)
            self._dump_app(current_app, self.num_docs_crawled)
            update_progress(self.num_docs_crawled, self.num_docs_to_be_crawled)
            self.num_docs_crawled += 1
            self.apps.append(current_app)
            self.links_visited.add(link)
            count += 1
        self.depth_links.append(current_in_links)
        self.depth_links.append(current_out_links)
        self.current_depth += 1