def __init__(self):
    """Seed the crawler state: URL deny-list plus the set of already-indexed URLs.

    Pulls every document from the ``index_store`` collection and remembers its
    ``url`` so pages are not crawled twice.
    """
    self.crawled_urls = set()
    # Any URL containing one of these substrings is skipped by the crawler.
    self.deny_url_contains = [
        "lib.",
        "library",
        "taxonomy",
        "image",
        "repo",
    ]
    self.database = get_db()
    index_store = self.database.get_collection('index_store')
    # Projection drops bookkeeping fields; only 'url' is consumed below.
    cursor = index_store.find({}, {"_id": 0, "score": 0, "added_date": 0})
    self.crawled_urls.update(doc['url'] for doc in cursor)
def __init__(self, *args, **kwargs):
    """Spider start-up: record the optional classifier and pre-load crawled URLs.

    NOTE(review): reconstructed from a collapsed one-line source; the
    statement nesting below (only the classifier lines inside the ``if``)
    is an assumption — confirm against the original file.
    """
    super(UniversitySpider, self).__init__()
    if 'classifier' in kwargs:
        # Optional page classifier supplied by whoever starts the spider.
        self.classifier = kwargs.get('classifier')
        print(self.classifier.name, '\n')
    self.database = get_db()
    index_store = self.database.get_collection('index_store')
    # Projection drops bookkeeping fields; only 'url' is used in the loop.
    urls = index_store.find({}, {"_id": 0, "score": 0, "added_date":0})
    for url in urls:
        # NOTE(review): self.crawled_urls is never initialised in this method —
        # presumably a class attribute or set by the base class; verify.
        self.crawled_urls.add(url['url'])
    print("Started " + self.name)
def __init__(self, *args, **kwargs):
    """Initialise the spider for a single subject.

    Requires a ``subject_name`` keyword argument; seeds ``start_urls`` with
    every ``index_store`` document whose ``score`` is 1.

    Raises:
        Exception: when no ``subject_name`` keyword argument is given.
    """
    super(LecturerFindSpider, self).__init__()
    # Guard clause: fail fast when the required argument is missing.
    # (Typo fixed in the message: "shoud" -> "should".)
    if 'subject_name' not in kwargs:
        raise Exception("Subject Name should be supported to continue.")
    self.subject_name = kwargs.get('subject_name')
    print(self.subject_name)
    self.database = get_db()
    index_store = self.database.get_collection('index_store')
    # Only previously scored (score == 1) pages seed this crawl; the
    # projection drops fields the loop does not use.
    urls = index_store.find({'score': 1}, {"_id": 0, "added_date": 0})
    self.onto = ITOntologyManager()
    for url in urls:
        self.start_urls.append(url['url'])
def __init__(self):
    """Open the matched_profiles collection and reset the de-duplication set."""
    # Tracks items already handled so duplicates can be skipped.
    self.processed_items = set()
    database = get_db()
    collection = database.get_collection('matched_profiles')
    # Correctly spelled attribute added; the original misspelled name
    # ("collectiion") is kept as an alias so existing callers keep working.
    self.matched_profiles_collection = collection
    self.matched_profiles_collectiion = collection
def __init__(self):
    """Cache a handle to the index_store collection for later queries."""
    self.index_store = get_db().get_collection('index_store')
from flask import Flask, jsonify, request, json
from flask_cors import CORS
from bson import json_util
from start_crawler import start_reactor
import managedb

app = Flask(__name__)
cors = CORS(app, headers=['Content-Type'])
database = managedb.get_db()


@app.route('/')
def get_root():
    """Health check: prove the DB is reachable by echoing one stored name."""
    # Fixed local typo: resutl -> result.
    # NOTE(review): crashes with a 500 if the collection is empty
    # (find_one() returns None) — same as the original behaviour.
    result = database['index_store'].find_one()
    return "App is working " + result['name']


@app.route('/api/urls', methods=['GET'])
def get_urls_list():
    """Return the URL list as a JSON response.

    A bare Python list is not a valid Flask response before Flask 2.2;
    jsonify (already imported above) produces a proper application/json
    response on every Flask version.
    """
    return jsonify(["Names"])


if __name__ == '__main__':
    app.run()