Ejemplo n.º 1
0
 def __init__(self):
     """Prepare URL bookkeeping and preload the URLs stored in the index.

     Every document of the ``index_store`` collection is read and its
     ``url`` value is recorded in ``self.crawled_urls``.
     """
     self.crawled_urls = set()
     # Presumably URL substrings to reject; the list is only defined here
     # and consumed elsewhere -- confirm against the code that reads it.
     self.deny_url_contains = ["lib.", "library", "taxonomy", "image", "repo"]
     self.database = get_db()

     # Exclusion projection: drop the fields this method never reads.
     excluded_fields = {"_id": 0, "score": 0, "added_date": 0}
     stored_docs = self.database.get_collection('index_store').find(
         {}, excluded_fields)
     self.crawled_urls.update(doc['url'] for doc in stored_docs)
Ejemplo n.º 2
0
    def __init__(self, *args, **kwargs):
        """Initialise the spider and record the URLs already in the index.

        Keyword Args:
            classifier: optional. When present it is stored on
                ``self.classifier`` and its ``name`` attribute is printed.
        """
        # NOTE(review): args/kwargs are accepted but not forwarded to the
        # parent initialiser -- confirm this is intentional.
        super(UniversitySpider, self).__init__()

        if 'classifier' in kwargs:
            self.classifier = kwargs['classifier']
            print(self.classifier.name, '\n')

        self.database = get_db()
        # Exclusion projection: drop the fields this method never reads.
        excluded_fields = {"_id": 0, "score": 0, "added_date": 0}
        stored_docs = self.database.get_collection('index_store').find(
            {}, excluded_fields)
        # ``crawled_urls`` is not created in this method; it has to exist on
        # the instance or class before this point.
        for doc in stored_docs:
            self.crawled_urls.add(doc['url'])

        print("Started " + self.name)
Ejemplo n.º 3
0
    def __init__(self, *args, **kwargs):
        """Initialise the spider and seed ``start_urls`` from the index.

        Keyword Args:
            subject_name: required. Stored on ``self.subject_name`` and
                printed.

        Raises:
            ValueError: if ``subject_name`` is not passed as a keyword
                argument.
        """
        super(LecturerFindSpider, self).__init__()

        # Fail fast when the required argument is missing. ValueError is a
        # subclass of Exception, so existing ``except Exception`` handlers
        # still catch it.
        if 'subject_name' not in kwargs:
            raise ValueError("Subject Name should be provided to continue.")

        self.subject_name = kwargs['subject_name']
        print(self.subject_name)

        self.database = get_db()
        index_store = self.database.get_collection('index_store')
        # Only documents whose ``score`` equals 1 become start URLs.
        urls = index_store.find({'score': 1}, {"_id": 0, "added_date": 0})

        self.onto = ITOntologyManager()

        # ``start_urls`` is not created in this method; it has to exist on
        # the instance or class before this point.
        for url in urls:
            self.start_urls.append(url['url'])
Ejemplo n.º 4
0
 def __init__(self):
     """Create the processed-item set and bind the profiles collection.

     ``self.processed_items`` starts as an empty set, and the
     ``matched_profiles`` collection is fetched from ``get_db()``.
     """
     self.processed_items = set()
     # The attribute spelling ("collectiion") is kept as-is: other code may
     # refer to it by this exact name.
     self.matched_profiles_collectiion = get_db().get_collection(
         'matched_profiles')
Ejemplo n.º 5
0
 def __init__(self):
     """Bind ``self.index_store`` to the ``index_store`` collection."""
     # The database handle is only needed to look up the collection.
     self.index_store = get_db().get_collection('index_store')
Ejemplo n.º 6
0
from flask import Flask, jsonify, request, json
from flask_cors import CORS
from bson import json_util
from start_crawler import start_reactor

import managedb

# Flask application object; the route decorators below register views on it.
app = Flask(__name__)
# Apply flask_cors to the app, passing 'Content-Type' through the ``headers``
# option.
cors = CORS(app, headers=['Content-Type'])
# Database handle from managedb, obtained once at import time and used by the
# view functions in this module.
database = managedb.get_db()

@app.route('/')
def get_root():
    """Report that the app is up, naming one ``index_store`` document.

    Returns:
        ``"App is working "`` followed by the ``name`` of one document from
        the ``index_store`` collection, or plain ``"App is working"`` when
        the collection is empty or the document has no ``name`` field.
    """
    result = database['index_store'].find_one()
    # find_one() gives None for an empty collection, and a document may lack
    # ``name``; neither case should turn this status route into an error.
    name = result.get('name') if result else None
    if name is None:
        return "App is working"
    return "App is working " + name


@app.route('/api/urls', methods=['GET'])
def get_urls_list():
    """Return the placeholder name list as a JSON response."""
    # Older Flask releases reject a bare list as a view return value;
    # jsonify builds a proper JSON response on every supported version.
    return jsonify(["Names"])


# Start the Flask server with its default settings only when this file is
# executed as a script, not when it is imported.
if __name__ == '__main__':
    app.run()