Code example #1
    def get(self):
        db_init = DBConnection()
        job_id = request.args.get('id', None)
        output = []
        search_query = {}
        if job_id:
            search_query["job_id"] = job_id

        result = db_init.get_job(search_query)
        for item in result:
            updated_time = item["updated_time"]
            start_time = item["start_time"]
            downloaded_size = item["downloaded_size"]
            total_file_size = item["total_file_size"]
            estimated_time = 0
            if downloaded_size and total_file_size:
                # Extrapolate the remaining time from the average download
                # rate observed between start_time and updated_time.
                diff_sec = date_diff_in_s(updated_time, start_time)
                estimated_time = (
                    (float(total_file_size) - float(downloaded_size)) /
                    float(downloaded_size)) * diff_sec
            temp = dict(item)
            temp["estimated_time_seconds"] = round(estimated_time, 2)
            output.append(temp)

        db_init.close()
        return jsonify(output)
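The helper date_diff_in_s is called above but defined elsewhere in the project. A minimal sketch of what it presumably does, assuming both arguments are datetime objects; only the name and argument order come from the snippet, the body is an assumption:

# Hypothetical helper: elapsed seconds between two datetime objects.
def date_diff_in_s(later, earlier):
    return (later - earlier).total_seconds()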
Code example #2
import mimetypes
from datetime import datetime

import requests


def download_file(job_id, url, filename, already_processed):
    db_init = DBConnection()
    # Append to the partial file when resuming, otherwise start fresh.
    file_mode = 'wb' if already_processed == 0 else 'ab'
    response = requests.get(url, stream=True)
    total = response.headers.get('content-length')
    content_type = response.headers.get('Content-Type', '')
    # guess_extension returns None for unknown types; fall back to no suffix.
    file_extension = mimetypes.guess_extension(content_type) or ''
    write_file_path = filename + file_extension
    is_break = False
    with open(write_file_path, file_mode) as f:
        if total is None:
            f.write(response.content)
        else:
            downloaded = 0
            total = int(total)

            for data in response.iter_content(
                    chunk_size=max(int(total / 1000), 1024 * 1024)):
                downloaded += len(data)
                update_query = {}
                update_query["job_id"] = job_id

                # Re-read the job on every chunk so PAUSE/STOP requests
                # issued through the control endpoint take effect mid-download.
                result = db_init.get_job({'job_id': job_id})
                if result:
                    status = result[0]['status']
                    if status in ["PAUSE", "STOP"]:
                        update_query["status"] = status
                        is_break = True

                # When resuming, the file is re-streamed from byte zero:
                # skip chunks until we reach the already-written offset,
                # trimming the chunk that straddles the boundary.
                if downloaded > already_processed:
                    if already_processed:
                        data = data[already_processed - (downloaded - len(data)):]
                        already_processed = 0
                else:
                    continue

                if is_break:
                    db_init.update_job(update_query)
                    break

                f.write(data)
                print(total, downloaded)

                update_query["total_file_size"] = total
                update_query["downloaded_size"] = downloaded
                update_query["remaining_size"] = total - downloaded
                db_init.update_job(update_query)

    if not is_break:
        # Mark the job finished once the stream is fully written.
        update_query = {}
        update_query["job_id"] = job_id
        update_query["end_time"] = datetime.utcnow()
        update_query["status"] = 'COMPLETED'
        update_query["command"] = "Finished Download"
        db_init.update_job(update_query)

    db_init.close()
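Examples #1 through #6 all depend on a DBConnection wrapper whose implementation is not shown. Below is a minimal stand-in consistent with how it is called (get_job, insert_job, update_job, close); the in-memory dict storage is purely an assumption to keep the sketch runnable, and the real project presumably wraps an actual database:

# Hypothetical stand-in for the project's DBConnection.
class DBConnection(object):
    _jobs = {}  # shared in-memory "table", keyed by job_id

    def insert_job(self, doc):
        self._jobs[doc["job_id"]] = dict(doc)

    def get_job(self, query):
        # An empty query lists every job, as code example #1 expects.
        if not query.get("job_id"):
            return list(self._jobs.values())
        job = self._jobs.get(query["job_id"])
        return [job] if job else []

    def update_job(self, query):
        # Upsert: merge the query fields into the stored job record.
        self._jobs.setdefault(query["job_id"], {}).update(query)

    def close(self):
        pass  # a real connection would be released here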
Code example #3
class Worker(object):
    worker_id = ""

    def do_the_job(self, data):
        job_id = data["job_id"]
        status = data["status"]
        self.db_init = DBConnection()

        result = self.db_init.get_job({'job_id': job_id})
        if result:
            data = result[0]
            url = data["input_url"]
            # On RESUME, pick up from the byte count recorded by the
            # download loop; otherwise start from zero.
            already_processed = data["downloaded_size"] if status == "RESUME" else 0
            # output_path (set when the job was scheduled, see example #4)
            # doubles as the local file name download_file writes to.
            download_file(job_id, url, data["output_path"], already_processed)

        self.db_init.close()

    def __init__(self):
        self.queue_name = 'urls'
        self.exchange_name = 'info'
        self.host = settings.RABBITMQ_HOST
        self.user = settings.RABBITMQ_USER
        self.password = settings.RABBITMQ_PASS

        self.credentials = pika.PlainCredentials(self.user, self.password)

        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=self.host, port=5672,
                                      credentials=self.credentials))

        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=self.queue_name)

    def callback(self, ch, method, properties, body):
        if body:
            data = json.loads(body.decode())
            if 'job_id' in data and 'status' in data:
                try:
                    self.do_the_job(data)
                except Exception as e:
                    print(str(e))

            ch.basic_ack(delivery_tag=method.delivery_tag)
            print("done with the job by worker - ", self.worker_id)
Code example #4
    def post(self):

        db_init = DBConnection()
        job_id = str(uuid.uuid1())

        url = request.form.get('url')

        input_dict = {}
        input_dict["job_id"] = job_id
        input_dict["status"] = "SCHEDULED"
        input_dict["start_time"] = datetime.utcnow()
        input_dict["output_path"] = job_id

        input_dict["command"] = "SCHEDULED Download"
        is_exist = False

        if not url:
            input_dict["command"] = "no input url found Job"
            input_dict["end_time"] = datetime.utcnow()
            is_exist = True
        else:
            input_dict["input_url"] = url

        db_init.insert_job(input_dict)
        if is_exist:
            db_init.close()
            return jsonify(input_dict)

        message_to_publish = {}
        message_to_publish['job_id'] = job_id
        message_to_publish['status'] = 'SCHEDULED'
        message_publisher(message_to_publish)

        update_query = {}
        update_query["job_id"] = job_id

        db_init.close()

        return jsonify(update_query)
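message_publisher is called here and in code example #6 but never defined. A plausible sketch, assuming it pushes the JSON message onto the same 'urls' queue the Worker in example #3 consumes; the queue name, host, and credentials are taken from that example, while the function body itself is an assumption:

import json

import pika

# Hypothetical publisher matching the Worker's queue declaration.
def message_publisher(message):
    credentials = pika.PlainCredentials(settings.RABBITMQ_USER,
                                        settings.RABBITMQ_PASS)
    connection = pika.BlockingConnection(pika.ConnectionParameters(
        host=settings.RABBITMQ_HOST, port=5672, credentials=credentials))
    channel = connection.channel()
    channel.queue_declare(queue='urls')
    channel.basic_publish(exchange='', routing_key='urls',
                          body=json.dumps(message))
    connection.close()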
Code example #5
    def post(self):
        db_init = DBConnection()
        f = request.files.get('file_name')
        if f:
            path = settings.storage_path
            file_ = f.filename
            job_id = str(uuid.uuid1())
            filename, file_extension = os.path.splitext(file_)
            # Sanitize only the file name; passing the whole path through
            # secure_filename would strip the directory separators.
            full_path = os.path.join(path, secure_filename(job_id + file_extension))
            f.save(full_path)
            input_dict = {}
            input_dict["job_id"] = job_id
            input_dict["status"] = "COMPLETED"
            input_dict["start_time"] = datetime.utcnow()
            input_dict["output_path"] = full_path
            input_dict["command"] = ""
            input_dict["end_time"] = datetime.utcnow()
            db_init.insert_job(input_dict)
            db_init.close()
            return jsonify(input_dict)
        else:
            db_init.close()
            return "No file found"
Code example #6
    def get(self):
        db_init = DBConnection()
        job_id = request.args.get('id', None)
        state = request.args.get('state', None)
        update_query = {}
        update_query["status"] = state
        update_query["job_id"] = job_id

        if state in ['PAUSE', 'STOP', 'RESUME']:
            db_init.update_job(update_query)
            update_query["message"] = 'updated download status'

            # Only RESUME needs to wake a worker; PAUSE and STOP are picked
            # up by the download loop polling the job record.
            if state == 'RESUME':
                message_to_publish = {}
                message_to_publish['job_id'] = job_id
                message_to_publish['status'] = 'RESUME'
                message_publisher(message_to_publish)
        else:
            update_query["message"] = "no control found"

        db_init.close()

        return jsonify(update_query)
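The same style of client sketch for the control endpoint, assuming it is mounted at /control; the route and port are assumptions, while the id and state query parameters come from the snippet:

import requests

# job_id as returned by the scheduling endpoint in code example #4.
job_id = 'your-job-id'

# Hypothetical usage: pause, then resume, a running download.
requests.get('http://localhost:5000/control',
             params={'id': job_id, 'state': 'PAUSE'})
requests.get('http://localhost:5000/control',
             params={'id': job_id, 'state': 'RESUME'})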
Code example #7
File: index.py Project: gauravtech4u/easy-search
class IndexData(object):

    def __init__(self, file_name="local_config"):
        self.resultset = []
        settings = __import__("%s" % file_name)
        self.is_threading = settings.IMPLEMENT_THREADED_SEARCH
        db_config = settings.DATABASE
        self.index_classes = settings.INDEX_CLASSES
        self.is_indexing = settings.INDEXING
        self.intervals = settings.INTERVALS
        self.bucket_intervals = settings.BUCKET_INTERVAL
        self.word_split_pattern = settings.WORD_SPLIT_PATTERN
        self.conn = DBConnection(db_config['HOST'], db_config['USER'], db_config['PASSWORD'], db_config['NAME'],
                                 db_config['PORT'], settings.FILE_PATH)

    def split_sentence(self, raw_sentence):
        word_list = resplit(self.word_split_pattern, raw_sentence)
        return word_list

    def create_hash(self, sentence):
        word_list = self.split_sentence(sentence)
        weight_list = Commands.assign_weight(word_list)
        return weight_list

    def false_index(self, data_count, table_name, field_list):
        dump_file_counts = int(data_count/self.intervals) + 1
        start, offset = 0, self.intervals
        if not self.is_threading:
            offset = data_count
            dump_file_counts = 1
        for file_no in range(dump_file_counts):
            self.conn.create_outfile(table_name, field_list, start, offset, file_no)
            start += self.intervals

    def true_index(self, data_count, table_name, field_list):
        interval_count = int(data_count/self.intervals) if data_count > self.intervals else 1
        start, offset = 0, self.intervals
        file_dict = {}
        pool = Pool(processes=2)
        args_list = []
        for i in range(interval_count):
            args_list.append([start, offset, table_name, field_list, file_dict])
            start += offset
        # index_data must be a module-level function (defined elsewhere in
        # the project) so it can be pickled for the worker processes.
        pool.map(index_data, args_list)
        pool.close()
        pool.join()

    def index(self):
        for instance in self.index_classes:
            data_count = self.conn.get_table_counts(instance.table_name)
            if data_count:
                # Dispatch to true_index or false_index depending on the
                # INDEXING flag from settings.
                getattr(self, ("%s_index" % self.is_indexing).lower())(data_count, instance.table_name,
                                                                       instance.field_list)
            else:
                print("No Data to Index. Exiting....")
        self.conn.close()

    @classmethod
    def run(cls, field_list, table_name, bucket=None):
        # Avoid a mutable default argument; a shared dict would leak
        # state across calls.
        if bucket is None:
            bucket = {}
        self = IndexData("local_config")
        result_set = self.conn.get_all_records(field_list, table_name)
        for pos, data in result_set:
            word_list = self.create_hash(data)
            for word, weight in word_list:
                bucket_no = Commands.assign_bucket(weight, self.bucket_intervals)
                # setdefault creates the bucket and the word's posting list
                # on first sight, so no KeyError handling is needed.
                bucket.setdefault(bucket_no, {}).setdefault(word, []).append(pos)
        return bucket
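Commands.assign_weight and Commands.assign_bucket come from elsewhere in the easy-search project and are not shown. A minimal sketch consistent with how they are called here; the length-based weight and fixed-width buckets are assumptions, not the project's actual scoring scheme:

# Hypothetical stand-in for the Commands helper used by IndexData.
class Commands(object):

    @staticmethod
    def assign_weight(word_list):
        # Pair each word with a numeric weight; word length is a placeholder.
        return [(word, len(word)) for word in word_list if word]

    @staticmethod
    def assign_bucket(weight, bucket_interval):
        # Map a weight onto a fixed-width bucket number.
        return int(weight // bucket_interval)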