Ejemplo n.º 1
0
def insertUrlList( db : couchdb.Database, urlList ):
    """Store every URL of ``urlList`` in ``db`` as an unvisited 'url' document.

    Each URL is used as the ``_id`` of its own document. Documents are
    written one at a time with ``db.save``, in iteration order.
    """
    logging.info( "inserting url list..." )
    # Lazily describe one document per URL; nothing is built until saved.
    documents = ( { '_id' : address, 'type' : 'url', 'visited' : False }
                  for address in urlList )
    for document in documents:
        db.save( document )
Ejemplo n.º 2
0
class EdgeDataBridge(object):

    """Edge Bridge: copies tenders from the tenders API into a CouchDB database.

    The bridge reads its settings from the ``'main'`` section of ``config``:
    ``tenders_api_server``, ``tenders_api_version``, ``retrievers_params``,
    ``couch_url`` and ``public_db``.
    """

    def __init__(self, config):
        """Create the API client and the CouchDB handle from ``config``.

        :param config: mapping with a ``'main'`` section holding the settings
            listed in the class docstring.
        """
        super(EdgeDataBridge, self).__init__()
        self.config = config
        self.api_host = self.config_get('tenders_api_server')
        self.api_version = self.config_get('tenders_api_version')
        self.retrievers_params = self.config_get('retrievers_params')

        self.client = TendersClient(host_url=self.api_host,
                                    api_version=self.api_version,
                                    key='')

        self.couch_url = urljoin(
            self.config_get('couch_url'),
            self.config_get('public_db')
        )
        self.db = Database(self.couch_url,
                           session=Session(retry_delays=range(10)))

    def config_get(self, name):
        """Return option ``name`` from the ``'main'`` config section.

        Returns ``None`` when the option is absent.
        """
        return self.config.get('main').get(name)

    def get_teders_list(self):
        """Yield ``(tender_id, date_modified)`` for every item of the feed.

        Items come from ``get_tenders`` called with ``mode=_all_`` and the
        configured ``retrievers_params``.
        """
        for item in get_tenders(host=self.api_host, version=self.api_version,
                                key='', extra_params={'mode': '_all_'},
                                retrievers_params=self.retrievers_params):
            yield (item["id"], item["dateModified"])

    def save_tender_in_db(self, tender_id, date_modified):
        """Fetch tender ``tender_id`` from the API and store it in CouchDB.

        Nothing is fetched when the stored document already carries the same
        ``dateModified``. When a stored document exists its ``_rev`` is
        reused so the save updates it. Save failures are logged, not raised.

        :param tender_id: id of the tender, also used as the document ``_id``.
        :param date_modified: ``dateModified`` value reported by the feed.
        """
        tender_doc = self.db.get(tender_id)
        if tender_doc and tender_doc['dateModified'] == date_modified:
            # The stored copy is already up to date.
            return
        tender = self.client.get_tender(tender_id).get('data')
        if tender:
            tender['_id'] = tender_id
            tender['doc_type'] = 'Tender'
            if tender_doc:
                tender['_rev'] = tender_doc['_rev']
                logger.info('Update tender {} '.format(tender_id))
            else:
                logger.info('Save tender {} '.format(tender_id))
            try:
                self.db.save(tender)
            except Exception as e:
                # ``.message`` only exists on Python 2 exceptions; on
                # Python 3 reading it raised AttributeError inside this
                # handler and hid the real failure. Fall back to the
                # exception itself, which formats as ``str(e)``.
                reason = getattr(e, 'message', e)
                logger.info(
                    'Saving tender {} fail with error {}'.format(
                        tender_id, reason),
                    extra={'MESSAGE_ID': 'edge_bridge_fail_save_in_db'})
        else:
            logger.info('Tender {} not found'.format(tender_id))

    def run(self):
        """Walk the whole tender feed once, syncing every tender to CouchDB."""
        logger.info('Start Edge Bridge',
                    extra={'MESSAGE_ID': 'edge_bridge_start_bridge'})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': 'edge_bridge__data_sync'})
        for tender_id, date_modified in self.get_teders_list():
            self.save_tender_in_db(tender_id, date_modified)
Ejemplo n.º 3
0
class EdgeDataBridge(object):
    """Edge Bridge: copies tenders from the tenders API into a CouchDB database.

    Settings are read from the ``'main'`` section of ``config``:
    ``tenders_api_server``, ``tenders_api_version``, ``couch_url`` and
    ``public_db``.
    """
    def __init__(self, config):
        """Create the API client and the CouchDB handle from ``config``.

        :param config: mapping with a ``'main'`` section holding the settings
            listed in the class docstring.
        """
        super(EdgeDataBridge, self).__init__()
        self.config = config
        self.api_host = self.config_get('tenders_api_server')
        self.api_version = self.config_get('tenders_api_version')

        self.client = TendersClient(host_url=self.api_host,
                                    api_version=self.api_version,
                                    key='')

        self.couch_url = urljoin(self.config_get('couch_url'),
                                 self.config_get('public_db'))
        self.db = Database(self.couch_url,
                           session=Session(retry_delays=range(10)))

    def config_get(self, name):
        """Return option ``name`` from the ``'main'`` config section.

        Returns ``None`` when the option is absent.
        """
        return self.config.get('main').get(name)

    def get_teders_list(self):
        """Yield ``(tender_id, date_modified)`` for every item of the feed.

        Items come from ``get_tenders`` called with ``mode=_all_``.
        """
        for item in get_tenders(host=self.api_host,
                                version=self.api_version,
                                key='',
                                extra_params={'mode': '_all_'}):
            yield (item["id"], item["dateModified"])

    def save_tender_in_db(self, tender_id, date_modified):
        """Fetch tender ``tender_id`` from the API and store it in CouchDB.

        Nothing is fetched when the stored document already carries the same
        ``dateModified``. When a stored document exists its ``_rev`` is
        reused so the save updates it. Save failures are logged, not raised.

        :param tender_id: id of the tender, also used as the document ``_id``.
        :param date_modified: ``dateModified`` value reported by the feed.
        """
        tender_doc = self.db.get(tender_id)
        if tender_doc and tender_doc['dateModified'] == date_modified:
            # The stored copy is already up to date.
            return
        tender = self.client.get_tender(tender_id).get('data')
        if tender:
            tender['_id'] = tender_id
            tender['doc_type'] = 'Tender'
            if tender_doc:
                tender['_rev'] = tender_doc['_rev']
                logger.info('Update tender {} '.format(tender_id))
            else:
                logger.info('Save tender {} '.format(tender_id))
            try:
                self.db.save(tender)
            except Exception as e:
                # ``.message`` only exists on Python 2 exceptions; on
                # Python 3 reading it raised AttributeError inside this
                # handler and hid the real failure. Fall back to the
                # exception itself, which formats as ``str(e)``.
                reason = getattr(e, 'message', e)
                logger.info(
                    'Saving tender {} fail with error {}'.format(
                        tender_id, reason),
                    extra={'MESSAGE_ID': 'edge_bridge_fail_save_in_db'})
        else:
            logger.info('Tender {} not found'.format(tender_id))

    def run(self):
        """Walk the whole tender feed once, syncing every tender to CouchDB."""
        logger.info('Start Edge Bridge',
                    extra={'MESSAGE_ID': 'edge_bridge_start_bridge'})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': 'edge_bridge__data_sync'})
        for tender_id, date_modified in self.get_teders_list():
            self.save_tender_in_db(tender_id, date_modified)
Ejemplo n.º 4
0
class CouchdbOutput(OutputModule):
    """Output module that writes consumed events to a CouchDB database.

    Events arriving on the ``inbox`` queue are handled by :meth:`consume`.
    The document is taken from the event field named by
    ``self.kwargs.selection``.

    NOTE(review): ``self.kwargs`` and ``self.logging`` are not set in this
    class; presumably the ``OutputModule`` base provides them - confirm.
    """

    def __init__(self,
                 actor_config,
                 couchdb_url,
                 payload=None,
                 selection="data",
                 parallel_streams=1,
                 native_events=False,
                 **kw):
        """Register the ``inbox`` consumer and open the CouchDB database.

        :param actor_config: passed through to ``OutputModule.__init__``.
        :param couchdb_url: URL of the target CouchDB database.

        The remaining parameters are not used directly in this class.
        """
        OutputModule.__init__(self, actor_config)
        self.pool.createQueue("inbox")
        self.registerConsumer(self.consume, "inbox")
        self.couchdb = Database(couchdb_url)

    def consume(self, event):
        """Save ``event`` to CouchDB, as a bulk update or a single document."""
        if event.isBulk():
            self._save_bulk(event)
        else:
            self._save_single(event)

    def _save_bulk(self, event):
        """Save every item of a bulk event with one ``update`` call."""
        bulk_docs = {}
        # Documents without an id cannot be keyed by id. They used to raise
        # KeyError here; they are now sent along so the server assigns an
        # id, which matches what the single-event path does.
        anonymous_docs = []
        for e in extractBulkItems(event):
            doc = e.get(self.kwargs.selection)
            doc_id = doc.pop('id', doc.pop('_id', ''))
            if doc_id:
                doc['_id'] = doc['id'] = doc_id
                bulk_docs[doc_id] = doc
            else:
                anonymous_docs.append(doc)

        # Attach the current revision of documents that already exist so
        # the bulk call updates them instead of conflicting.
        for row in self.couchdb.view('_all_docs',
                                     keys=list(bulk_docs.keys())).rows:
            if row.id in bulk_docs:
                bulk_docs[row.id]['_rev'] = row['value']['rev']
        try:
            responce = self.couchdb.update(
                list(bulk_docs.values()) + anonymous_docs)
            for ok, doc_id, rest in responce:
                if ok:
                    self.logging.info("Saved {}".format(doc_id))
                else:
                    self.logging.error(
                        "Error on save bulk. Type {}, message {}, doc {}".
                        format(rest, getattr(rest, 'message', ''), doc_id))
        except Exception as e:
            self.logging.error("Uncaught error {} on save bulk".format(
                e, ))

    def _save_single(self, event):
        """Save the document of a single (non-bulk) event."""
        data = event.get(self.kwargs.selection)
        doc_id = data.get('id', data.get('_id'))
        if doc_id:
            data['_id'] = data['id'] = doc_id
            # Look up by ``doc_id``. The previous code passed the builtin
            # ``id`` function here, so the revision was never found. One
            # ``get`` replaces the earlier membership check plus fetch.
            existing = self.couchdb.get(doc_id)
            if existing is not None:
                rev = existing.rev
                data['_rev'] = rev
                self.logging.debug(
                    "Update revision in data {} to {}".format(doc_id, rev))
        self.couchdb.save(data)
Ejemplo n.º 5
0
def traverseTree( node, db : couchdb.Database ):
    """Save every sentence found in the ``<p>`` elements of ``node`` to ``db``.

    For each paragraph the stripped text fragments are joined, the result is
    split with ``sentences.splitParagraph`` and each non-empty sentence is
    stored as its own document of type ``'sentence'``. The document gets a
    random UUID as ``_id`` and the current local time as ``date``.
    Progress is reported through ``printProgress`` once per paragraph.
    """
    # Scan the tree once; the same result serves the total and the loop.
    paragraphNodes = node.find_all('p')
    sizeOfBook = len(paragraphNodes)
    for position, child in enumerate(paragraphNodes, start=1):
        printProgress( position, sizeOfBook )
        # Every fragment keeps its leading space so the splitter receives
        # exactly the same text as before.
        paragraph = "".join( " " + fragment
                             for fragment in child.stripped_strings )
        for sentence in sentences.splitParagraph( paragraph ):
            if not sentence:
                continue
            now = datetime.datetime.now()
            doc = { '_id' : str(uuid.uuid4()),
                    'type' : 'sentence',
                    'sentence' : sentence,
                    'source' : 'foundation',
                    'date' : now.isoformat() }
            db.save( doc )
    print("")
Ejemplo n.º 6
0
class AuctionsDataBridge(object):

    """Bridge that plans auctions for tenders read from the tenders API.

    Settings come from the ``'main'`` section of ``config``. Tenders are
    read either from the paged HTTP API (``run``, ``run_re_planning``) or
    from the CouchDB changes feed (``planning_with_couch``). For each
    selected tender the configured auction worker is invoked with the
    ``planning`` command.
    """

    def __init__(self, config):
        """Build the tenders URL and open the auctions CouchDB database.

        Reads ``tenders_api_server``, ``tenders_api_version``, ``couch_url``
        and ``auctions_db`` from the config.
        """
        super(AuctionsDataBridge, self).__init__()
        self.config = config

        self.tenders_url = urljoin(
            self.config_get('tenders_api_server'),
            '/api/{}/tenders'.format(
                self.config_get('tenders_api_version')
            )
        )
        self.tz = tzlocal()
        self.couch_url = urljoin(
            self.config_get('couch_url'),
            self.config_get('auctions_db')
        )
        self.db = Database(self.couch_url,
                           session=Session(retry_delays=range(10)))
        self.url = self.tenders_url

    def config_get(self, name):
        """Return option ``name`` from the ``'main'`` config section."""
        return self.config.get('main').get(name)

    def tender_url(self, tender_id):
        """Return ``tenders/<tender_id>/auction`` joined onto the tenders URL."""
        return urljoin(self.tenders_url, 'tenders/{}/auction'.format(tender_id))

    def get_teders_list(self, re_planning=False):
        """Yield tender items from the API that should be planned.

        Pages are requested from ``self.url`` starting at ``self.offset``,
        which the caller must set first (``run`` and ``run_re_planning`` set
        it to ``''``). The generator ends at the first page with no data.
        A non-OK response is retried after a 10 second sleep.

        Besides yielding, a ``cancelled`` tender that has an auction in the
        ``endDate_view`` result is marked as cancelled in CouchDB.

        :param re_planning: when true, skip tenders listed in
            ``self.tenders_ids_list``; otherwise skip tenders already present
            in the ``startDate_view`` result for the same start time.
        """
        while True:
            params = {'offset': self.offset,
                      'opt_fields': 'status,auctionPeriod',
                      'mode': '_all_'}
            request_id = generate_request_id(prefix=b'data-bridge-req-')
            logger.debug('Start request to {}, params: {}'.format(
                self.url, params),
                extra={"JOURNAL_REQUEST_ID": request_id})

            response = requests.get(self.url, params=params,
                                    headers={'content-type': 'application/json',
                                             'X-Client-Request-ID': request_id})

            logger.debug('Request response: {}'.format(response.status_code))
            if response.ok:
                response_json = response.json()
                if len(response_json['data']) == 0:
                    # Empty page: remember where to resume and stop.
                    logger.info("Change offset date to {}".format(response_json['next_page']['offset']),
                                extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                    self.offset = response_json['next_page']['offset']
                    break
                for item in response_json['data']:
                    # Candidate for planning: in active.auction status with a
                    # start date and no end date yet.
                    if 'auctionPeriod' in item \
                            and 'startDate' in item['auctionPeriod'] \
                            and 'endDate' not in item['auctionPeriod'] \
                            and item['status'] == "active.auction":

                        start_date = iso8601.parse_date(item['auctionPeriod']['startDate'])
                        start_date = start_date.astimezone(self.tz)
                        # The view key is the start time as epoch milliseconds.
                        auctions_start_in_date = startDate_view(
                            self.db,
                            key=(mktime(start_date.timetuple()) + start_date.microsecond / 1E6) * 1000
                        )
                        if datetime.now(self.tz) > start_date:
                            logger.info("Tender {} start date in past. Skip it for planning".format(item['id']),
                                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            continue
                        if re_planning and item['id'] in self.tenders_ids_list:
                            logger.info("Tender {} already planned while replanning".format(item['id']),
                                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            continue
                        elif not re_planning and [row.id for row in auctions_start_in_date.rows if row.id == item['id']]:
                            logger.info("Tender {} already planned on same date".format(item['id']),
                                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            continue
                        yield item

                    if item['status'] == "cancelled":
                        # Query the end-date view from the current time
                        # (epoch milliseconds) onwards.
                        future_auctions = endDate_view(
                            self.db, startkey=time() * 1000
                        )
                        if item["id"] in [i.id for i in future_auctions]:
                            logger.info("Tender {} canceled".format(item["id"]),
                                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                            auction_document = self.db[item["id"]]
                            # Stage -100 is the value handle_continuous_feed
                            # skips when it reads the changes feed.
                            auction_document["current_stage"] = -100
                            auction_document["endDate"] = datetime.now(self.tz).isoformat()
                            self.db.save(auction_document)
                            logger.info("Change auction {} status to 'canceled'".format(item["id"]),
                                        extra={"JOURNAL_REQUEST_ID": request_id,
                                               'MESSAGE_ID': DATA_BRIDGE_PLANNING})

                # Page fully processed: advance to the next page.
                logger.info(
                    "Change offset date to {}".format(response_json['next_page']['offset']),
                    extra={"JOURNAL_REQUEST_ID": request_id,
                           'MESSAGE_ID': DATA_BRIDGE_PLANNING}
                )
                self.offset = response_json['next_page']['offset']
            else:
                # Request failed: wait, then retry the same offset.
                sleep(10)

    def start_auction_worker(self, tender_item):
        """Run the auction worker's ``planning`` command for ``tender_item``.

        The command is built from the ``auction_worker`` and
        ``auction_worker_config`` options and run through
        ``do_until_success(check_output, ...)``; its result is logged.
        """
        result = do_until_success(
            check_output,
            args=([self.config_get('auction_worker'),
                   'planning', str(tender_item['id']),
                   self.config_get('auction_worker_config')],),
        )
        logger.info("Auction planning command result: {}".format(result),
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING_PROCESS})

    def planning_with_couch(self):
        """Plan auctions from the CouchDB changes feed; never returns.

        Resets ``planned_tenders`` and ``last_seq_id`` and then calls
        ``handle_continuous_feed`` through ``do_until_success`` forever.
        """
        logger.info('Start Auctions Bridge with feed to couchdb',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        self.planned_tenders = {}
        self.last_seq_id = 0
        while True:
            do_until_success(self.handle_continuous_feed)

    def handle_continuous_feed(self):
        """Consume the continuous changes feed and plan the changed auctions.

        The feed is read from ``self.last_seq_id`` using the
        ``auctions/by_startDate`` filter. A change is skipped when its
        document has ``current_stage`` -100, has ``mode`` ``"test"``, or was
        already planned for the same start date.
        """
        change = self.db.changes(feed='continuous', filter="auctions/by_startDate",
                                 since=self.last_seq_id, include_docs=True)
        for tender_item in change:
            if 'id' in tender_item:
                start_date = tender_item['doc']['stages'][0]['start']
                if tender_item['doc'].get("current_stage", "") == -100:
                    continue

                if tender_item['doc'].get("mode", "") == "test":
                    logger.info('Sciped test auction {}'.format(tender_item['id']),
                                extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                    continue

                if tender_item['id'] in self.planned_tenders and \
                        self.planned_tenders[tender_item['id']] == start_date:
                    logger.debug('Tender {} filtered'.format(tender_item['id']))
                    continue
                logger.info('Tender {} selected for planning'.format(tender_item['id']),
                            extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
                self.start_auction_worker(tender_item)
                # Remember the planned start date so an unchanged auction is
                # not planned again.
                self.planned_tenders[tender_item['id']] = start_date
            elif 'last_seq' in tender_item:
                # Remember where to resume the feed on the next call.
                self.last_seq_id = tender_item['last_seq']

        logger.info('Resume data sync...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})

    def run(self):
        """Plan auctions from the HTTP API in an endless polling loop.

        Each pass starts the auction worker for every yielded tender, with a
        2 second pause after each, then sleeps 100 seconds before the next
        pass.
        """
        logger.info('Start Auctions Bridge',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        self.offset = ''
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
        while True:
            for tender_item in self.get_teders_list():
                logger.debug('Tender {} selected for planning'.format(tender_item))
                self.start_auction_worker(tender_item)
                sleep(2)
            logger.info('Sleep...',
                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})
            sleep(100)
            logger.info('Resume data sync...',
                        extra={'MESSAGE_ID': DATA_BRIDGE_PLANNING})

    def run_re_planning(self):
        """Make a single re-planning pass over the tenders of the HTTP API.

        Every yielded tender is handed to the auction worker and its id is
        added to ``self.tenders_ids_list`` so that it is not planned twice
        in this pass.
        """
        # NOTE(review): this attribute is not read anywhere in this class;
        # get_teders_list uses its own ``re_planning`` argument.
        self.re_planning = True
        self.tenders_ids_list = []
        self.offset = ''
        logger.info('Start Auctions Bridge for re-planning...',
                    extra={'MESSAGE_ID': DATA_BRIDGE_RE_PLANNING})
        for tender_item in self.get_teders_list(re_planning=True):
            logger.debug('Tender {} selected for re-planning'.format(tender_item))
            self.start_auction_worker(tender_item)
            self.tenders_ids_list.append(tender_item['id'])
            sleep(1)
        logger.info("Re-planning auctions finished",
                    extra={'MESSAGE_ID': DATA_BRIDGE_RE_PLANNING})
Ejemplo n.º 7
0
from json import load
from couchdb import Database
# Target CouchDB database for the standards documents.
db = Database("http://localhost:5985/standards")

# One document per subject: the subject name is used as the document id,
# title and description, and the JSON file supplies its children.
for subject, filename in (("english", "D10003FC.json"),
                          ("math", "D100011F.json")):
    # ``with`` closes the file promptly; the bare ``open`` used before
    # left the handle open until garbage collection.
    with open(filename) as source:
        children = load(source)
    data = {"_id": subject, "children": children, "title": subject, "description": subject}
    db.save(data)
Ejemplo n.º 8
0
from requests import get
from couchdb import Database
# CouchDB database whose documents are updated below.
db = Database("http://localhost:5985/lr-data")

# Separator between the two halves of a publisher string.
split_on = ", supported by"

# Search endpoint; ``{0}`` is replaced by the page number.
url = "http://12.109.40.31/search?terms=grade&page={0}"

# Walk the result pages from page 0 until an empty page comes back.
page = 0
while True:
    data = get(url.format(page)).json()
    if len(data) == 0:
        break
    for item in data:
        publisher = item['publisher']
        if publisher is None or split_on not in publisher:
            continue
        parts = [piece.strip() for piece in publisher.split(split_on)]
        if parts[0] != parts[1]:
            continue
        # Both halves name the same publisher: store it only once.
        print(parts)
        doc = db[item['_id']]
        item['publisher'] = parts[0]
        doc.update(item)
        print(db.save(doc))
    page += 1
Ejemplo n.º 9
0
class EdgeDataBridge(object):
    """Edge Bridge: copies tenders from the tenders API into a CouchDB database.

    Settings are read from the ``'main'`` section of ``config``:
    ``tenders_api_server``, ``tenders_api_version``, ``retrievers_params``,
    ``couch_url`` and ``public_db``. The constructor checks the settings and
    reports problems as ``DataBridgeConfigError``.
    """
    def __init__(self, config):
        """Create the API client and the CouchDB handle, validating ``config``.

        :param config: mapping with a ``'main'`` section holding the settings
            listed in the class docstring.
        :raises DataBridgeConfigError: when the API server setting is
            rejected with ``MissingSchema``, the database does not exist,
            the CouchDB connection is refused, or ``couch_url`` /
            ``public_db`` are missing.
        """
        super(EdgeDataBridge, self).__init__()
        self.config = config
        self.api_host = self.config_get('tenders_api_server')
        self.api_version = self.config_get('tenders_api_version')
        self.retrievers_params = self.config_get('retrievers_params')

        try:
            self.client = TendersClient(host_url=self.api_host,
                                        api_version=self.api_version,
                                        key='')
        except MissingSchema:
            raise DataBridgeConfigError(
                'In config dictionary empty or missing \'tenders_api_server\'')
        except ConnectionError:
            # Connection problems are passed to the caller unchanged.
            raise

        self.couch_url = urljoin(self.config_get('couch_url'),
                                 self.config_get('public_db'))
        self.db = Database(self.couch_url,
                           session=Session(retry_delays=range(10)))
        try:
            # Probe the database so configuration errors surface here.
            self.db.info()
        except ResourceNotFound:
            error_message = "Database with name '" + self.config_get(
                'public_db') + "' doesn\'t exist"
            raise DataBridgeConfigError(error_message)
        except error as e:
            # NOTE(review): errors other than ECONNREFUSED are swallowed
            # here, as before - confirm that this is intended.
            if e.errno == errno.ECONNREFUSED:
                raise DataBridgeConfigError(
                    "Connection refused: 'couch_url' is invalid in config dictionary"
                )
        except AttributeError:
            raise DataBridgeConfigError(
                '\'couch_url\' is missed or empty in config dictionary.')
        except KeyError as e:
            # ``e.message`` is Python 2 only; ``args[0]`` carries the
            # missing key on both Python 2 and Python 3.
            # NOTE(review): other missing keys are swallowed, as before.
            if e.args and e.args[0] == 'db_name':
                raise DataBridgeConfigError(
                    '\'public_db\' name is missed or empty in config dictionary'
                )

    def config_get(self, name):
        """Return option ``name`` from the ``'main'`` config section.

        :raises DataBridgeConfigError: when the ``'main'`` lookup fails with
            ``AttributeError`` (for example the section is missing).
        """
        try:
            return self.config.get('main').get(name)
        except AttributeError:
            raise DataBridgeConfigError(
                'In config dictionary missed section \'main\'')

    def get_teders_list(self):
        """Yield ``(tender_id, date_modified)`` for every item of the feed.

        Items come from ``get_tenders`` called with ``mode=_all_`` and the
        configured ``retrievers_params``.
        """
        for item in get_tenders(host=self.api_host,
                                version=self.api_version,
                                key='',
                                extra_params={'mode': '_all_'},
                                retrievers_params=self.retrievers_params):
            yield (item["id"], item["dateModified"])

    def save_tender_in_db(self, tender_id, date_modified):
        """Fetch tender ``tender_id`` from the API and store it in CouchDB.

        Nothing is fetched when the stored document already carries the same
        ``dateModified``. When a stored document exists its ``_rev`` is
        reused so the save updates it. Save failures are logged, not raised.

        :param tender_id: id of the tender, also used as the document ``_id``.
        :param date_modified: ``dateModified`` value reported by the feed.
        """
        tender_doc = self.db.get(tender_id)
        if tender_doc and tender_doc['dateModified'] == date_modified:
            # The stored copy is already up to date.
            return
        tender = self.client.get_tender(tender_id).get('data')
        if tender:
            tender['_id'] = tender_id
            tender['doc_type'] = 'Tender'
            if tender_doc:
                tender['_rev'] = tender_doc['_rev']
                logger.info('Update tender {} '.format(tender_id))
            else:
                logger.info('Save tender {} '.format(tender_id))
            try:
                self.db.save(tender)
            except Exception as e:
                # ``.message`` only exists on Python 2 exceptions; on
                # Python 3 reading it raised AttributeError inside this
                # handler and hid the real failure. Fall back to the
                # exception itself, which formats as ``str(e)``.
                reason = getattr(e, 'message', e)
                logger.info(
                    'Saving tender {} fail with error {}'.format(
                        tender_id, reason),
                    extra={'MESSAGE_ID': 'edge_bridge_fail_save_in_db'})
        else:
            logger.info('Tender {} not found'.format(tender_id))

    def run(self):
        """Walk the whole tender feed once, syncing every tender to CouchDB."""
        logger.info('Start Edge Bridge',
                    extra={'MESSAGE_ID': 'edge_bridge_start_bridge'})
        logger.info('Start data sync...',
                    extra={'MESSAGE_ID': 'edge_bridge__data_sync'})
        for tender_id, date_modified in self.get_teders_list():
            self.save_tender_in_db(tender_id, date_modified)
Ejemplo n.º 10
0
def setSentenceAsVisited(db: couchdb.Database, sentenceId: str):
    """Set the ``procesed`` flag on document ``sentenceId`` and save it to ``db``."""
    document = db[sentenceId]
    document.update({'procesed': True})
    db.save(document)
Ejemplo n.º 11
0
def set_urls_as_not_visited(db: couchdb.Database, not_visited_view):
    """Reset ``visited`` to False on every document listed by a view.

    The view named ``not_visited_view`` is iterated with ``db.iterview`` in
    batches of 100. The document behind each row is loaded, flagged and
    saved back one at a time.
    """
    row_ids = (row.id for row in db.iterview(not_visited_view, 100))
    for row_id in row_ids:
        document = db[row_id]
        document['visited'] = False
        db.save(document)
Ejemplo n.º 12
0
from requests import get
from couchdb import Database
# CouchDB database whose documents are updated below.
db = Database("http://localhost:5985/lr-data")

# Separator between the two halves of a publisher string.
split_on = ", supported by"

# Search endpoint; ``{0}`` is replaced by the page number.
url = "http://12.109.40.31/search?terms=grade&page={0}"

# Walk the result pages from page 0 until an empty page comes back.
page = 0
while True:
    data = get(url.format(page)).json()
    if len(data) == 0:
        break
    for item in data:
        publisher = item['publisher']
        if publisher is None or split_on not in publisher:
            continue
        parts = [piece.strip() for piece in publisher.split(split_on)]
        if parts[0] != parts[1]:
            continue
        # Both halves name the same publisher: store it only once.
        print(parts)
        doc = db[item['_id']]
        item['publisher'] = parts[0]
        doc.update(item)
        print(db.save(doc))
    page += 1