Example #1
0
    def log_context_attributes(ctx: Context) -> None:
        """Dump all attributes stored on the behave context.

        When the configured logging level is DEBUG (numeric value 10), every
        layer of the context stack is printed as a simple table of its
        non-None attributes.

        Args:
            ctx: The behave context

        """
        LOGGER.debug("Current attributes saved to context:")
        if ctx.config.logging_level == 10:
            # Same blue rule used above, between and below each layer table
            rule = ansicolor.blue(
                "-------------------------------------------------------------------------"
            )
            print("\n")
            for layer in ctx._stack:
                print(rule)
                print(ansicolor.blue(f"CONTEXT LAYER: {layer['@layer']}"))
                print(rule)
                # Pretty-print each key/value pair, key padded to 30 chars.
                for key, value in layer.items():
                    # Don't log standard attributes that are None
                    if value is not None:
                        print("%s %s| %r" % (key, " " * (30 - len(key)), value))
                print(rule)
            print("\n")
Example #2
0
def restoreDump(args):
    """Restore YakDB tables from previously exported YDF dump files.

    Partial and incremental restores are supported: each table is restored
    only if its dump file exists and the corresponding --no-* flag is unset.
    Missing files are reported but do not abort the run.
    """
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Filenames the tables were dumped to
    filenames = __getDumpFilenames(args)
    # (skip flag, filename index, YakDB table number, human-readable name)
    tables = [
        (args.no_documents, 0, 1, "document"),
        (args.no_entities, 1, 2, "entity"),
        (args.no_document_idx, 2, 3, "document index"),
        # BUGFIX: the missing-file message previously said "document index"
        # for the entity index table (copy-paste error).
        (args.no_entity_idx, 3, 4, "entity index"),
    ]
    for skip, idx, tableNo, name in tables:
        if skip:
            continue
        filename = filenames[idx]
        if not os.path.isfile(filename):
            print(red("Can't find %s table file %s" % (name, filename), bold=True))
        else:  # It's a regular file
            print(blue("Restoring %s table from %s" % (name, filename), bold=True))
            importYDFDump(conn, filename, tableNo)
Example #3
0
def truncate(args):
    "Delete data from one or more tables"
    # Refuse to run without the explicit confirmation flag
    if not args.yes_i_know_what_i_am_doing:
        print (red("This will delete all your Translatron data. If you are sure, please use --yes-i-know-what-i-am-doing ", bold=True))
        return
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # (skip flag, YakDB table number, human-readable name)
    for skip, tableNo, label in (
            (args.no_documents, 1, "document"),
            (args.no_entities, 2, "entity"),
            (args.no_document_idx, 3, "document index"),
            (args.no_entity_idx, 4, "entity index")):
        if skip:
            continue
        print(blue("Truncating %s table... " % label, bold=True))
        # --hard drops the table outright; otherwise delete the full key range
        if args.hard:
            conn.truncateTable(tableNo)
        else:
            conn.deleteRange(tableNo, None, None, None)
Example #4
0
def checkConnection(args):
    """Check both the request/reply and the push/pull YakDB connections.

    Prints the server info for the REQ/REP endpoint and a success message
    after each connection attempt actually completes.
    """
    import YakDB
    # Check request/reply connection
    print(blue("Checking request/reply connection...", bold=True))
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Request server info to prove the round-trip works
    print((conn.serverInfo()).decode("utf-8"))
    print(green("REQ/REP connection attempt successful"))
    # Check push/pull connection
    print(blue("Checking push/pull connection...", bold=True))
    conn = YakDB.Connection()
    conn.usePushMode()
    conn.connect(args.push_endpoint)
    # BUGFIX: the success message was previously printed BEFORE the push
    # connection was created/connected, so it reported success unconditionally.
    print(green("PUSH/PULL connection attempt successful"))
def startWebsocketServer():
    """Start the Translatron websocket server on port 9000 (blocks until interrupted)."""
    print(blue("Websocket server starting up..."))

    try:
        import asyncio
    except ImportError:
        ## Trollius >= 0.3 was renamed
        import trollius as asyncio

    #Asyncio only setups an event loop in the main thread, else we need to
    # create and register one explicitly for this worker thread.
    if threading.current_thread().name != 'MainThread':
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    factory = WebSocketServerFactory("ws://0.0.0.0:9000", debug = False)
    factory.protocol = TranslatronProtocol

    loop = asyncio.get_event_loop()
    server = loop.create_server(factory, '0.0.0.0', 9000)
    server = loop.run_until_complete(server)

    try:
        loop.run_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Always release the listening socket and the event loop on shutdown
        server.close()
        loop.close()
def startWebsocketServer():
    """Run the Translatron websocket server on port 9000 until interrupted."""
    print(blue("Websocket server starting up..."))

    try:
        import asyncio
    except ImportError:
        # Trollius >= 0.3 was renamed to asyncio
        import trollius as asyncio

    # asyncio only installs an event loop automatically in the main thread;
    # when running from a worker thread, create and register one explicitly.
    if threading.current_thread().name != 'MainThread':
        asyncio.set_event_loop(asyncio.new_event_loop())

    ws_factory = WebSocketServerFactory("ws://0.0.0.0:9000", debug=False)
    ws_factory.protocol = TranslatronProtocol

    event_loop = asyncio.get_event_loop()
    server = event_loop.run_until_complete(
        event_loop.create_server(ws_factory, '0.0.0.0', 9000))

    try:
        event_loop.run_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Release the listening socket and the loop on shutdown
        server.close()
        event_loop.close()
    def parse(self, response):
        """Parse a law's overview page into a Law model instance.

        Extracts the scalar fields, resolves the legislative period from the
        URL, gets/creates the Category, upserts the Law, attaches its foreign
        keys, and finally delegates to the step parsers if the corresponding
        tabs exist on the page.
        """
        # Extract fields
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        status = LAW.STATUS.xt(response)

        # The LLP's roman numeral is the 4th-from-last URL path segment
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

        # Extract foreign keys
        category = LAW.CATEGORY.xt(response)
        description = LAW.DESCRIPTION.xt(response)

        # Create category if we don't have it yet
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # Create and save Law; (parl_id, LLP, source_link) identify the row,
        # the defaults are refreshed on every crawl
        law_data = {
            'title': title,
            'status': status,
            'description': description
        }
        law_item, law_created = Law.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            source_link=response.url,
            defaults=law_data)

        # Attach foreign keys
        law_item.keywords = self.parse_keywords(response)
        law_item.category = cat
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Log our progress
        if law_created:
            logtext = u"Created {} with id {}, LLP {} @ {}"
        else:
            logtext = u"Updated {} with id {}, LLP {} @ {}"

        logtext = logtext.format(
            red(title),
            cyan(u"[{}]".format(parl_id)),
            green(str(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        # Pass the saved item on to the follow-up step parsers via meta
        response.meta['law_item'] = law_item

        # is the tab 'Parlamentarisches Verfahren available?'
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)

        if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
            self.parse_pre_parliament_steps(response)
Example #8
0
    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses.

        Extracts the response fields, gets/creates the Category, resolves the
        sender Person (skipping the item if that fails), upserts the
        InquiryResponse and links it back to the originating inquiry.
        """
        inquiry_item = response.meta['inquiry_item']
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; `except Exception` keeps the best-effort behavior while
        # letting those propagate. Message also said "Receiver" although the
        # lookup is for the response's sender.
        try:
            sender_object = Person.objects.get(
                parl_id=INQUIRY.RESPONSESENDER.xt(response))
        except Exception:
            log.msg(
                red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'
                    .format(parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            })

        # Attach foreign Keys
        inquiryresponse_item.documents = self.parse_response_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(self.SCRAPED_COUNTER, self.TOTAL_COUNTER,
                                 cyan(title), cyan(u"{}".format(parl_id)),
                                 green(str(LLP)), blue(response.url))
        log.msg(logtext, level=log.INFO)

        # Link the response back to its inquiry
        inquiry_item.response = inquiryresponse_item
        inquiry_item.save()

        return
Example #9
0
def importEntities(args):
    """Import entity databases (UniProt, MeSH, or Wikipedia page titles).

    The importer is selected by matching each input file's basename against
    known filename patterns; unrecognized files are reported and skipped.
    Importer modules are imported lazily so unused dependencies never load.
    """
    for infile in args.infile:
        basename = os.path.basename(infile)
        if re.match(r"uniprot_[a-z]+\.dat\.gz", basename):
            print(blue("Importing UniProt file..."))
            from Translatron.Entities.UniProtImporter import importUniprot
            importUniprot(args, infile)
        # FIX: the '.' before "bin" was unescaped and matched any character
        elif re.match(r"d\d{4}\.bin", basename):
            print(blue("Importing MeSH file..."))
            from Translatron.Entities.MeSHImporter import importMeSH
            importMeSH(args, infile)
        elif re.match(r"[a-z][a-z]wiki.+titles.+\.gz", basename):
            print(blue("Importing Wikipedia page title file..."))
            from Translatron.Entities.WikipediaImporter import importWikimediaPagelist
            importWikimediaPagelist(args, infile)
        else:
            print(red("Can't interpret entity input file (uniprot_sprot.dat.gz - UniProt) %s " % basename))
Example #10
0
def print_section(text: str, color: bool = False) -> None:
    """Print *text* as a section heading.

    With ``color`` the heading is rendered as a reverse-video blue bar;
    otherwise a 60-dash rule is printed underneath it.
    """
    if color:
        heading = ansicolor.blue(text.ljust(60, ' '), reverse=True)
        underline = ''
    else:
        heading = text
        underline = '\n' + ''.ljust(60, '-')

    print('\n' + heading + underline)
Example #11
0
def test_colordiff():
    """colordiff styles each differing character and leaves the common prefix plain."""
    x, y = colordiff("hi bob", "hi there",
                     color_x=Colors.Red, color_y=Colors.Blue)

    def mark_x(ch):
        return red(ch, reverse=True)

    def mark_y(ch):
        return blue(ch, reverse=True)

    assert x == "hi " + "".join(mark_x(c) for c in "bob")
    assert y == "hi " + "".join(mark_y(c) for c in "there")
Example #12
0
def compact(args):
    "Compact one or more tables"
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Compact every table whose --no-* flag is unset
    for skip, tableNo, label in (
            (args.no_documents, 1, "document"),
            (args.no_entities, 2, "entity"),
            (args.no_document_idx, 3, "document index"),
            (args.no_entity_idx, 4, "entity index")):
        if skip:
            continue
        print(blue("Compacting %s table... " % label, bold=True))
        conn.compactRange(tableNo)
Example #13
0
def test_colordiff():
    # colordiff should leave the common prefix "hi " unstyled and wrap each
    # differing character of either side in the requested reverse-video color.
    x, y = colordiff("hi bob", "hi there",
                     color_x=Colors.Red, color_y=Colors.Blue)

    # Expected per-character stylers for the x and y sides of the diff
    fx = lambda s: red(s, reverse=True)
    fy = lambda s: blue(s, reverse=True)

    assert x == "hi " + fx("b") + fx("o") + fx("b")
    assert y == "hi " + fy("t") + fy("h") + fy("e") + fy("r") + fy("e")
Example #14
0
def exportDump(args):
    """Dump all (non-skipped) YakDB tables to their YDF dump files."""
    # Setup raw YakDB connection
    conn = YakDB.Connection()
    conn.connect(args.req_endpoint)
    # Filenames to dump to
    filenames = __getDumpFilenames(args)
    # (skip flag, filename index, YakDB table number, human-readable name)
    for skip, idx, tableNo, label in (
            (args.no_documents, 0, 1, "document"),
            (args.no_entities, 1, 2, "entity"),
            (args.no_document_idx, 2, 3, "document index"),
            (args.no_entity_idx, 3, 4, "entity index")):
        if skip:
            continue
        print(blue("Dumping %s table to %s" % (label, filenames[idx]), bold=True))
        dumpYDF(conn, filenames[idx], tableNo)
Example #15
0
def startHTTPServer(http_port=8080):
    """Start the CherryPy HTTP server serving ./static (blocks until stopped)."""
    print(blue("HTTP server starting up, listening on port %d..." % http_port))
    # Fixes six issue: http://goo.gl/ebeWDN
    cherrypy.config.update({'engine.autoreload.on': False})
    static_dir = os.path.join(os.path.abspath(os.getcwd()), "static")
    conf = {
        'global': {
            'server.socket_host': '0.0.0.0',
            'server.socket_port': http_port,
        },
        '/': {
            'tools.staticdir.on': True,
            'tools.staticdir.dir': static_dir,
            'tools.staticdir.index': 'index.html',
        },
    }
    print("Static: " + static_dir)
    cherrypy.quickstart(TranslatronServer(), "/", conf)
Example #16
0
def f_check():
    # Probe the target WebLogic server for the CVE-2017-3506 / CVE-2017-10271
    # XML deserialization vulnerabilities and report the verdict.
    # NOTE(review): Python 2 syntax (print statements); relies on the
    # module-level `args` namespace produced by the argument parser.
    global timeout, base_url
    timeout = args.timeout
    logging.captureWarnings(True)  # Capture the ssl warnings with the standard logging module

    # Build the target URL from the CLI host/port/path, honoring --ssl
    if args.ssl:
        base_url = "https://{}:{}/{}".format(args.host, args.port, args.url)
    else:
        base_url = "http://{}:{}/{}".format(args.host, args.port, args.url)

    f_verbose("[*] Program will check out WebLogic for CVE-2017-3506 & 10271 vulnerability.")

    # f_run() is truthy when the probe indicates the target is exploitable
    if f_run():
        print red("[x]") + " Your system is potentially vulnerable to XML Serialization attack!"
    else:
        print green("[*]") + " Your system is " + blue("safe!")
Example #17
0
def invoke(cwd, args):
    """Run the command *args* in directory *cwd* and capture its output.

    Returns:
        (ret, out, err): the process return code and the stripped stdout and
        stderr text. When the command fails with empty stderr, ``err`` is the
        return code itself (preserved legacy behavior).
    """
    logger.debug("Invoking: [%s] '%s'" % (cwd, ' '.join(args)))
    popen = subprocess.Popen(args, cwd=cwd,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (out, err) = popen.communicate()
    # FIX: communicate() returns bytes on Python 3; str(bytes) would embed
    # the "b'...'" repr into the output. Decode when needed instead.
    out = out.decode(errors="replace").strip() if isinstance(out, bytes) else str(out).strip()
    err = err.decode(errors="replace").strip() if isinstance(err, bytes) else str(err).strip()
    ret = popen.returncode
    if out:
        for line in out.split('\n'):
            logger.debug("> %s" % ansicolor.blue(line))
    if ret:
        if not err:
            err = ret
        # logger.warn() is deprecated in favor of warning()
        logger.warning("Returned %s" % (err))
    return ret, out, err
def readAndProcessXLIFF(lang,
                        filename,
                        fileid,
                        indexer,
                        autotranslator,
                        upload=False,
                        approve=False,
                        autotranslate=True,
                        preindex=False,
                        overwrite=False,
                        fullauto_account=False,
                        postproc=identity):
    """Read one XLIFF file, autotranslate it, and optionally export/upload it.

    Returns the number of strings that were autotranslated (0 when
    autotranslation is disabled).
    """
    soup = parse_xliff_file(filename)
    translated = process_xliff_soup(filename,
                                    soup,
                                    autotranslator,
                                    indexer,
                                    autotranslate=autotranslate,
                                    preindex=preindex,
                                    overwrite=overwrite,
                                    postproc=postproc)
    # Nothing to export when we are not autotranslating
    if not autotranslate:
        return 0
    # Derive the export path by swapping the cache dir for the output dir
    export_dir = "output-{}".format(lang)
    export_path = filename.replace("cache/{}".format(lang), export_dir)
    # Only export (and possibly upload) when something was actually translated
    if translated > 0:
        os.makedirs(os.path.dirname(export_path), exist_ok=True)
        export_xliff_file(soup, export_path)
        if upload:
            basename = os.path.basename(filename)
            print(blue("Uploading {} (approve={})...".format(basename, approve)))
            upload_file(export_path,
                        fileid,
                        auto_approve=approve,
                        lang=lang,
                        fullauto_account=fullauto_account,
                        duplicates=overwrite)
            print(green("Uploaded {}".format(basename)))
    return translated
Example #19
0
def invoke(cwd, args):
    """Run the command *args* in directory *cwd* and capture its output.

    Returns:
        (ret, out, err): the process return code and the stripped stdout and
        stderr text. When the command fails with empty stderr, ``err`` is the
        return code itself (preserved legacy behavior).
    """
    logger.debug("Invoking: [%s] '%s'" % (cwd, ' '.join(args)))
    popen = subprocess.Popen(args,
                             cwd=cwd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    (out, err) = popen.communicate()
    # FIX: communicate() returns bytes on Python 3; str(bytes) would embed
    # the "b'...'" repr into the output. Decode when needed instead.
    out = out.decode(errors="replace").strip() if isinstance(out, bytes) else str(out).strip()
    err = err.decode(errors="replace").strip() if isinstance(err, bytes) else str(err).strip()
    ret = popen.returncode
    if out:
        for line in out.split('\n'):
            logger.debug("> %s" % ansicolor.blue(line))
    if ret:
        if not err:
            err = ret
        # logger.warn() is deprecated in favor of warning()
        logger.warning("Returned %s" % (err))
    return ret, out, err
Example #20
0
def startHTTPServer(http_port=8080):
    """Start the CherryPy HTTP server serving ./static (blocks until stopped)."""
    print(blue("HTTP server starting up, listening on port %d..." % http_port))
    # Fixes six issue: http://goo.gl/ebeWDN
    cherrypy.config.update({'engine.autoreload.on': False})
    static_dir = os.path.join(os.path.abspath(os.getcwd()), "static")
    global_conf = {
        'server.socket_host': '0.0.0.0',
        'server.socket_port': http_port,
    }
    static_conf = {
        'tools.staticdir.on': True,
        'tools.staticdir.dir': static_dir,
        'tools.staticdir.index': 'index.html',
    }
    print("Static: " + static_dir)
    cherrypy.quickstart(TranslatronServer(), "/",
                        {'global': global_conf, '/': static_conf})
def f_verbose(value):
    # Print a status line with color highlighting: text between backtick
    # pairs is rendered blue, and the severity tags [X]/[+]/[*]/[!] are
    # colorized. Lines tagged [X] or [+] are also appended to the report
    # file via f_save(). NOTE(review): Python 2 syntax (print statements);
    # relies on the module-level `args` namespace.
    if ("[X]" in value) or ("[+]" in value):
        f_save(value + '\n')

    # Split on backticks: odd-indexed chunks were inside `...` -> colorize
    col_cred = value.split('`')
    neutrino = ''

    for index, item in enumerate(col_cred):
        if index & 1:
            neutrino = neutrino + blue(item)
        else:
            neutrino += item
    # [X]/[+] lines always print; [*]/[!] lines only with --verbose
    if "[X]" in neutrino:
        print neutrino.replace("[X]", red("[X]"))
    elif "[+]" in neutrino:
        print neutrino.replace("[+]", yellow("[+]"))
    elif args.verbose:
        print neutrino.replace("[*]",
                               green("[*]")).replace("[!]", magenta("[!]"))

    return
Example #22
0
def parse_check_format(intv, verbose=False):
    """
    Checks if a string could contain an interval. This function just exists to
    shorten read_timeline() to something that Python won't throw warnings about.
    There's no need to call this outside that function.
        :param intv: is the string to check.
        :param verbose: is an optional flag.
            When true, extra parsing information is printed to the console. Defaults to false.
    """
    # "[#] [#] [#]" needs at least 3 digits + 2 spaces = 5 characters, so any
    # shorter string cannot possibly be an interval.
    if len(intv) >= 5:
        # Haven't ruled it out yet
        if verbose:
            print(blue("\tpossible interval > ", bold=True), end='')
            print(add_quotes(intv))
        return True

    if verbose:
        print(black("\tinvalid length > ", bold=True), end='')
        print(add_quotes(intv))
    return False
    def parse_parliament_steps(self, response):
        """
        Callback function to parse the additional 'Parlamentarisches Verfahren'
        page.

        For the law stored in response.meta['law_item'], upserts each phase,
        each step within a phase, and each statement attached to a step.
        Statements whose speaker cannot be uniquely resolved to a Person are
        skipped with a log message.
        """
        law_item = response.meta['law_item']

        phases = LAW.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))))

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    # Empty string when no protocol URL was scraped
                    protocol_url=step['protocol_url']
                        if step['protocol_url'] else u'',
                    law=law_item,
                    phase=phase_item,
                    source_link=response.url
                )
                step_item.save()

                # Save statements for this step, if applicable
                if 'statements' in step['title']:
                    for stmnt in step['title']['statements']:
                        # Find the person; match on the tail of the source
                        # link since the scraped link may be relative
                        pq = Person.objects.filter(
                            source_link__endswith=stmnt['person_source_link'])
                        if pq.exists() and pq.count() == 1:
                            person_item = pq.first()
                            st_data = {
                                'speech_type': stmnt['statement_type'],
                                'protocol_url': stmnt['protocol_link'][0] if stmnt['protocol_link'] else None
                            }

                            # (index, person, step) identify a statement
                            st_item, st_created = Statement.objects.update_or_create(
                                index=stmnt['index'],
                                person=person_item,
                                step=step_item,
                                defaults=st_data)
                            # if st_created:
                            #     log.msg(u"Created Statement by {} on {}".format(
                            #         green(
                            #             u'[{}]'.format(person_item.full_name)),
                            #         step_item.date))
                            # else:
                            #     log.msg(u"Updated Statement by {} on {}".format(
                            #         green(
                            #             u'[{}]'.format(person_item.full_name)),
                            #         step_item.date))
                        else:
                            # We can't save statements if we can't find the
                            # Person
                            log.msg(
                                red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                    green(
                                        u'[{}]'.format(stmnt['person_name'])),
                                    blue(
                                        "[{}]".format(stmnt['person_source_link'])),
                                    red("{}").format(
                                        "" if pq.exists() else " not"),
                                    "" if pq.count() > 1 else ", but {} persons matching found!".format(
                                        pq.count())
                                ))
                            continue
Example #24
0
class InquiriesSpider(BaseSpider):
    """Spider that scrapes parliamentary inquiries (J, JPR, JEU, M, J-BR...)
    from www.parlament.gv.at via the site's RSS filter endpoint.

    For each inquiry page it creates/updates the Inquiry plus related
    Category, Keyword, Document, Person (sender/receiver), Phase, Step and
    Statement objects, and schedules follow-up requests for written
    inquiry responses.
    """
    BASE_URL = "{}/{}".format(BASE_HOST, "PAKT/JMAB/filter.psp")

    # Query-string defaults for the RSS filter endpoint. 'GP' (legislative
    # period) and 'NRBR' (Nationalrat/Bundesrat) are overridden per request
    # in start_requests().
    URLOPTIONS = {
        'view': 'RSS',
        'jsMode': 'RSS',
        'xdocumentUri': '/PAKT/JMAB/index.shtml',
        'NRBR': 'NR',
        'anwenden': 'Anwenden',
        'JMAB': 'J_JPR_M',
        'VHG2': 'ALLE',
        'SUCH': '',
        'listeId': '105',
        'FBEZ': 'FP_005'
    }

    name = "inquiries"
    # NOTE(review): class-level mutable attribute — shared by all instances
    # of this spider; confirm this is intentional and not meant to be set
    # per-instance in __init__.
    inquiries_scraped = []

    def __init__(self, **kw):
        """Initialize the spider.

        Keyword Args:
            llp: optional legislative period number; restricts scraping to
                that single period (falls back to the inherited default on
                a malformed value).
            url: optional single-page URL override, used for testing.
        """
        super(InquiriesSpider, self).__init__(**kw)

        if 'llp' in kw:
            try:
                self.LLP = [int(kw['llp'])]
            except:
                # NOTE(review): bare except silently ignores a malformed
                # 'llp' argument; should be `except (ValueError, TypeError)`.
                pass

        self.cookies_seen = set()
        self.idlist = {}
        self.url_override = kw.get('url', None)

    def start_requests(self):
        """
        Returns a list of URLs to scrape
        """
        # This predefined list of URLs is chosen to include all types of
        # inquiries possible in the Austrian parliament in order to provide a
        # suitable testing surface for new functions.
        # urls = ["https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00019/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/JPR/JPR_00016/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06954/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/M/M_00178/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/JEU/JEU_00003/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XXV/J/J_06758/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03089/index.shtml",
        #         "https://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_03091/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/BR/J-BR/J-BR_01155/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06110/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_06651/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04024/index.shtml", "http://www.parlament.gv.at/PAKT/VHG/XX/J/J_04025/index.shtml", "https://www.parlament.gv.at/PAKT/VHG/XX/M/M_00178/index.shtml"]
        urls = [] if not self.url_override else [self.url_override]

        # Without an explicit URL override, collect inquiry links from the
        # RSS feed of every requested legislative period, for both chambers.
        if self.LLP and not self.url_override:
            for i in self.LLP:
                for nrbr in ['NR', 'BR']:
                    roman_numeral = roman.toRoman(i)
                    options = self.URLOPTIONS.copy()
                    options['GP'] = roman_numeral
                    options['NRBR'] = nrbr
                    url_options = urlencode(options)
                    url_llp = "{}?{}".format(self.BASE_URL, url_options)
                    rss = feedparser.parse(url_llp)

                    self.logger.info("GP {}: {} inquiries from {}".format(
                        roman_numeral, len(rss['entries']), nrbr)
                    )
                    urls = urls + [entry['link'] for entry in rss['entries']]
        self.TOTAL_COUNTER = len(urls)
        for url in urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        """Parse one inquiry detail page.

        Creates/updates the Inquiry and its related objects and returns a
        list of follow-up Requests (for the written inquiry response, if
        one exists).
        """
        self.SCRAPED_COUNTER += 1

        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        # parl_id is the second-to-last URL path component.
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if("BR" in category):
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])
        # Skip unchanged inquiries unless timestamps are explicitly ignored.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.debug(
                green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                    self.SCRAPED_COUNTER,
                    self.TOTAL_COUNTER,
                    title)))
            return

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.debug(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # Try/catch in case person does not exist in the database.
        try:
            for sender_object in INQUIRY.SENDER.xt(response):
                sender_objects.append(Person.objects.get(
                    parl_id=sender_object))
        except:
            # NOTE(review): bare except — should be except Person.DoesNotExist
            # (or at least except Exception) to avoid masking interrupts.
            log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.SENDER.xt(response), parl_id, LLP)))
            return
        try:
            receiver_object = Person.objects.get(
                parl_id=INQUIRY.RECEIVER.xt(response))
        except:
            # NOTE(review): bare except — same concern as above.
            log.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RECEIVER.xt(response), parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )

        # Newly scraped inquiries start in the 'offen' (open) status.
        if inquiry_created:
            inquiry_item.status = 'offen'

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accomodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item

                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

        # NOTE(review): the template has 6 placeholders but 7 positional args
        # are supplied; str.format silently ignores the trailing keywords arg.
        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url),
            green(u"{}".format(inquiry_item.keywords))
        )
        # Promote every 1000th item to INFO so long runs show progress.
        log.msg(logtext, level=log.DEBUG if not self.SCRAPED_COUNTER%1000==0 else log.INFO)

        # log.msg(green("Open Callback requests: {}".format(
        #   len(callback_requests))), level=log.INFO)

        return callback_requests

    def has_changes(self, parl_id, legislative_period, source_link, ts):
        """Return True if this Inquiry is new or its remote timestamp
        differs from the one stored in the database."""
        if not Inquiry.objects.filter(
            parl_id=parl_id,
            legislative_period=legislative_period,
            source_link=source_link
        ).exists():
            return True

        # Compare as timezone-aware (UTC) timestamps.
        ts = ts.replace(tzinfo=pytz.utc)
        if Inquiry.objects.get(
                parl_id=parl_id,
                legislative_period=legislative_period,
                source_link=source_link).ts != ts:
            return True
        return False

    def parse_keywords(self, response):
        """Extract keywords from the page and return the corresponding
        Keyword objects, creating any that do not exist yet."""
        keywords = INQUIRY.KEYWORDS.xt(response)

        # Create all keywords we don't yet have in the DB
        keyword_items = []
        for keyword in keywords:
            kw, created = Keyword.objects.get_or_create(title=keyword)
            if created:
                log.msg(u"Created keyword {}".format(
                    green(u'[{}]'.format(keyword))),level=log.DEBUG)
            keyword_items.append(kw)

        return keyword_items

    def parse_docs(self, response):
        """Extract the inquiry's attachments and return the corresponding
        Document objects, creating any that do not exist yet."""

        docs = INQUIRY.DOCS.xt(response)

        # Create all docs we don't yet have in the DB
        doc_items = []
        for document in docs:
            doc, created = Document.objects.get_or_create(
                title=document['title'],
                html_link=document['html_url'],
                pdf_link=document['pdf_url'],
                stripped_html=None
            )
            doc_items.append(doc)
        return doc_items

    def parse_response_docs(self, response):
        """Extract the inquiry *response* attachments and return the
        corresponding Document objects, creating any that do not exist yet."""

        docs = INQUIRY.RESPONSEDOCS.xt(response)

        # Create all docs we don't yet have in the DB
        doc_items = []
        for document in docs:
            doc, created = Document.objects.get_or_create(
                title=document['title'],
                html_link=document['html_url'],
                pdf_link=document['pdf_url'],
                stripped_html=None
            )
            doc_items.append(doc)
        return doc_items

    def parse_steps(self, response):
        """
            Callback function to parse the single-page history for normal inquiries

            Returns the link to the written response page if one is present,
            otherwise None.
        """
        response_link = []
        inquiry_item = response.meta['inquiry_item']

        # Get or created a default-phase for inquiries, because there are no phases in
        # simple inquiries.
        phase_item, created = Phase.objects.get_or_create(
            title='default_inqu')
        if created:
            log.msg(u"Created Phase {}".format(
                green(u'[{}]'.format(phase_item.title))),level=log.DEBUG)

        steps = INQUIRY.STEPS.xt(response)

        # A "Schriftliche Beantwortung" step means a written response exists.
        for step in steps:
            if "Schriftliche Beantwortung" in step["title"]:
                response_link = INQUIRY.RESPONSE_LINK.xt(response)

        for step in steps:
            step_item, created = Step.objects.update_or_create(
                title=step['title'],
                sortkey=step['sortkey'],
                date=step['date'],
                protocol_url=step['protocol_url'],
                law=inquiry_item,
                phase=phase_item,
                source_link=response.url
            )
            step_item.save()
        if response_link:
            return response_link
        else:
            return

    def parse_parliament_steps(self, response):
        """
        Callback function to parse the additional 'Parlamentarisches Verfahren'
        page.
        """
        inquiry_item = response.meta['inquiry_item']

        phases = INQUIRY.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))),level=log.DEBUG)

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    protocol_url=step['protocol_url'],
                    law=inquiry_item,
                    phase=phase_item,
                    source_link=response.url
                )
                step_item.save()
                if created:
                    log.msg(u"Created Step {}".format(
                        green(u'[{}]'.format(step_item.title))),level=log.DEBUG)

                # Save statements for this step, if applicable
                if 'statements' in step['title']:
                    for stmnt in step['title']['statements']:
                        # Find the person
                        pq = Person.objects.filter(
                            source_link__endswith=stmnt['person_source_link'])
                        if pq.exists() and pq.count() == 1:
                            person_item = pq.first()
                            st_data = {
                                'speech_type': stmnt['statement_type'],
                                'protocol_url': stmnt['protocol_link']
                            }
                            st_item, st_created = Statement.objects.update_or_create(
                                index=stmnt['index'],
                                person=person_item,
                                step=step_item,
                                defaults=st_data)
                            if st_created:
                                log.msg(u"Created Statement by {} on {}".format(
                                    green(
                                        u'[{}]'.format(person_item.full_name)),
                                    step_item.date),level=log.DEBUG)
                            else:
                                log.msg(u"Updated Statement by {} on {}".format(
                                    green(
                                        u'[{}]'.format(person_item.full_name)),
                                    step_item.date),level=log.DEBUG)
                        else:
                            # We can't save statements if we can't find the
                            # Person
                            # NOTE(review): the last format clause looks
                            # inverted — it emits ", but 0 persons matching
                            # found!" when *no* person matched and nothing
                            # when multiple matched; `pq.count() <= 1` was
                            # probably intended.
                            log.warning(
                                red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                    green(
                                        u'[{}]'.format(stmnt['person_name'])),
                                    blue(
                                        "[{}]".format(stmnt['person_source_link'])),
                                    red("{}").format(
                                        "" if pq.exists() else " not"),
                                    "" if pq.count() > 1 else ", but {} persons matching found!".format(
                                        pq.count())
                                ))
                            continue

    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses
        """
        inquiry_item = response.meta.get('inquiry_item',None) # allow testing single urls for parsing errors
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period if inquiry_item else None
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))),level=log.DEBUG)

        try:
            sender_object = Person.objects.get(
                parl_id=INQUIRY.RESPONSESENDER.xt(response))
        # Python 2 comma syntax; `e` is captured but never used.
        except Exception, e:
            log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RESPONSESENDER.xt(response), parl_id, LLP)))
            return

        if not inquiry_item:
            # Debug aid when testing single URLs without an inquiry context.
            print locals()
            return # allow testing single urls for parsing errors

        # Create or update Inquiry item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            }
        )

        # Attach foreign Keys
        inquiryresponse_item.documents = self.parse_response_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"[{} of {}] Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated InquiryResponse {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        # NOTE(review): SCRAPED_COUNTER was already incremented in parse(),
        # so this condition effectively always selects DEBUG — confirm intent.
        log.msg(logtext, level=log.DEBUG if self.SCRAPED_COUNTER!=0 else log.INFO)

        # Link the response back to its inquiry and mark it answered.
        inquiry_item.response = inquiryresponse_item
        inquiry_item.status = 'response_received'
        inquiry_item.save()

        return
Example #25
0
    def parse_parliament_steps(self, response):
        """
        Callback function to parse the additional 'Parlamentarisches Verfahren'
        page.

        Creates/updates Phase, Step and Statement objects for the inquiry
        stored in ``response.meta['inquiry_item']``. Statements are only
        saved when exactly one Person matches the scraped source link.
        """
        inquiry_item = response.meta['inquiry_item']

        phases = INQUIRY.PHASES.xt(response)

        for phase in phases:
            # Create phase if we don't have it yet
            phase_item, created = Phase.objects.get_or_create(
                title=phase['title'])
            if created:
                log.msg(u"Created Phase {}".format(
                    green(u'[{}]'.format(phase_item.title))), level=log.DEBUG)

            # Create steps
            for step in phase['steps']:
                step_item, created = Step.objects.update_or_create(
                    title=step['title']['text'],
                    sortkey=step['sortkey'],
                    date=step['date'],
                    protocol_url=step['protocol_url'],
                    law=inquiry_item,
                    phase=phase_item,
                    source_link=response.url
                )
                step_item.save()
                if created:
                    log.msg(u"Created Step {}".format(
                        green(u'[{}]'.format(step_item.title))), level=log.DEBUG)

                # Save statements for this step, if applicable
                if 'statements' in step['title']:
                    for stmnt in step['title']['statements']:
                        # Find the person
                        pq = Person.objects.filter(
                            source_link__endswith=stmnt['person_source_link'])
                        if pq.exists() and pq.count() == 1:
                            person_item = pq.first()
                            st_data = {
                                'speech_type': stmnt['statement_type'],
                                'protocol_url': stmnt['protocol_link']
                            }
                            st_item, st_created = Statement.objects.update_or_create(
                                index=stmnt['index'],
                                person=person_item,
                                step=step_item,
                                defaults=st_data)
                            if st_created:
                                log.msg(u"Created Statement by {} on {}".format(
                                    green(
                                        u'[{}]'.format(person_item.full_name)),
                                    step_item.date), level=log.DEBUG)
                            else:
                                log.msg(u"Updated Statement by {} on {}".format(
                                    green(
                                        u'[{}]'.format(person_item.full_name)),
                                    step_item.date), level=log.DEBUG)
                        else:
                            # We can't save statements if we can't find the
                            # Person.
                            # BUGFIX: the trailing clause used `pq.count() > 1`,
                            # which printed ", but 0 persons matching found!"
                            # precisely when nobody matched and stayed silent
                            # on ambiguous multi-matches; `<= 1` restores the
                            # intended meaning.
                            log.warning(
                                red(u"Skipping Statement by {}: Person with source_link {} does{} exist{}").format(
                                    green(
                                        u'[{}]'.format(stmnt['person_name'])),
                                    blue(
                                        "[{}]".format(stmnt['person_source_link'])),
                                    red("{}").format(
                                        "" if pq.exists() else " not"),
                                    "" if pq.count() <= 1 else ", but {} persons matching found!".format(
                                        pq.count())
                                ))
                            continue
Example #26
0
    def parse(self, response):
        """Parse one inquiry detail page.

        Creates/updates the Inquiry plus its Category, Keywords, Documents,
        sender/receiver Persons and steps, and returns follow-up Requests
        for the written inquiry response, if one exists.
        """
        self.SCRAPED_COUNTER += 1

        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        # parl_id is the second-to-last URL path component.
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])
        # Skip unchanged inquiries unless timestamps are explicitly ignored.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.debug(
                green(u"[{} of {}] Skipping Inquiry, no changes: {}".format(
                    self.SCRAPED_COUNTER,
                    self.TOTAL_COUNTER,
                    title)))
            return

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.debug(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # Try/catch in case person does not exist in the database.
        # BUGFIX: the bare `except:` clauses also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to `except Exception`.
        try:
            for sender_object in INQUIRY.SENDER.xt(response):
                sender_objects.append(Person.objects.get(
                    parl_id=sender_object))
        except Exception:
            log.warning(red(u'Sender "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.SENDER.xt(response), parl_id, LLP)))
            return
        try:
            receiver_object = Person.objects.get(
                parl_id=INQUIRY.RECEIVER.xt(response))
        except Exception:
            log.warning(red(u'Receiver "{}" was not found in database, skipping Inquiry {} in LLP {}'.format(
                INQUIRY.RECEIVER.xt(response), parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )

        # Newly scraped inquiries start in the 'offen' (open) status.
        if inquiry_created:
            inquiry_item.status = 'offen'

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accomodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item

                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"[{} of {}] Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated Inquiry {} with ID {}, LLP {} @ {}"

        # Six placeholders, six arguments (a stray seventh, silently ignored
        # keywords argument was removed).
        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        # Promote every 1000th item to INFO so long runs show progress.
        log.msg(logtext, level=log.DEBUG if not self.SCRAPED_COUNTER%1000==0 else log.INFO)

        # log.msg(green("Open Callback requests: {}".format(
        #   len(callback_requests))), level=log.INFO)

        return callback_requests
Example #27
0
    def parse(self, response):
        """Parse one petition detail page.

        Creates/updates the Petition plus its Category, Keywords, Documents,
        creators, steps and opinions, and returns follow-up Requests for
        opinions and online signature pages.
        """
        # Extract fields
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        ts = GENERIC.TIMESTAMP.xt(response)

        if not (u'BI' in parl_id or u'PET' in parl_id):
            # VBG have their parl_id only in the url
            parl_id = response.url.split('/')[-2]

        status = LAW.STATUS.xt(response)

        # Bundesrat petitions carry no legislative period.
        raw_llp = response.url.split('/')[-4]
        if raw_llp != u'BR':
            LLP = LegislativePeriod.objects.get(roman_numeral=raw_llp)
        else:
            LLP = None

        # Skip unchanged petitions unless timestamps are explicitly ignored.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(
                parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"Skipping Petition, no changes: {}".format(title)))
            return

        # save ids and stuff for internals
        if LLP not in self.idlist:
            self.idlist[LLP] = {}
        self.idlist[LLP][response.url] = [parl_id, LLP]

        # Extract foreign keys
        category = self.parse_category(response)
        description = LAW.DESCRIPTION.xt(response)

        signing_url, signable = PETITION.SIGNING.xt(response)

        signature_count = PETITION.SIGNATURE_COUNT.xt(response)

        # Parse reference
        reference = self.parse_reference(response)

        # Log our progress
        logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
            red(title), magenta(u"[{}]".format(parl_id)), green(str(LLP)),
            blue(response.url))
        log.msg(logtext, level=log.INFO)

        # Create and save Petition
        petition_item, petition_item_created = Petition.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'status': status,
                'source_link': response.url,
                'description': description,
                'signable': signable,
                'signing_url': signing_url,
                'signature_count': signature_count,
                'reference': reference,
                'ts': ts
            })

        # NOTE(review): redundant — update_or_create already persisted the
        # row, and the item is saved again unconditionally below.
        if not petition_item_created:
            petition_item.save()

        # Attach foreign keys
        petition_item.keywords = self.parse_keywords(response)
        petition_item.category = category
        petition_item.documents = self.parse_docs(response)

        petition_item.save()

        # Parse creators
        petition_creators = self.parse_creators(response)

        for petition_creator in petition_creators:
            petition_creator.created_petitions.add(petition_item)

        callback_requests = []

        # is the tab 'Parlamentarisches Verfahren available?'
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            response.meta['petition_item'] = petition_item
            self.parse_parliament_steps(response)

        # Parse opinions
        opinions = PETITION.OPINIONS.xt(response)

        if opinions:
            for op in opinions:
                if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                    continue
                post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                          callback=self.parse_opinion,
                                          dont_filter=True)
                post_req.meta['petition_item'] = petition_item
                post_req.meta['op_data'] = op

                callback_requests.append(post_req)

        # Only BI or PET (but not PET-BR) have online signatures.
        # FIX: made the operator precedence explicit — the original relied on
        # `and` binding tighter than `or`; the parentheses below reproduce the
        # actual (and intended) evaluation, and `not in` replaces `not x in y`.
        if u'BI' in parl_id or (u'PET' in parl_id and u'PET-BR' not in parl_id):
            signatures_base_url = '{}/PAKT/VHG/{}/{}/{}/filter.psp?xdocumentUri=/PAKT/VHG/{}/{}/{}/'\
                'index.shtml&GP_CODE={}&ITYP={}&INR={}&FBEZ=BI_001&R_1000=ALLE&STEP=&pageNumber='

            # parl_id looks like "(NNN/TYP)"; strip the parens and split.
            raw_parl_id = petition_item.parl_id[1:-1].split('/')
            petition_type = raw_parl_id[1]
            petition_number = int(raw_parl_id[0])
            url_parl_id = '{}_{}'.format(petition_type, petition_number)

            signatures_url = signatures_base_url.format(
                BASE_HOST, LLP.roman_numeral, petition_type, url_parl_id,
                LLP.roman_numeral, petition_type, url_parl_id,
                LLP.roman_numeral, petition_type, petition_number)

            post_req = scrapy.Request(signatures_url,
                                      callback=self.parse_signatures,
                                      dont_filter=True)

            post_req.meta['petition_item'] = petition_item

            callback_requests.append(post_req)

        log.msg(green("Open Callback requests: {}".format(
            len(callback_requests))),
                level=log.INFO)

        return callback_requests
Example #28
0
    def parse(self, response):
        """Parse one inquiry detail page (older variant without progress
        counters or the IGNORE_TIMESTAMP override).

        Creates/updates the Inquiry plus its Category, Keywords, Documents,
        sender/receiver Persons and steps, and returns follow-up Requests
        for the written inquiry response, if one exists.
        """
        source_link = response.url
        category = INQUIRY.CATEGORY.xt(response)
        # parl_id is the second-to-last URL path component.
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.DESCRIPTION.xt(response)
        sender_objects = []
        callback_requests = []
        ts = GENERIC.TIMESTAMP.xt(response)

        # Inquiries from Bundesrat don't have an LLP => set None
        if "BR" in category:
            LLP = None
        else:
            LLP = LegislativePeriod.objects.get(
                roman_numeral=response.url.split('/')[-4])
        if not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"Skipping Inquiry, no changes: {}".format(
                    title)))
            return

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # An inquiry can have multiple senders, but only a single recipient.
        # Try/catch in case person does not exist in the database.
        # BUGFIX: the bare `except:` clauses also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to `except Exception`.
        try:
            for sender_object in INQUIRY.SENDER.xt(response):
                sender_objects.append(Person.objects.get(
                    parl_id=sender_object))
        except Exception:
            log.msg(red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'.format(
                parl_id, LLP)))
            return
        try:
            receiver_object = Person.objects.get(
                parl_id=INQUIRY.RECEIVER.xt(response))
        except Exception:
            log.msg(red(u'Receiver was not found in database, skipping Inquiry {} in LLP {}'.format(
                parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiry_item, inquiry_created = Inquiry.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'receiver': receiver_object,
                'ts': ts
            }
        )

        # Attach foreign keys
        inquiry_item.keywords = self.parse_keywords(response)
        inquiry_item.documents = self.parse_docs(response)
        inquiry_item.category = cat
        inquiry_item.sender = sender_objects

        response.meta['inquiry_item'] = inquiry_item

        # Dringliche / Urgent inquiries have a different structure for steps
        # and history. This case distinction accomodates these different
        # structures.
        if any("Dringliche" in '{}'.format(s) for s in inquiry_item.keywords.all()):
            if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
                self.parse_parliament_steps(response)
        else:
            response_link = self.parse_steps(response)
            if response_link:
                post_req = scrapy.Request("{}{}".format(BASE_HOST, response_link),
                                          callback=self.parse_inquiry_response,
                                          dont_filter=True)
                post_req.meta['inquiry_item'] = inquiry_item

                callback_requests.append(post_req)

        # Save Inquiry item and log to terminal if created or updated.
        inquiry_item.save()

        if inquiry_created:
            logtext = u"Created Inquiry {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"Updated Inquiry {} with ID {}, LLP {} @ {}"

        # Four placeholders, four arguments (a stray fifth, silently ignored
        # keywords argument was removed).
        logtext = logtext.format(
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(str(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        log.msg(green("Open Callback requests: {}".format(
            len(callback_requests))), level=log.INFO)

        return callback_requests
Example #29
0
def f_verbose(value):
    if args.verbose:
        print value.replace("[X]", red("[X]")).replace("[*]", green("[*]")).replace("[!]", magenta("[!]"))\
            .replace("safe", blue("safe"))
    return
Example #30
0
def handle_request(name_o, pass_o, verbose):
    """Try one username/password pair against an Oracle EBS login endpoint.

    Handles two flows: the legacy redirect-based login form (scraped with
    BeautifulSoup and re-submitted) and the direct login POST. Prints a
    colorized found/not-found line and increments the global ``count`` on
    every successful login.

    Args:
        name_o: username candidate.
        pass_o: password candidate.
        verbose: when truthy, also report unsuccessful attempts.
    """
    # Start each attempt with a clean cookie header.
    headers_['Cookie'] = ""
    global count
    # Request 1: initial fetch to pick up the session cookie / redirect.
    request_1 = send_request(base_url, headers_, False)

    if 'Set-Cookie' in request_1.headers:
        # Keep only the first "name=value;" piece of the Set-Cookie header.
        headers_['Cookie'] = str(
            request_1.headers['Set-Cookie']
        )[:str(request_1.headers['Set-Cookie']).index(";") + 1]

    if 'Location' in request_1.headers:  # check redirect to old auth version
        url_2 = request_1.headers['Location']
        # Request 2: follow the redirect to the legacy login form.
        request_2 = send_request(url_2, headers_, False)
        soup = BeautifulSoup(request_2.text, "lxml")
        # Hidden form fields that must be echoed back on submit.
        items = ["_FORM", "SubmitButton", "FORM_MAC_LIST"]
        url_3 = (str(soup.form['action']))
        url_3 = base_url[:base_url.index("/OA_HTML")] + url_3
        forms = [str(soup.find(id=obj)) for obj in items]

        for index, item in enumerate(forms):  # stripping values
            if "value=" in item:
                # Extract the content of the quoted value="..." attribute.
                forms[index] = item[
                    item.index("value=") + len("value=") +
                    1:item.index("\"",
                                 item.index("value=") + len("value=") + 1)]
            elif "_FORM_SUBMIT_BUTTON" in item:
                # The submit-button value lives inside an inline JS snippet.
                forms[index] = item[item.index("_FORM_SUBMIT_BUTTON") +
                                    len("_FORM_SUBMIT_BUTTON':") +
                                    1:item.rindex("'});")]
            else:
                forms[index] = ""

        # Build the POST payload from the scraped hidden fields plus creds.
        dictionary = dict(zip(items, forms))
        dictionary['usernameField'] = name_o
        dictionary['passwordField'] = pass_o
        dictionary['_FORM_SUBMIT_BUTTON'] = dictionary.pop('SubmitButton')
        headers_[
            'Cookie'] = headers_['Cookie'] + request_2.headers['Set-Cookie']
        # Request 3: submit the login form.
        request_3 = send_request(url_3, headers_, False, dictionary)
        check = request_3.headers['Location'] if request_3.headers[
            'Location'] else ""

        # A redirect carrying FND_APPL_LOGIN_FAIL means the login failed.
        if "errCode=FND_APPL_LOGIN_FAIL" in check:
            if verbose:
                print green('[*]') + "\tStatus: " + green(
                    'not found') + " : " + blue(name_o) + " : " + blue(pass_o)
        else:
            print red('[x]') + "\tStatus: " + red('found') + " : " + blue(
                name_o) + " : " + blue(pass_o)
            count += 1
    else:
        # No redirect: the endpoint accepts the credentials directly.
        request_2 = send_request(base_url,
                                 headers_,
                                 False,
                                 data={
                                     'username': name_o,
                                     'password': pass_o,
                                     '_lAccessibility': 'N',
                                     'langCode': 'US'
                                 })
        soup = BeautifulSoup(request_2.text, "lxml")
        result = str(soup.p)
        # Pull the value between "status:" and the following comma.
        result = result[result.index("status:") + len("status:"):result.index(",", result.index("status:"))]\
            .strip(' ').strip('\'')

        if result == 'success':
            print red('[x]') + "\tStatus:" + red('found') + " : " + blue(
                name_o) + " : " + blue(pass_o)
            count += 1
        elif verbose:
            print green('[*]') + "\tStatus: " + green(
                'not found') + " : " + blue(name_o) + " : " + blue(pass_o)

    return
Example #31
0
    def parse(self, response):
        """Scrape a single Petition page into the database.

        Creates/updates the Petition and its related objects (category,
        keywords, documents, creators) and returns a list of follow-up
        scrapy Requests for opinion and signature pages.
        """
        # Extract fields
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        ts = GENERIC.TIMESTAMP.xt(response)

        if not (u'BI' in parl_id or u'PET' in parl_id):
            # VBG have their parl_id only in the url
            parl_id = response.url.split('/')[-2]

        status = LAW.STATUS.xt(response)

        # The legislative period is encoded in the URL; 'BR' pages have none.
        raw_llp = response.url.split('/')[-4]
        if raw_llp != u'BR':
            LLP = LegislativePeriod.objects.get(
                roman_numeral=raw_llp)
        else:
            LLP = None

        # Bail out early when the page timestamp shows no changes.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"Skipping Petition, no changes: {}".format(
                    title)))
            return

        # save ids and stuff for internals
        if LLP not in self.idlist:
            self.idlist[LLP] = {}
        self.idlist[LLP][response.url] = [parl_id, LLP]

        # Extract foreign keys
        category = self.parse_category(response)
        description = LAW.DESCRIPTION.xt(response)

        signing_url, signable = PETITION.SIGNING.xt(response)

        signature_count = PETITION.SIGNATURE_COUNT.xt(response)

        # Parse reference
        reference = self.parse_reference(response)

        # Log our progress
        logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
            red(title),
            magenta(u"[{}]".format(parl_id)),
            green(str(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        # Create and save Petition
        petition_item, petition_item_created = Petition.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'status': status,
                'source_link': response.url,
                'description': description,
                'signable': signable,
                'signing_url': signing_url,
                'signature_count': signature_count,
                'reference': reference,
                'ts': ts
            }
        )

        if not petition_item_created:
            petition_item.save()

        # Attach foreign keys
        petition_item.keywords = self.parse_keywords(response)
        petition_item.category = category
        petition_item.documents = self.parse_docs(response)

        petition_item.save()

        # Parse creators
        petition_creators = self.parse_creators(response)

        for petition_creator in petition_creators:
            petition_creator.created_petitions.add(petition_item)

        callback_requests = []

        # is the tab 'Parlamentarisches Verfahren available?'
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            response.meta['petition_item'] = petition_item
            self.parse_parliament_steps(response)

        # Parse opinions
        opinions = PETITION.OPINIONS.xt(response)

        if opinions:
            for op in opinions:
                # Skip opinions that were already scraped earlier.
                if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                    continue
                post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                          callback=self.parse_opinion,
                                          dont_filter=True)
                post_req.meta['petition_item'] = petition_item
                post_req.meta['op_data'] = op

                callback_requests.append(post_req)

        # Only BI or PET (but not PET-BR) have online signatures
        # NOTE(review): operator precedence makes this
        # `BI or (PET and not PET-BR)` — confirm that is intended.
        if u'BI' in parl_id or u'PET' in parl_id and not u'PET-BR' in parl_id:
            signatures_base_url = '{}/PAKT/VHG/{}/{}/{}/filter.psp?xdocumentUri=/PAKT/VHG/{}/{}/{}/'\
                'index.shtml&GP_CODE={}&ITYP={}&INR={}&FBEZ=BI_001&R_1000=ALLE&STEP=&pageNumber='

            # parl_id has the shape "(NNN/TYP)"; split into type and number.
            raw_parl_id = petition_item.parl_id[1:-1].split('/')
            petition_type = raw_parl_id[1]
            petition_number = int(raw_parl_id[0])
            url_parl_id = '{}_{}'.format(petition_type, petition_number)

            signatures_url = signatures_base_url.format(BASE_HOST, LLP.roman_numeral, petition_type, url_parl_id,
                                                        LLP.roman_numeral, petition_type, url_parl_id,
                                                        LLP.roman_numeral, petition_type, petition_number)

            post_req = scrapy.Request(signatures_url,
                                      callback=self.parse_signatures,
                                      dont_filter=True)

            post_req.meta['petition_item'] = petition_item

            callback_requests.append(post_req)

        log.msg(green("Open Callback requests: {}".format(
            len(callback_requests))), level=log.INFO)

        return callback_requests
Example #32
0
    def parse(self, response):
        """Scrape a Comittee page: the comittee itself, its meetings,
        agenda topics and associated laws."""
        # Parse
        parl_id = COMITTEE.url_to_parlid(response.url)[1]
        ts = GENERIC.TIMESTAMP.xt(response)
        LLP = COMITTEE.LLP.xt(response)
        name = COMITTEE.NAME.xt(response)

        # A legislative period implies Nationalrat; otherwise Bundesrat.
        if LLP is not None:
            nrbr = 'Nationalrat'
            legislative_period = LegislativePeriod.objects.get(
                roman_numeral=LLP)
            # NR comittees are always "active", only BR comittees are either active or inactive
            active = True
        else:
            nrbr = 'Bundesrat'
            legislative_period = None
            # BR comittees are active if they are not "aufgelöst"
            active = COMITTEE.ACTIVE.xt(response)

        # main-comittee parl_id starts with the number 1
        # sub-comittees parl_id start  with the number 2
        if not parl_id.startswith(u'(1/'):
            try:
                # Derive the parent main-comittee id from our own id.
                parent_parl_id = u'(1/{}'.format(parl_id.split('/')[1])
                parent_comitee = Comittee.objects.get(
                    parl_id=parent_parl_id, legislative_period=legislative_period)
            except Comittee.DoesNotExist:
                parent_comitee = None
        else:
            parent_comitee = None

        # Bail out early when the page timestamp shows no changes.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, legislative_period, nrbr, response.url, ts):
            self.logger.info(
                green(u"Skipping Comittee, no changes: {}".format(
                    name)))
            return

        # Log our progress
        logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
            red(name),
            magenta(u"[{}]".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        description = COMITTEE.DESCRIPTION.xt(response)

        comittee_data = {
            'description': description,
            'name': name,
            'source_link': response.url,
            'parent_comittee': parent_comitee,
            'ts': ts
        }

        try:
            comittee_item, created_comittee = Comittee.objects.update_or_create(
                parl_id=parl_id,
                legislative_period=legislative_period,
                nrbr=nrbr,
                active=active,
                defaults=comittee_data
            )
        except:
            # Best-effort: log and skip this comittee on any database error.
            log.msg(
                u"Could not update/create Comittee {}".format(name),
                level=log.ERROR)
            return
            # import ipdb
            # ipdb.set_trace()

        meetings = COMITTEE.MEETINGS.xt(response)

        comittee_laws = []

        # Persist each meeting with its agenda document and agenda topics.
        for meeting in meetings:
            agenda_data = meeting['agenda']
            if agenda_data is not None:
                agenda_item, agenda_created = Document.objects.get_or_create(
                    **agenda_data)
            else:
                agenda_item = None

            meeting_data = {
                'agenda': agenda_item
            }

            # Log our progress
            logtext = u"Scraping meeting no. {} of {} on {}".format(
                red(meeting['number']),
                magenta(name),
                green(str(meeting['date'].date())),
            )
            log.msg(logtext, level=log.INFO)

            meeting_item, meeting_created = ComitteeMeeting.objects.update_or_create(
                number=meeting['number'],
                date=meeting['date'],
                comittee=comittee_item,
                defaults=meeting_data
            )

            for topic in meeting['topics']:
                # Topics may reference a law; collect those for the m2m below.
                if topic['law'] is not None:
                    law = topic['law']
                    law_item = self.parse_law(law)
                    if law_item is not None:
                        comittee_laws.append(law_item)
                else:
                    law_item = None

                agenda_topic_data = {
                    'comment': topic['comment'],
                    'law': law_item,
                }

                agenda_topic_item, agenda_topic_created = ComitteeAgendaTopic.objects.update_or_create(
                    number=topic['number'],
                    meeting=meeting_item,
                    text=topic['text'],
                    defaults=agenda_topic_data,
                )

        # parse Verhandlungsgegenstaende and Veroeffentlichungen
        laws_and_reports = COMITTEE.LAWS.xt(response)

        for law in laws_and_reports:
            # Log our progress
            logtext = u"Adding law with id {}, LLP {} to {}".format(
                magenta(u"[{}]".format(law['parl_id'])),
                green(law['llp']),
                blue(name)
            )
            log.msg(logtext, level=log.INFO)

            law_item = self.parse_law(law)
            if law_item is not None:
                comittee_laws.append(law_item)

        comittee_item.laws.add(*comittee_laws)
        comittee_item.save()
Example #33
0
    def parse(self, response):
        """Scrape a pre-law page into the database and return follow-up
        scrapy Requests for any opinions attached to it."""
        # Extract fields
        ts = GENERIC.TIMESTAMP.xt(response)
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        # The legislative period is encoded in the URL.
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

        # Bail out early when the page timestamp shows no changes.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(
                parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"Skipping Law, no changes: {}".format(title)))
            return

        # save ids and stuff for internals
        if LLP not in self.idlist:
            self.idlist[LLP] = {}
        self.idlist[LLP][response.url] = [parl_id, LLP]

        # Extract foreign keys
        category = self.parse_category(response)
        description = PRELAW.DESCRIPTION.xt(response)

        # Log our progress

        logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
            red(title), magenta(u"[{}]".format(parl_id)), green(unicode(LLP)),
            blue(response.url))
        log.msg(logtext, level=log.INFO)

        # Create and save Law
        pre_law_data = {
            'title': title,
            'description': description,
            'source_link': response.url,
            'ts': ts
        }
        law_item, created = Law.objects.get_or_create(parl_id=parl_id,
                                                      legislative_period=LLP,
                                                      defaults=pre_law_data)

        if not created:
            law_item.save()

        # Attach foreign keys
        law_item.keywords = self.parse_keywords(response)
        law_item.category = category
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Parse opinions
        opinions = PRELAW.OPINIONS.xt(response)

        callback_requests = []

        # is the tab 'Parlamentarisches Verfahren available?'
        if opinions:
            skipped_ops = 0
            for op in opinions:
                # Skip opinions that were already scraped earlier.
                if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                    skipped_ops += 1
                    continue
                post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                          callback=self.parse_opinion,
                                          dont_filter=True)
                post_req.meta['law_item'] = law_item
                post_req.meta['op_data'] = op

                callback_requests.append(post_req)

            log.msg(green("Open/Skipped Callback requests: {}/{}".format(
                len(callback_requests), skipped_ops)),
                    level=log.INFO)

        return callback_requests
Example #34
0
    def parse(self, response):
        """Scrape a law page: create/update the Law and dispatch the
        parliamentary / pre-parliamentary step parsers."""
        self.SCRAPED_COUNTER += 1

        # The legislative period is encoded in the URL.
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

        # Extract fields
        ts = GENERIC.TIMESTAMP.xt(response)
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        status = LAW.STATUS.xt(response)

        # Bail out early when the page timestamp shows no changes.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(
                parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"[{} of {}] Skipping Law, no changes: {}".format(
                    self.SCRAPED_COUNTER, self.TOTAL_COUNTER, title)))
            return

        # Extract foreign keys
        category = LAW.CATEGORY.xt(response)
        description = LAW.DESCRIPTION.xt(response)

        # Create category if we don't have it yet
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # Create and save Law
        law_data = {
            'title': title,
            'status': status,
            'description': description,
            'ts': ts,
            'source_link': response.url,
        }
        law_item, law_created = Law.objects.update_or_create(
            parl_id=parl_id, legislative_period=LLP, defaults=law_data)

        # Attach foreign keys
        law_item.keywords = self.parse_keywords(response)
        law_item.category = cat
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Log our progress
        if law_created:
            logtext = u"[{} of {}] Created {} with id {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated {} with id {}, LLP {} @ {}"

        logtext = logtext.format(self.SCRAPED_COUNTER, self.TOTAL_COUNTER,
                                 red(title), cyan(u"[{}]".format(parl_id)),
                                 green(unicode(LLP)), blue(response.url))
        log.msg(logtext, level=log.INFO)

        # Pass the law item along to the step parsers below.
        response.meta['law_item'] = law_item

        # is the tab 'Parlamentarisches Verfahren available?'
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)

        if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
            self.parse_pre_parliament_steps(response)
Example #35
0
    def parse(self, response):
        """Scrape a pre-law page into the database and return follow-up
        scrapy Requests for any opinions attached to it."""
        # Extract fields
        ts = GENERIC.TIMESTAMP.xt(response)
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        # The legislative period is encoded in the URL.
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

        # Bail out early when the page timestamp shows no changes.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"Skipping Law, no changes: {}".format(
                    title)))
            return

        # save ids and stuff for internals
        if LLP not in self.idlist:
            self.idlist[LLP] = {}
        self.idlist[LLP][response.url] = [parl_id, LLP]

        # Extract foreign keys
        category = self.parse_category(response)
        description = PRELAW.DESCRIPTION.xt(response)

        # Log our progress
        logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
            red(title),
            magenta(u"[{}]".format(parl_id)),
            green(str(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        # Create and save Law
        pre_law_data = {
            'title': title,
            'description': description,
            'ts': ts
        }
        law_item, created = Law.objects.get_or_create(
            parl_id=parl_id,
            source_link=response.url,
            legislative_period=LLP,
            defaults=pre_law_data)

        if not created:
            law_item.save()

        # Attach foreign keys
        law_item.keywords = self.parse_keywords(response)
        law_item.category = category
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Parse opinions
        opinions = PRELAW.OPINIONS.xt(response)

        callback_requests = []

        # is the tab 'Parlamentarisches Verfahren available?'
        if opinions:
            skipped_ops = 0
            for op in opinions:
                # Skip opinions that were already scraped earlier.
                if Opinion.objects.filter(parl_id=op['parl_id']).exists():
                    skipped_ops += 1
                    continue
                post_req = scrapy.Request("{}/{}".format(BASE_HOST, op['url']),
                                          callback=self.parse_opinion,
                                          dont_filter=True)
                post_req.meta['law_item'] = law_item
                post_req.meta['op_data'] = op

                callback_requests.append(post_req)

            log.msg(green("Open/Skipped Callback requests: {}/{}".format(
                len(callback_requests), skipped_ops)), level=log.INFO)

        return callback_requests
    def parse(self, response):
        """Scrape a law page: create/update the Law and dispatch the
        parliamentary / pre-parliamentary step parsers."""
        self.SCRAPED_COUNTER += 1

        # The legislative period is encoded in the URL.
        LLP = LegislativePeriod.objects.get(
            roman_numeral=response.url.split('/')[-4])

        # Extract fields
        ts = GENERIC.TIMESTAMP.xt(response)
        title = LAW.TITLE.xt(response)
        parl_id = LAW.PARL_ID.xt(response)
        status = LAW.STATUS.xt(response)

        # Bail out early when the page timestamp shows no changes.
        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, LLP, response.url, ts):
            self.logger.info(
                green(u"[{} of {}] Skipping Law, no changes: {}".format(
                    self.SCRAPED_COUNTER,
                    self.TOTAL_COUNTER,
                    title)))
            return

        # Extract foreign keys
        category = LAW.CATEGORY.xt(response)
        description = LAW.DESCRIPTION.xt(response)

        # Create category if we don't have it yet
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        # Create and save Law
        law_data = {
            'title': title,
            'status': status,
            'description': description,
            'ts': ts,
            'source_link': response.url,
        }
        law_item, law_created = Law.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults=law_data)

        # Attach foreign keys
        law_item.keywords = self.parse_keywords(response)
        law_item.category = cat
        law_item.documents = self.parse_docs(response)

        law_item.save()

        # Log our progress
        if law_created:
            logtext = u"[{} of {}] Created {} with id {}, LLP {} @ {}"
        else:
            logtext = u"[{} of {}] Updated {} with id {}, LLP {} @ {}"

        logtext = logtext.format(
            self.SCRAPED_COUNTER,
            self.TOTAL_COUNTER,
            red(title),
            cyan(u"[{}]".format(parl_id)),
            green(unicode(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        # Pass the law item along to the step parsers below.
        response.meta['law_item'] = law_item

        # is the tab 'Parlamentarisches Verfahren available?'
        if response.xpath('//h2[@id="tab-ParlamentarischesVerfahren"]'):
            self.parse_parliament_steps(response)

        if response.xpath('//h2[@id="tab-VorparlamentarischesVerfahren"]'):
            self.parse_pre_parliament_steps(response)
Example #37
0
    def parse(self, response):
        """Scrape a Comittee page: the comittee itself, its meetings,
        agenda topics and associated laws.

        Bails out early when the page timestamp shows no changes since
        the last scrape.
        """
        # Parse
        parl_id = COMITTEE.url_to_parlid(response.url)[1]
        ts = GENERIC.TIMESTAMP.xt(response)
        llp = COMITTEE.LLP.xt(response)
        name = COMITTEE.NAME.xt(response)

        # A legislative period implies Nationalrat; otherwise Bundesrat.
        if llp is not None:
            nrbr = 'Nationalrat'
            legislative_period = LegislativePeriod.objects.get(
                roman_numeral=llp)
            # NR comittees are "active" if they are in the current LLP
            active = (
                legislative_period == LegislativePeriod.objects.get_current())
        else:
            nrbr = 'Bundesrat'
            legislative_period = None
            # BR comittees are active if they are not "aufgelöst"
            active = COMITTEE.ACTIVE.xt(response)

        # main-comittee parl_id starts with the number 1
        # sub-comittees parl_id start  with the number 2
        if not parl_id.startswith(u'(1/'):
            try:
                # Derive the parent main-comittee id from our own id.
                parent_parl_id = u'(1/{}'.format(parl_id.split('/')[1])
                parent_comitee = Comittee.objects.get(
                    parl_id=parent_parl_id, legislative_period=legislative_period)
            except Comittee.DoesNotExist:
                parent_comitee = None
        else:
            parent_comitee = None

        if not self.IGNORE_TIMESTAMP and not self.has_changes(parl_id, legislative_period, nrbr, response.url, ts):
            self.logger.info(
                green(u"Skipping Comittee, no changes: {}".format(
                    name)))
            return

        # Log our progress
        # BUGFIX: this previously formatted `unicode(LLP)` -- `LLP` is not
        # defined in this method (the local is `llp`), which raised a
        # NameError at runtime.
        logtext = u"Scraping {} with id {}, LLP {} @ {}".format(
            red(name),
            magenta(u"[{}]".format(parl_id)),
            green(unicode(llp)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        description = COMITTEE.DESCRIPTION.xt(response)

        comittee_data = {
            'description': description,
            'name': name,
            'source_link': response.url,
            'parent_comittee': parent_comitee,
            'active': active,
            'ts': ts
        }

        try:
            comittee_item, created_comittee = Comittee.objects.update_or_create(
                parl_id=parl_id,
                legislative_period=legislative_period,
                nrbr=nrbr,
                defaults=comittee_data
            )
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; still best-effort: log and skip.
            log.msg(
                u"Could not update/create Comittee {}".format(name),
                level=log.ERROR)
            return

        meetings = COMITTEE.MEETINGS.xt(response)

        comittee_laws = []

        # Persist each meeting with its agenda document and agenda topics.
        for meeting in meetings:
            agenda_data = meeting['agenda']
            if agenda_data is not None:
                agenda_item, agenda_created = Document.objects.get_or_create(
                    **agenda_data)
            else:
                agenda_item = None

            meeting_data = {
                'agenda': agenda_item
            }

            # Log our progress
            logtext = u"Scraping meeting no. {} of {} on {}".format(
                red(meeting['number']),
                magenta(name),
                green(str(meeting['date'].date())),
            )
            log.msg(logtext, level=log.INFO)

            meeting_item, meeting_created = ComitteeMeeting.objects.update_or_create(
                number=meeting['number'],
                date=meeting['date'],
                comittee=comittee_item,
                defaults=meeting_data
            )

            for topic in meeting['topics']:
                # Topics may reference a law; collect those for the m2m below.
                if topic['law'] is not None:
                    law = topic['law']
                    law_item = self.parse_law(law)
                    if law_item is not None:
                        comittee_laws.append(law_item)
                else:
                    law_item = None

                agenda_topic_data = {
                    'comment': topic['comment'],
                    'law': law_item,
                }

                agenda_topic_item, agenda_topic_created = ComitteeAgendaTopic.objects.update_or_create(
                    number=topic['number'],
                    meeting=meeting_item,
                    text=topic['text'],
                    defaults=agenda_topic_data,
                )

        # parse Verhandlungsgegenstaende and Veroeffentlichungen
        laws_and_reports = COMITTEE.LAWS.xt(response)

        for law in laws_and_reports:
            # Log our progress
            logtext = u"Adding law with id {}, LLP {} to {}".format(
                magenta(u"[{}]".format(law['parl_id'])),
                green(law['llp']),
                blue(name)
            )
            log.msg(logtext, level=log.INFO)

            law_item = self.parse_law(law)
            if law_item is not None:
                comittee_laws.append(law_item)

        comittee_item.laws.add(*comittee_laws)
        comittee_item.save()
Example #38
0
def download_all(files):
    """Fetch every listed Natural Earth 10m-cultural file via download_file."""
    from .Download import download_file
    prefix = "http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/"
    print(blue("Downloading Natural Earth files...", bold=True))
    for filename in files:
        download_file(filename, prefix + filename)
Example #39
0
    def parse_inquiry_response(self, response):
        """
        Callback function for parsing the inquiry responses

        Creates or updates the InquiryResponse for this page, attaches its
        documents and category, and links it to the originating Inquiry
        (passed along in ``response.meta['inquiry_item']``).
        """
        inquiry_item = response.meta['inquiry_item']
        source_link = response.url
        parl_id = response.url.split('/')[-2]
        title = INQUIRY.TITLE.xt(response)
        description = INQUIRY.RESPONSEDESCRIPTION.xt(response)
        LLP = inquiry_item.legislative_period
        category = INQUIRY.CATEGORY.xt(response)

        # Get or create Category object for the inquiry and log to screen if new
        # category is created.
        cat, created = Category.objects.get_or_create(title=category)
        if created:
            log.msg(u"Created category {}".format(
                green(u'[{}]'.format(category))))

        try:
            sender_object = Person.objects.get(
                parl_id=INQUIRY.RESPONSESENDER.xt(response))
        except Exception:
            # BUGFIX: the lookup above is for the response's *sender*
            # (RESPONSESENDER), but the message said "Receiver". Also
            # narrowed the bare `except:` to `except Exception:`.
            log.msg(red(u'Sender was not found in database, skipping Inquiry {} in LLP {}'.format(
                parl_id, LLP)))
            return

        # Create or update Inquiry item
        inquiryresponse_item, inquiryresponse_created = InquiryResponse.objects.update_or_create(
            parl_id=parl_id,
            legislative_period=LLP,
            defaults={
                'title': title,
                'source_link': source_link,
                'description': description,
                'sender': sender_object
            }
        )

        # Attach foreign Keys
        inquiryresponse_item.documents = self.parse_docs(response)
        inquiryresponse_item.category = cat

        # Save InquiryResponse object
        inquiryresponse_item.save()

        if inquiryresponse_created:
            logtext = u"Created InquiryResponse {} with ID {}, LLP {} @ {}"
        else:
            logtext = u"Updated InquiryResponse {} with ID {}, LLP {} @ {}"

        logtext = logtext.format(
            cyan(title),
            cyan(u"{}".format(parl_id)),
            green(str(LLP)),
            blue(response.url)
        )
        log.msg(logtext, level=log.INFO)

        # Link the response back to its inquiry and persist.
        inquiry_item.response = inquiryresponse_item
        inquiry_item.save()

        return