def fetch_list(klass, cache=False, sync=False):
    """Scrape the two annual-report index pages (last reports: yes/no) and store each entry.

    For every report row the FR and NL pages are walked in lockstep so each
    AnnualReport gets both translations of its title and legal basis.
    """
    urls = (
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=y',
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=n',
    )
    for page_index, url in enumerate(urls):
        soup, suppe = scraper.get_with_nl(url, "annual repports %i" % page_index)
        # the listing table packs each report into groups of 5 <tr>; only the first row of each group matters
        rows_fr = soup.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5]
        rows_nl = suppe.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5]
        for row_fr, row_nl in zip(rows_fr, rows_nl):
            cells_fr = row_fr('td')
            cells_nl = row_nl('td')
            get_or_create(AnnualReport,
                          title={"fr": cells_fr[2].text, "nl": cells_nl[2].text},
                          date=cells_fr[0].text,
                          law_and_article={"fr": cells_fr[4].text, "nl": cells_nl[4].text},
                          # keep only the digits of the periodicity column
                          periodicity=re.sub("[^0-9]", "", cells_fr[5].text),
                          pdf_url=cells_fr[1].a["href"] if cells_fr[1].a else "",
                          )
def fetch_list(klass, cache=False, sync=False):
    """Scrape the full document index, upsert Document stubs, then parse every unfinished one.

    First pass: walk every alphabetical sub-page of the document list and
    create/update a Document per row (FR + NL titles). Second pass: fetch the
    detail page of every document not yet marked done.
    """
    index = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")
    for document_page in index('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        # renamed from soup/suppe to avoid shadowing the page soups inside the loop
        for row_fr, row_nl in zip(soup.table('tr'), suppe.table('tr')):
            get_or_create(Document, _id="lachambre_id",
                          title={"fr": row_fr('div')[1].text, "nl": row_nl('div')[1].text},
                          lachambre_id=row_fr.div.text,
                          url=row_fr.a["href"])
    # list() otherwise mongodb will timeout if we stay in a query mode
    for document in list(Document.objects.filter(done=False)):
        # NOTE(review): magic skip — presumably a known-broken document on the site; confirm
        if document.lachambre_id == 25:
            continue
        try:
            klass.fetch_one(document)
        except Exception as e:  # py3-compatible form; works on python >= 2.6 too
            traceback.print_exc(file=sys.stdout)
            # BUGFIX: message said "didn't succed" and substituted the exception
            # where the text announced the document id
            logger.error("/!\\ %s didn't succeed! Error while reparsing document: %s" % (document.lachambre_id, e))
def fetch_one(klass, soup, legislation, cache=False, sync=False):
    """Store one written-question bulletin row; the column layout differs for legislation 54."""
    cells = soup('td')
    # the bulletin number is the last word of the last link in the first cell
    bulletin_id = cells[0]('a')[-1].text.split()[-1]
    if legislation == 54:
        # NOTE(review): the branch tests 54 but stores legislature "53" — looks odd, confirm
        get_or_create(WrittenQuestionBulletin,
                      legislature="53",
                      lachambre_id=bulletin_id,
                      date=cells[2].text,
                      publication_date=cells[3].text,
                      url=cells[1].a["href"],
                      pdf_url=cells[0].a["href"],
                      )
    else:
        get_or_create(WrittenQuestionBulletin,
                      legislature=str(legislation),
                      lachambre_id=bulletin_id,
                      publication_date=cells[2].text,
                      url=cells[1].a["href"] if cells[1].a else None,
                      pdf_url=cells[0].a["href"],
                      )
    logger.debug("%s" % bulletin_id)
def _get_competences(dico, dico_nl, document):
    """Fill document.timeline from the FR/NL competence sections and attach the analysis link.

    Resolves the old "FIXME: meh, DRY": the three near-identical branches
    (both languages / FR only / NL only) now share one splitting helper and one
    append loop. The timeline is only reset when at least one section exists,
    exactly as before.
    """
    def _entries(sub_dico, key):
        # each entry is "date \xa0 title"; contents[::2] skips the <br/> separators
        return [clean_text(x).split(u" \xa0 ", 1) for x in sub_dico[key]["head"].contents[::2]]

    has_fr = bool(dico.get(u"Compétence"))
    has_nl = bool(dico_nl.get(u"Bevoegdheid"))
    if has_fr and has_nl:
        rows = [(_date, _title, _title_nl)
                for (_date, _title), (_, _title_nl)
                in zip(_entries(dico, u"Compétence"), _entries(dico_nl, u"Bevoegdheid"))]
    elif has_fr:
        rows = [(_date, _title, "") for (_date, _title) in _entries(dico, u"Compétence")]
    elif has_nl:
        rows = [(_date, "", _title_nl) for (_date, _title_nl) in _entries(dico_nl, u"Bevoegdheid")]
    else:
        rows = None  # neither section: leave document.timeline untouched (original behavior)

    if rows is not None:
        document.timeline = []
        for _date, _title, _title_nl in rows:
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))

    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id",
                                          lachambre_id=dico["Analyse des interventions"]["head"].a.text,
                                          url=dico["Analyse des interventions"]["head"].a["href"])
def fetch_one(klass, commission, cache=False, sync=False):
    """Scrape one commission page (FR + NL): full name, memberships, and seat counts per role.

    BUGFIX: the NL seat counts were computed from the French paragraph (`i`)
    instead of the Dutch one (`j`), so seats["nl"] duplicated the FR data under
    Dutch role names. They are now derived from the NL paragraph.
    """
    soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + commission.url, "commission %s" % commission.lachambre_id)
    commission.full_name["fr"] = soup.h1.text
    commission.full_name["nl"] = suppe.h1.text
    commission.deputies = []
    seats = {"fr": {}, "nl": {}}

    def _seat_counts(paragraph):
        # pair each party name (<b> text minus trailing ':') with the number of
        # comma-separated deputy names that follow it before the next <br />
        parties = [b.text[:-1] for b in paragraph('b')[1:]]
        name_groups = str(paragraph).split("<br />")[1:]
        return [(party, len(names.split(','))) for party, names in zip(parties, name_groups)]

    for par_fr, par_nl in zip(soup('p')[2:], suppe('p')[2:]):
        role = par_fr.b.text[:-1]
        role_nl = par_nl.b.text[:-1]
        for dep in par_fr('a'):
            # lachambre ids sometimes contain the letter O instead of the digit 0
            deputy = Deputy.objects.get(lachambre_id=re.search("key=([O0-9]+)", dep["href"]).groups()[0])
            membership = get_or_create(CommissionMembership, deputy=deputy, commission=commission)
            membership.role = role
            membership.save()
            commission.deputies.append(membership.id)
        seats["fr"][role] = _seat_counts(par_fr)
        seats["nl"][role_nl] = _seat_counts(par_nl)  # was par_fr — see docstring
    commission.seats = seats
    commission.save()
def fetch_list(klass, cache=False, sync=False):
    """Walk the commissions index (FR + NL), upsert each Commission, then fetch each in detail.

    <h4> nodes open a new commission-type section; <a> nodes are commissions of
    the current section.

    BUGFIX: `_type_nl` was only assigned once the first <h4> had been seen, so a
    commission link appearing before any header raised NameError. Both section
    names are now initialized to "" (matching the FR side's original default).
    """
    soup, suppe = scraper.get_with_nl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/comm/commissions&language=fr&cfm=/site/wwwcfm/comm/LstCom.cfm&rightmenu=right_cricra", "commissions list")
    _type = ""
    _type_nl = ""
    for node_fr, node_nl in zip(soup("div", id="story")[1], suppe("div", id="story")[1]):
        if isinstance(node_fr, NavigableString):
            continue
        if node_fr.h4:
            # a header starts a new group of commissions of the same type
            _type = node_fr.h4.text
            _type_nl = node_nl.h4.text
        elif node_fr.a:
            commission = get_or_create(Commission, lachambre_id=int(re.search("com=(\d+)", node_fr.a["href"]).groups()[0]))
            commission.type["fr"] = _type
            commission.type["nl"] = _type_nl
            commission.name["fr"] = node_fr.a.text
            commission.name["nl"] = node_nl.a.text
            commission.url = node_fr.a["href"]
            commission.save()
    # list() so mongodb doesn't keep a long-lived cursor open while we scrape
    for com in list(Commission.objects.all()):
        klass.fetch_one(com, cache=cache, sync=sync)
def fetch_list(klass, cache=False, sync=False):
    """Scrape the deputies list, upsert Deputy records, then fetch each one (async unless sync).

    BUGFIX: the async branch never collected its Celery results — `tasks` was
    never appended to, and `map(lambda x: x.get, tasks)` referenced `.get`
    without calling it — so task failures went unnoticed. Delayed tasks are now
    collected and awaited. Also, a missing personal website no longer appends
    None to deputy.websites.
    """
    soup = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", cache=cache, sync=sync)
    for row in soup.table('tr'):
        items = row('td')
        url = items[0].a['href']
        # lachambre ids sometimes contain the letter O instead of the digit 0
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]
        existing = Deputy.objects.filter(lachambre_id=lachambre_id)
        full_name = re.sub(" +", " ", items[0].a.text.strip())
        if not existing:
            logger.info("[NEW] deputy: %s" % full_name)
        deputy = existing[0] if existing else Deputy(lachambre_id=lachambre_id)
        if items[1].a.text.strip():
            deputy.party = get_or_create(Party, name=items[1].a.text.strip(), url=dict(items[1].a.attrs)['href'])
        email = items[2].a.text
        website = items[3].a['href'] if items[3].a else None
        if email not in deputy.emails:
            deputy.emails.append(email)
        if website and website not in deputy.websites:  # don't record a missing website as None
            deputy.websites.append(website)
        deputy.full_name = full_name
        deputy.url = url
        deputy.save()

    tasks = []
    for index, deputy in enumerate(list(Deputy.objects.all())):
        logger.debug("%s %s" % (index, deputy.full_name))
        if sync:
            klass.fetch_one(deputy, cache=cache, sync=sync)
        else:
            tasks.append(klass.fetch_one.delay(deputy, cache=cache, sync=sync))
    # wait for every async task so errors surface here rather than being dropped
    for task in tasks:
        task.get()