Esempio n. 1
0
    def save_item(item: Item, spider):
        """Store a scraped ``MessageItem`` as a ``Message`` row when it qualifies.

        A message is added to the session only if no row among the most recent
        messages carries the same text and image, and if its tag-stripped text
        is within the length limit. Items of any other type are ignored.

        :param item: the scraped item.
        :param spider: the spider that produced the item (not used here).
        """
        if isinstance(item, MessageItem):

            def message_is_unique(message_model: Message, limit=20) -> bool:
                """Return True when none of the newest ``limit`` rows match."""
                # Newest rows first; the id breaks ties between equal timestamps.
                newest_first = session.query(Message).order_by(
                    Message.created_at.desc(), Message.id.desc())
                recent = aliased(Message, newest_first.limit(limit).subquery())

                duplicate_exists = session.query(recent).filter(
                    recent.text == message_model.text,
                    recent.image == message_model.image,
                ).exists()
                return not session.query(duplicate_exists).scalar()

            def message_fit_the_length(message_model: Message) -> bool:
                """Return True when the tag-stripped text is short enough."""
                # Messages that carry an image get the tighter limit.
                # NOTE(review): 1024/4096 look like caption/message size caps
                # of the target platform - confirm.
                max_length = 1024 if message_model.image else 4096
                return len(remove_tags(message_model.text)) <= max_length

            message = Message(
                text=item.get('text'),
                image=item.get('image'),
                url=item.get('url'),
            )

            # Uniqueness is checked first; the length check only runs for
            # messages that passed it.
            should_store = (message_is_unique(message)
                            and message_fit_the_length(message))
            if should_store:
                with session.begin():
                    session.add(message)
Esempio n. 2
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     All fields are read with ``item.get``. The id is the item's
     ``Dienststellennummer`` value prefixed with ``SH-``.
     """
     school_id = f"SH-{item.get('Dienststellennummer')}"
     return School(
         name=item.get('name'),
         id=school_id,
         address=item.get('Strasse'),
         zip=item.get("Postleitzahl"),
         city=item.get("Ort"),
         email=item.get('E-Mail'),
         school_type=item.get('Schularten'),
         fax=item.get('Fax'),
         phone=item.get('Telefon'),
         director=item.get('Schulleitung'),
     )
Esempio n. 3
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     The ``Ort`` field is split around its first run of five digits: the
     digits become the zip code and the text that follows becomes the city.

     Raises:
         AttributeError: if the item has no ``Ort`` value.
         IndexError: if ``Ort`` contains no five-digit sequence.
     """
     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     zip_pattern = r'\d{5}'
     # Parse the field once instead of re-reading and re-splitting it for
     # every derived value.
     location = item.get('Ort').strip()
     city = re.split(zip_pattern, location)[1].strip()
     zip_code = re.findall(zip_pattern, location)[0]
     return School(name=item.get('name'),
                   id='RP-{}'.format(item.get('id')),
                   address=item.get('Adresse'),
                   city=city,
                   zip=zip_code,
                   website=item.get('Internet'),
                   email=item.get('E-Mail'),
                   school_type=item.get('Schulform'),
                   fax=item.get('Fax'),
                   phone=item.get('Telefon'))
Esempio n. 4
0
def on_duplicate_sql(*args, item: scrapy.Item):
    """Build the ``ON DUPLICATE KEY UPDATE`` tail of an upsert statement.

    :param args: column names to update; each is also the key looked up in
        ``item``.
    :param item: the scraped item providing the new values.
    :return: ``'  ON DUPLICATE KEY UPDATE col="value", ...'`` (with the two
        leading spaces), or ``''`` when no columns are given or ``item`` is
        not a ``scrapy.Item``.

    Values are embedded as double-quoted literals. Backslashes and double
    quotes inside a value are escaped so that scraped text cannot terminate
    the literal early and alter the statement.
    NOTE(review): the escaping assumes MySQL's default SQL mode; passing the
    values as driver parameters would be the more robust design, but that
    requires changing the callers.
    """
    if not (args and isinstance(item, scrapy.Item)):
        return ''

    def quoted(value) -> str:
        # Double the backslash first so the quote escaping below is not
        # itself re-escaped.
        text = str(value).replace('\\', '\\\\').replace('"', '""')
        return '"{}"'.format(text)

    assignments = ['{}={}'.format(key, quoted(item.get(key))) for key in args]
    return '  ON DUPLICATE KEY UPDATE ' + ', '.join(assignments)
Esempio n. 5
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     The id is derived from the phone number: spaces are replaced by dashes
     and the result is prefixed with ``SL-``.
     """
     phone = item.get('telefon')
     # NOTE(review): a missing 'telefon' value makes ``phone`` None and the
     # ``replace`` call below raise AttributeError - confirm this is intended.
     school_id = 'SL-{}'.format(phone.replace(" ", "-"))
     return School(
         name=item.get('name'),
         phone=phone,
         fax=item.get('telefax'),
         website=item.get('homepage'),
         email=item.get('e-mail'),
         address=item.get('straße'),
         city=item.get('ort'),
         zip=item.get('plz'),
         school_type=item.get('schultyp'),
         director=item.get('schulleitung'),
         id=school_id,
     )
Esempio n. 6
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     Fields are copied over with ``item.get``; the id is the item's ``id``
     value prefixed with ``HE-``.
     """
     fields = {
         'name': item.get('name'),
         'phone': item.get('telefon'),
         'fax': item.get('fax'),
         'website': item.get('homepage'),
         'address': item.get('straße'),
         'city': item.get('ort'),
         'zip': item.get('plz'),
         'school_type': item.get('schultyp'),
         'id': f"HE-{item.get('id')}",
     }
     return School(**fields)
 def normalize(self, item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     Fields are copied over with ``item.get``; the id is the item's ``id``
     value prefixed with ``RP-``. ``Ort`` is passed through unchanged as the
     city.
     """
     school_id = f"RP-{item.get('id')}"
     return School(
         name=item.get('name'),
         id=school_id,
         address=item.get('Adresse'),
         city=item.get('Ort'),
         website=item.get('Internet'),
         email=item.get('E-Mail'),
         school_type=item.get('Schulform'),
         fax=item.get('Fax'),
         phone=item.get('Telefon'),
     )
Esempio n. 8
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     ``Ort`` is split on whitespace: the first token is used as the zip code
     and the remaining tokens, re-joined with single spaces, as the city.
     The e-mail value is passed through
     ``ThueringenSpider._deobfuscate_email``.
     """
     tokens = item.get('Ort').split()
     zip_code = tokens[0]
     city = ' '.join(tokens[1:])
     email = ThueringenSpider._deobfuscate_email(item.get('E-Mail'))
     return School(
         name=item.get('Schulname'),
         id=f"TH-{item.get('Schulnummer')}",
         address=item.get('Straße'),
         zip=zip_code,
         city=city,
         website=item.get('Internet'),
         email=email,
         school_type=item.get('Schulart'),
         provider=item.get('Schulträger'),
         fax=item.get('Telefax'),
         phone=item.get('Telefon'),
     )
Esempio n. 9
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     ``adresse_ort`` is split on whitespace: the first token is used as the
     zip code and the remaining tokens, re-joined with single spaces, as the
     city. ``address2`` is always the empty string.
     """
     tokens = item.get('adresse_ort').split()
     zip_code = tokens[0]
     city_name = ' '.join(tokens[1:])
     return School(
         name=item.get('schulname'),
         id=f"HH-{item.get('schul_id')}",
         address=item.get('adresse_strasse_hausnr'),
         address2='',
         zip=zip_code,
         city=city_name,
         website=item.get('schul_homepage'),
         email=item.get('schul_email'),
         school_type=item.get('schulform'),
         fax=item.get('fax'),
         phone=item.get('schul_telefonnr'),
         director=item.get('name_schulleiter'),
     )
Esempio n. 10
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     The ``city`` field is split on whitespace: its first token is used as
     the zip code and the rest, re-joined with single spaces, as the city
     name. The id is the item's ``number`` value prefixed with ``BY-``.
     """
     zip_code, *city_words = item.get('city').split()
     fields = {
         'name': item.get('name'),
         'phone': item.get('phone'),
         'fax': item.get('fax'),
         'website': item.get('web'),
         'address': item.get('street'),
         'city': ' '.join(city_words),
         'zip': zip_code,
         'school_type': item.get('school_type'),
         'legal_status': item.get('type'),
         'id': f"BY-{item.get('number')}",
     }
     return School(**fields)
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     The ``Adresse`` field is split around its first run of five digits: the
     text before it becomes the street address, the digits the zip code and
     the text after it the city.

     Raises:
         AttributeError: if the item has no ``Adresse`` value.
         IndexError: if ``Adresse`` contains no five-digit sequence.
     """
     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     zip_pattern = r'\d{5}'
     # Parse the field once instead of stripping and splitting it three times.
     address_line = item.get('Adresse').strip()
     parts = re.split(zip_pattern, address_line)
     zip_code = re.findall(zip_pattern, address_line)[0]
     return School(name=item.get('Name'),
                   id='SA-{}'.format(item.get('ID')),
                   address=parts[0].strip(),
                   zip=zip_code,
                   city=parts[1].strip(),
                   website=item.get('Homepage'),
                   email=item.get('E-Mail'),
                   fax=item.get('Telefax'),
                   phone=item.get('Telefon'),
                   )
 def normalize(self, item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     ``Ort`` is split at its first space: the part before it is used as the
     zip code and everything after it as the city.
     """
     # Split only once so that multi-word city names stay intact.
     pieces = item.get('Ort').split(' ', 1)
     zip_code = pieces[0]
     city_name = pieces[1]
     return School(
         name=item.get('name'),
         id=f"RP-{item.get('id')}",
         address=item.get('Adresse'),
         zip=zip_code,
         city=city_name,
         website=item.get('Internet'),
         email=item.get('E-Mail'),
         school_type=item.get('Schulform'),
         fax=item.get('Fax'),
         phone=item.get('Telefon'),
     )
Esempio n. 13
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     ``Adresse`` is unpacked as a sequence: its last element is the place
     (zip code followed by the city), the one before it the street, and
     everything earlier is joined with spaces to form the name. The remaining
     fields are each passed through ``first_or_none``.
     """
     *name_lines, street, place = item.get('Adresse')
     zip_code, *city_words = place.split(" ")
     fields = {
         'name': ' '.join(name_lines),
         'id': f"BB-{item.get('id')}",
         'address': street,
         'zip': zip_code,
         'city': ' '.join(city_words),
         'website': first_or_none(item.get('Internet')),
         'email': first_or_none(item.get('E-Mail')),
         'school_type': first_or_none(item.get('Schulform')),
         'provider': first_or_none(item.get('Schulamt')),
         'fax': first_or_none(item.get('Fax')),
         'phone': first_or_none(item.get('Telefon')),
         'director': first_or_none(item.get('Schulleiter/in')),
     }
     return School(**fields)
Esempio n. 14
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     The name is assembled from the three ``Schulbezeichnung_*`` fields,
     separated by single spaces; empty or missing parts are skipped so they
     leave no stray whitespace. Phone and fax are the area-code field followed
     by the number field. Legal status and school type are translated through
     ``NordRheinWestfalenHelper.resolve``.
     """
     name_parts = (
         item.get("Schulbezeichnung_1", ""),
         item.get("Schulbezeichnung_2", ""),
         item.get("Schulbezeichnung_3", ""),
     )
     # Joining without a separator glued the name parts together.
     name = " ".join(part for part in name_parts if part).strip()

     def dial_string(prefix_key, number_key):
         # Skip missing pieces so absent fields do not render as the text
         # "None" inside the number.
         pieces = (item.get(prefix_key), item.get(number_key))
         return "".join(str(piece) for piece in pieces if piece is not None)

     helper = NordRheinWestfalenHelper()
     return School(
         name=name,
         id='NW-{}'.format(item.get('Schulnummer')),
         address=item.get('Strasse'),
         zip=item.get("PLZ"),
         city=item.get('Ort'),
         website=item.get('Homepage'),
         email=item.get('E-Mail'),
         legal_status=helper.resolve('rechtsform', item.get('Rechtsform')),
         school_type=helper.resolve('schulform', item.get('Schulform')),
         fax=dial_string('Faxvorwahl', 'Fax'),
         phone=dial_string('Telefonvorwahl', 'Telefon'))
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     ``Ort`` is split on whitespace: the first token is used as the zip code
     and the remaining tokens, re-joined with single spaces, as the city.
     ``fax`` is always ``None``.
     """
     tokens = item.get('Ort').split()
     zip_code = tokens[0]
     city_name = ' '.join(tokens[1:])
     return School(
         name=item.get('Schule'),
         phone=item.get('Tel'),
         fax=None,
         email=item.get('E-Mail'),
         website=item.get('Homepage'),
         address=item.get('Straße'),
         zip=zip_code,
         city=city_name,
         school_type=item.get("Schul-gliederung(en)"),
         id=f"NI-{item.get('Schulnummer')}",
     )
Esempio n. 16
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     Side effect: the ``Ansprechperson`` text is split into two entries that
     are written back onto ``item`` as ``Schulleitung`` and ``Vertretung``.

     The ``Anschrift:`` field is split around its first run of five digits:
     the text before it becomes the street address, the digits the zip code
     and the text after it the city. Phone and fax go through
     ``BremenSpider.fix_number``.

     Raises:
         KeyError: if the item has no ``Ansprechperson`` entry.
         IndexError: if ``Ansprechperson`` yields fewer than two parts, or
             ``Anschrift:`` contains no five-digit sequence.
         AttributeError: if ``Anschrift:`` or ``E-Mail-Adresse`` is missing.
     """
     # Drop the head-teacher label and turn the deputy label into a comma so
     # one split separates the two names.
     contacts = item['Ansprechperson'].replace(
         'Schulleitung:', '').replace('Vertretung:', ',').split(',')
     item['Schulleitung'] = contacts[0]
     item['Vertretung'] = contacts[1]

     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     zip_pattern = r'\d{5}'
     # Parse the field once instead of stripping and splitting it three times.
     postal_line = item.get('Anschrift:').strip()
     address_parts = re.split(zip_pattern, postal_line)
     zip_code = re.findall(zip_pattern, postal_line)[0]
     return School(name=item.get('name'),
                   id='HB-{}'.format(item.get('id')),
                   address=address_parts[0].strip(),
                   zip=zip_code,
                   city=address_parts[1].strip(),
                   website=item.get('Internet'),
                   email=item.get('E-Mail-Adresse').strip(),
                   fax=BremenSpider.fix_number(item.get('Telefax')),
                   phone=BremenSpider.fix_number(item.get('Telefon')))
Esempio n. 17
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item with nested records.

     The name is ``schulname`` plus ``namenszuatz`` joined by a space and
     stripped. Street, zip and city come from the first entry of
     ``sdb_adressen``; school type and provider are looked up through
     ``NiedersachsenSpider._get``.
     """
     # NOTE(review): 'namenszuatz' may be a misspelling of 'namenszusatz' -
     # verify against the source data before changing the key.
     name_parts = [item.get('schulname', ''), item.get('namenszuatz', '')]
     name = " ".join(name_parts).strip()

     first_address = item.get('sdb_adressen', [{}])[0]
     locality = first_address.get('sdb_ort', {})

     art_record = NiedersachsenSpider._get(item, 'sdb_art', {})
     school_type = art_record.get('art')
     provider_record = NiedersachsenSpider._get(item, 'sdb_traeger', {})
     provider = provider_record.get('name')

     legal_status = item.get("sdb_traegerschaft", {}).get('bezeichnung')
     return School(
         name=name,
         phone=item.get('telefon'),
         fax=item.get('fax'),
         email=item.get('email'),
         website=item.get('homepage'),
         address=first_address.get('strasse'),
         zip=locality.get('plz'),
         city=locality.get('ort'),
         school_type=school_type,
         provider=provider,
         legal_status=legal_status,
         id=f"NI-{item.get('schulnr')}",
     )
Esempio n. 18
0
 def process_item(self, item: Item, spider):
     """Pass each URL through once and drop repeats.

     :param item: the scraped item; its ``url`` entry is the dedup key.
     :param spider: the spider that produced the item (not used here).
     :return: ``item`` when its URL has not been seen before.
     :raises DropItem: when the URL is already in ``self.ids_seen``.
     """
     url = item['url']
     if url not in self.ids_seen:
         # First sighting: remember the URL and let the item continue.
         self.ids_seen.add(url)
         return item
     raise DropItem("Duplicate item found: %s" % item.get('url', item))
Esempio n. 19
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     Fields are copied over with ``item.get``; the id is the item's
     ``Dienststellenschlüssel`` value prefixed with ``SN-``. The postal
     address and ``phone_numbers`` are passed through unchanged.
     """
     school_id = f"SN-{item.get('Dienststellenschlüssel')}"
     return School(
         name=item.get('title'),
         id=school_id,
         address=item.get('Postanschrift'),
         website=item.get('Homepage'),
         email=item.get('E-Mail'),
         school_type=item.get('Einrichtungsart'),
         legal_status=item.get('Rechtsstellung'),
         provider=item.get('Schulträger'),
         fax=item.get('Telefax'),
         phone=item.get('phone_numbers'),
         director=item.get('Schulleiter'),
     )
Esempio n. 20
0
 def normalize(self, item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     Fields are copied over with ``item.get``; the id is the item's ``id``
     value prefixed with ``BE-``.
     """
     fields = {
         'name': item.get('name'),
         'id': f"BE-{item.get('id')}",
         'address': item.get('address'),
         'zip': item.get('zip'),
         'city': item.get('city'),
         'website': item.get('web'),
         'email': item.get('mail'),
         'school_type': item.get('schooltype'),
         'fax': item.get('fax'),
         'phone': item.get('telephone'),
         'director': item.get('headmaster'),
         'legal_status': item.get('legal_status'),
     }
     return School(**fields)
Esempio n. 21
0
    def insert_sql_values(self, table: str,
                          item: scrapy.Item) -> Tuple[str, Tuple]:
        """Return the insert statement and its parameters for one item.

        :param table: name of the target table; it is formatted directly into
            the statement text.
        :param item: the scraped item supplying the column values.
        :return: a ``(sql, parameters)`` pair where ``sql`` uses ``?``
            placeholders and ``parameters`` holds the fourteen column values
            in column order.

        The values are also written to the log at INFO level.
        """
        # (item key, fallback when the key is absent), in column order.
        columns = (
            (Item.MOVIE_ID_NAME, 0),
            (Item.TITLE_NAME, ""),
            (Item.DIRECTOR_NAME, ""),
            (Item.AUTHOR_NAME, ""),
            (Item.ACTOR_NAME, ""),
            (Item.REGION_NAME, ""),
            (Item.LANG_NAME, ""),
            (Item.GENRE_NAME, ""),
            (Item.RELEASE_NAME, ""),
            (Item.EPISODE_NAME, ""),
            (Item.DURATION_NAME, ""),
            (Item.RUNTIME_NAME, ""),
            (Item.AVERAGE_NAME, ""),
            (Item.VOTES_NAME, ""),
        )
        values = tuple(item.get(key, fallback) for key, fallback in columns)

        log_template = ("INSERT OR IGNORE INTO %s VALUES("
                        "%d, '%s', '%s', '%s', '%s', '%s', '%s', "
                        "'%s', '%s', '%s', '%s', '%s', '%s', '%s')")
        logging.info(log_template, table, *values)

        statement = f"INSERT OR IGNORE INTO {table} VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
        return (statement, values)
Esempio n. 22
0
    def normalize(item: Item) -> School:
        """Build a ``School`` from a raw scraped item, tolerating gaps.

        * ``phone_numbers`` is treated as a mapping; its first value becomes
          the phone, or ``None`` when the mapping is empty or missing.
        * ``Postanschrift`` is split around its first run of five digits into
          street address, zip code and city. When it contains no such run,
          the whole text is used as the city and address/zip are empty
          strings. A missing value is treated as empty text.
        * The director is ``Schulleiter``, falling back to ``Schulleiter/in``.
        """
        numbers = list((item.get('phone_numbers') or {}).values())
        phone = numbers[0] if numbers else None

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        zip_pattern = r'\d{5}'
        postal_line = (item.get('Postanschrift') or '').strip()
        # re.split always returns at least one element, so only the
        # "no zip code found" and "zip code found" cases exist.
        parts = re.split(zip_pattern, postal_line)
        if len(parts) == 1:
            address = ''
            zip_code = ''
            city = parts[0].strip()
        else:
            address = parts[0].strip()
            zip_code = re.findall(zip_pattern, postal_line)[0]
            city = parts[1].strip()

        return School(name=item.get('title'),
                      id='SN-{}'.format(item.get('Dienststellenschlüssel')),
                      address=address,
                      zip=zip_code,
                      city=city,
                      website=item.get('Homepage'),
                      email=item.get('E-Mail'),
                      school_type=item.get('Einrichtungsart'),
                      legal_status=item.get('Rechtsstellung'),
                      provider=item.get('Schulträger'),
                      fax=item.get('Telefax'),
                      phone=phone,
                      director=item.get('Schulleiter')
                      or item.get('Schulleiter/in'))
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     Fields are copied over with ``item.get``; the id is the item's ``id``
     value prefixed with ``BW-``. ``school_type`` is always the empty string.
     """
     school_id = f"BW-{item.get('id')}"
     return School(
         name=item.get('name'),
         id=school_id,
         address=item.get('Strasse'),
         zip=item.get('PLZ'),
         city=item.get('Ort'),
         website=item.get('Internet'),
         email=item.get('E-Mail'),
         fax=item.get('Fax'),
         phone=item.get('Telefon'),
         provider=item.get('Schulamt'),
         director=item.get('Schulleitung'),
         school_type='',
     )
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     ``Dst-Nr.:`` and ``Plz`` are converted to text with a trailing ``.0``
     removed. A missing ``Plz`` stays ``None`` instead of becoming the text
     "None". The id is the cleaned ``Dst-Nr.:`` prefixed with ``MV-``.
     ``address2`` is always the empty string.
     """

     def as_plain_text(value):
         # NOTE(review): presumably these numbers arrive as floats from a
         # spreadsheet, hence the ".0" - confirm against the data source.
         if value is None:
             return None
         text = str(value)
         # Only drop a trailing ".0"; a blanket replace would also remove
         # ".0" from the middle of a value.
         return text[:-2] if text.endswith('.0') else text

     dst = as_plain_text(item.get('Dst-Nr.:'))
     plz = as_plain_text(item.get('Plz'))
     return School(name=item.get('Schulname'),
                   id='MV-{}'.format(dst),
                   address=item.get('Straße, Haus-Nr.'),
                   address2='',
                   zip=plz,
                   city=item.get('Ort'),
                   website=item.get('Homepage'),
                   email=item.get('E-Mail'),
                   school_type=item.get('Schulart/ Org.form'),
                   fax=item.get('Telefax'),
                   phone=item.get('Telefon'),
                   provider=item.get('Schul-behörde'),
                   director=item.get('Schulleitung'))
Esempio n. 25
0
 def normalize(item: Item) -> School:
     """Build a ``School`` from a raw scraped item.

     ``Postanschrift`` is split around its first run of five digits: the text
     before it becomes the street address, the digits the zip code and the
     text after it the city. The phone is the first value of the
     ``phone_numbers`` mapping; the director is ``Schulleiter``, falling back
     to ``Schulleiter/in``.

     Raises:
         AttributeError: if ``Postanschrift`` or ``phone_numbers`` is missing.
         IndexError: if ``Postanschrift`` contains no five-digit sequence or
             ``phone_numbers`` is empty.
     """
     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     zip_pattern = r'\d{5}'
     # Parse the field once instead of stripping and splitting it three times.
     postal_line = item.get('Postanschrift').strip()
     parts = re.split(zip_pattern, postal_line)
     zip_code = re.findall(zip_pattern, postal_line)[0]
     return School(name=item.get('title'),
                   id='SN-{}'.format(item.get('Dienststellenschlüssel')),
                   address=parts[0].strip(),
                   zip=zip_code,
                   city=parts[1].strip(),
                   website=item.get('Homepage'),
                   email=item.get('E-Mail'),
                   school_type=item.get('Einrichtungsart'),
                   legal_status=item.get('Rechtsstellung'),
                   provider=item.get('Schulträger'),
                   fax=item.get('Telefax'),
                   phone=list(item.get('phone_numbers').values())[0],
                   director=item.get('Schulleiter') or item.get('Schulleiter/in'))
Esempio n. 26
0
    def normalize(item: Item) -> School:
        """Build a ``School`` with coordinates from a raw scraped item.

        The name is assembled from the three ``Schulbezeichnung_*`` fields,
        separated by single spaces; empty or missing parts are skipped so they
        leave no doubled spaces. Phone and fax are the area-code field
        followed by the number field. Legal status, school type and provider
        are translated through ``NordRheinWestfalenHelper.resolve``. The
        ``UTMRechtswert``/``UTMHochwert`` pair is transformed from the item's
        ``EPSG`` projection to ``epsg:4326``.
        """
        name_parts = (
            item.get("Schulbezeichnung_1", ""),
            item.get("Schulbezeichnung_2", ""),
            item.get("Schulbezeichnung_3", ""),
        )
        name = " ".join(part for part in name_parts if part).strip()

        def dial_string(prefix_key, number_key):
            # Skip missing pieces so absent fields do not render as the text
            # "None" inside the number.
            pieces = (item.get(prefix_key), item.get(number_key))
            return "".join(str(piece) for piece in pieces if piece is not None)

        helper = NordRheinWestfalenHelper()
        right, high = item.get('UTMRechtswert'), item.get('UTMHochwert')
        this_projection = Proj(item.get('EPSG'))
        target_projection = Proj('epsg:4326')
        # NOTE(review): depending on the pyproj version, 'epsg:4326' output may
        # be ordered (lat, lon) rather than (lon, lat) - confirm the
        # unpacking order below against real data.
        lon, lat = transform(this_projection, target_projection, right, high)

        return School(
            name=name,
            id='NW-{}'.format(item.get('Schulnummer')),
            address=item.get('Strasse'),
            zip=item.get("PLZ"),
            city=item.get('Ort'),
            website=item.get('Homepage'),
            email=item.get('E-Mail'),
            legal_status=helper.resolve('rechtsform', item.get('Rechtsform')),
            school_type=helper.resolve('schulform', item.get('Schulform')),
            provider=helper.resolve('provider', item.get('Traegernummer')),
            fax=dial_string('Faxvorwahl', 'Fax'),
            phone=dial_string('Telefonvorwahl', 'Telefon'),
            latitude=lat,
            longitude=lon,
        )