def parse_detail(self, response):
        """Scrape one company detail page and yield its record as a dict.

        Reads the category, title, address, city tag and the
        ``field-field-*`` contact fields from ``response``. If the page has
        an ``<img>`` with a non-empty ``src``, it is handed to
        ``helpers.download`` with the target
        ``images/<self.name>/<slug>.<ext>``.

        Every selector that matches nothing contributes an empty string, so
        a page with missing fields still produces a record instead of
        raising.

        Yields:
            dict: a single item whose values are all stripped strings.
        """

        def first_text(selector):
            # First match for the selector, or '' when nothing matches.
            return response.css(selector).get() or ''

        def field_text(field):
            # Text of the first item inside the ``field-field-<field>`` block.
            return first_text(
                '.field-field-{} .field-item::text'.format(field))

        category = first_text('.meta .category a::text')
        name = helpers.fix_title(first_text('h2::text'))
        slug = helpers.get_slug(name)
        address = first_text('.node p::text')
        city = first_text('.meta .tags a::text')
        phone = field_text('telepon')
        fax = field_text('fax')
        email = field_text('email')
        website = field_text('website')
        broker = field_text('broker')
        # Drop the 'NPWP' label and the separators around the number.
        # field_text() already yields '' for a missing field, so the string
        # methods never run on None.
        npwp = field_text('npwp').replace('NPWP', '').strip('\n :')
        description = ''
        url = response.url or ''
        image_name = ''

        # Discard website values that contain the spider's name
        # (presumably links back to the scraped directory itself).
        if self.name in website:
            website = ''

        # Only attempt a download when the page really carries an image URL;
        # an empty or whitespace-only src is treated as "no image".
        image_url = (response.css('img::attr(src)').get() or '').strip()
        if image_url:
            ext = image_url.split('.')[-1]
            image_name = slug
            target_dir = 'images/{}/{}.{}'.format(self.name, image_name, ext)
            self.logger.info('downloading image: {} => {}'.format(
                image_url, target_dir))
            # A falsy result from helpers.download is reported as a failure.
            if not helpers.download(image_url, target_dir):
                self.logger.info('Failed download {} => {}'.format(
                    image_url, target_dir))

        yield {
            'category': category.strip(),
            'name': name.strip(),
            'slug': slug.strip(),
            'address': address.strip(),
            'city': city.strip(),
            'phone': phone.strip(),
            'fax': fax.strip(),
            'email': email.strip(),
            'website': website.strip(),
            'broker': broker.strip(),
            'npwp': npwp.strip(),
            'description': description.strip(),
            'url': url.strip(),
            'image_name': image_name.strip(),
        }
Esempio n. 2
0
    # Normalise every value of ``data`` in place (keys are left untouched).
    for k, v in data.items():
        # remove_unicode is defined outside this fragment.
        v = remove_unicode(v)
        # Collapse double spaces. Each pass roughly halves a run of spaces,
        # so three passes may still leave more than one space in very long
        # runs.
        v = v.replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')
        data[k] = v

    # The same (mutated) mapping is handed back to the caller.
    return data


print('Load done data...')
# Maps the slug derived from each stored record to that record's URL.
done = {}
with open(file_reputasi, 'r', encoding='utf8') as f:
    # The file holds one JSON object per line. Stream it instead of reading
    # it whole, and skip blank lines so an empty file or a stray newline
    # does not make json.loads() fail.
    for line in f:
        line = line.strip()
        if not line:
            continue
        row = json.loads(line)
        # Keyed on the record's stored 'slug' field (not its 'name').
        done[helpers.get_slug(helpers.fix_title(row['slug']), '',
                              True)] = row['url']
print('{} done data loaded'.format(len(done)))

print('Load perusahaan data...')
perusahaan = []
# Per-reason counters, all starting at zero; the code that increments them
# is outside this section.
skipped_counter = {
    'done': 0,
    'empty_name': 0,
    'empty_address': 0,
    'empty_phone': 0,
    'empty_email': 0,
    'invalid_phone': 0,
    'invalid_email': 0,
}
with open(file_source, 'r') as f:
    'website': 0,
}
# Values already seen while cleaning; used below for duplicate detection.
# NOTE(review): the code that appends to these lists is not visible here.
done_slug = []
done_email = []
done_phone = []
done_website = []
clean = []
print("INFO: start cleaning...")
for row in data:
    # print(row[COL_NAME])
    # Normalise the raw row before reading any column from it.
    row = clean_data(row)
    category = row[COL_CATEGORY]
    # sc = category.lower()
    # if sc not in categories:
    #     categories.append(sc)
    name = helpers.fix_title(row[COL_NAME])
    slug = helpers.get_slug(name)
    email = row[COL_EMAIL]
    phone = row[COL_PHONE]
    website = row[COL_WEBSITE]
    city = row[COL_CITY]
    if len(city) == 0:
        # No city column: fall back to the last space-separated token of the
        # address and store it back on the row.
        city = row[COL_ADDRESS].strip().split(' ')[-1].strip()  #.lower()
        row[COL_CITY] = city
    # Skip rows whose slug was already seen, counting them as duplicates.
    if slug in done_slug:
        duplicate['slug'] += 1
        print('INFO: dp slug => {}'.format(slug))
        continue
    # Count and report rows whose email was already seen.
    if email in done_email:
        duplicate['email'] += 1
        print('INFO: dp email => {}'.format(email))
    def parse_detail(self, response):
        """Scrape one business listing page and yield it if it is contactable.

        Extracts category (last breadcrumb link), name, address, city,
        phone, Cloudflare-obfuscated email, website and description from
        ``response``. A record is yielded only when both email and phone are
        non-empty; otherwise the page is just logged and skipped. For
        yielded records the listing image, if any, is passed to
        ``helpers.download``.

        Yields:
            dict: at most one item, all values stripped strings.
        """
        # The category is the text of the last breadcrumb link; a page
        # without breadcrumbs simply has no category.
        crumbs = response.css('ol.breadcrumb.pull-left > li > a')
        category = (crumbs[-1].css('::text').get() or '') if crumbs else ''
        name = response.css('h1.business-title span::text').get() or ''
        city = response.css('span[itemprop=addressLocality]::text').get() or ''
        phone = response.css('span[itemprop=telephone]::text').get() or ''
        website = response.css(
            'ul.dropdown-menu > li > a[itemprop=url]::attr(href)').get() or ''
        url = response.url or ''

        # email: the address is stored encoded in a data-cfemail attribute.
        # Decoding is best effort; a value that cannot be decoded is treated
        # as "no email" and reported instead of being swallowed silently.
        email = ''
        cfemail = response.css(
            'span.__cf_email__::attr(data-cfemail)').get() or ''
        if cfemail:
            try:
                email = helpers.cfDecodeEmail(cfemail) or ''
            except Exception:
                self.logger.warning(
                    '{} : cannot decode email {!r}'.format(url, cfemail))
                email = ''

        # address: interleave the nested-span text nodes with the direct
        # text nodes of the outer span. The two lists may differ in length,
        # so only pair up entries that actually exist.
        address_1 = response.css('h4 > span > span::text')
        address_2 = response.css('h4 > span::text')
        address_parts = []
        for index, a1 in enumerate(address_1):
            address_parts.append(a1.get().strip())
            if index < len(address_2):
                address_parts.append(address_2[index].get().strip())
        address = ' '.join(address_parts).replace(' ,', ',')

        # description: join the paragraphs with '. ' and then tidy up the
        # doubled full stops the join can produce.
        description = '. '.join(
            (txt.css('::text').get() or '').strip()
            for txt in response.css('.col-sm-12 > p p'))
        description = description.replace('..', '.')
        description = description.replace('. . ', '. ')
        description = description.replace('. . ', '. ')

        if not email:
            self.logger.info('{} : EMPTY EMAIL'.format(url))
        if not phone:
            self.logger.info('{} : EMPTY PHONE'.format(url))

        # Records without both an email and a phone are not emitted.
        if not email or not phone:
            return

        # Skip empty src values and URLs ending in '/' (no file name).
        image_url = (response.css(
            '.detail-listing-img > img::attr(src)').get() or '').strip()
        if image_url and not image_url.endswith('/'):
            image_name = helpers.get_slug(helpers.fix_title(name))
            # NOTE(review): no file extension is appended to the target
            # here - confirm helpers.download adds one itself.
            target_dir = 'images/{}/{}'.format(self.name, image_name)
            self.logger.info('downloading image: {} => {}'.format(
                image_url, target_dir))
            helpers.download(image_url, target_dir)

        yield {
            'category': category.strip(),
            'name': name.strip(),
            'address': address.strip(),
            'city': city.strip(),
            'phone': phone.strip(),
            'email': email.strip(),
            'website': website.strip(),
            'description': description.strip(),
            'url': url.strip(),
        }
Esempio n. 5
0
    def parse_detail(self, response):
        """Scrape one company page in either of two layouts and yield it.

        Layout 1 is detected by ``.comp-body li`` items holding
        ``label: value`` text; layout 2 by rows of
        ``table.table.description``. When neither is present, all fields
        stay empty. The company image, if any, is passed to
        ``helpers.download`` with the target
        ``images/<self.name>/<slug>.<ext>``.

        Missing elements contribute empty strings instead of raising.

        Yields:
            dict: a single item whose values are all stripped strings.
        """
        category = ''
        name = ''
        address = ''
        city = ''
        phone = ''
        # NOTE(review): fax is collected below but is not part of the
        # yielded item - confirm whether it should be emitted.
        fax = ''
        email = ''
        website = ''
        description = ''
        url = response.url or ''
        image_url = ''
        # check type
        lis = response.css('.comp-body li')
        trs = response.css('table.table.description tr')
        if lis:
            # type 1
            for li in lis:
                # First text node of the item; '' when the item has none.
                text = (li.css('::text').get() or '').strip()
                pieces = text.split(':')
                k = pieces[0].strip()
                v = pieces[-1].strip()
                if not k:
                    continue
                if 'Company Name' in k:
                    name = v
                elif 'Address' in k:
                    address = v
                elif 'Telephone' in k:
                    phone = li.css('a::text').get()
                elif 'Fax' in k:
                    fax = v
                elif 'Email' in k:
                    email = li.css('a::text').get()
            # description: every non-empty paragraph text except the
            # 'Description' heading itself.
            paragraphs = (
                p.get().strip() for p in response.css('.comp-row > p::text'))
            description = ' '.join(
                txt for txt in paragraphs
                if txt and 'Description' not in txt)
            # website
            website = response.css('.comp-row > p > a::attr(href)').get() or ''
            if self.name in website:
                website = ''
            # category: last text node of the title column, if there is one.
            titles = response.css('.title-comp .col-sm-10::text')
            category = titles[-1].get() if titles else ''
            # image_url
            image_url = response.css('.img-container img::attr(src)').get() or ''
        elif trs:
            # type 2
            for tr in trs:
                cells = tr.css('td::text')
                if not cells:
                    # Row without any cell text: nothing to read.
                    continue
                k = cells[0].get()
                v = cells[-1].get()
                if not k:
                    continue
                last_td = tr.css('td')[-1]
                if 'Nama Perusahaan' in k:
                    name = v
                elif 'Alamat' in k:
                    address = last_td.css('p::text').get() or ''
                elif 'Kategori' in k:
                    category = v
                elif 'Telepon' in k:
                    phone = last_td.css('a::text').get()
                elif 'Fax' in k:
                    fax = last_td.css('a::text').get() or ''
                elif 'Email' in k:
                    email = last_td.css('a::text').get()
            # description
            paragraphs = (
                p.get().strip() for p in response.css('.container > p::text'))
            description = ' '.join(txt for txt in paragraphs if txt)
            # website
            website = response.css(
                'a.btn.btn-contactus.btn-go-to::attr(href)').get() or ''
            if self.name in website:
                website = ''
            # image_url
            image_url = response.css('img.center-img::attr(src)').get() or ''

        # phone/email may be None when the link element is missing;
        # normalise to '' and report the gap.
        if not email:
            self.logger.info('{} : EMPTY EMAIL'.format(url))
            email = ''
        if not phone:
            self.logger.info('{} : EMPTY PHONE'.format(url))
            phone = ''

        name = helpers.fix_title(name)
        slug = helpers.get_slug(name)
        # Strip before testing so a whitespace-only src counts as "no image".
        image_url = (image_url or '').strip()
        if image_url:
            ext = image_url.split('.')[-1]
            image_name = slug
            target_dir = 'images/{}/{}.{}'.format(self.name, image_name, ext)
            self.logger.info('downloading image: {} => {}'.format(image_url, target_dir))
            # A falsy result from helpers.download is reported as a failure.
            if not helpers.download(image_url, target_dir):
                self.logger.info('Failed download {} => {}'.format(image_url, target_dir))
        yield {
            'category': category.strip(),
            'name': name.strip(),
            'slug': slug.strip(),
            'address': address.strip(),
            'city': city.strip(),
            'phone': phone.strip(),
            'email': email.strip(),
            'website': website.strip(),
            'description': description.strip(),
            'url': url.strip(),
        }
Esempio n. 6
0
    def parse_detail(self, response):
        """Scrape one company page built from titled panels and yield it.

        Category and name come from the last two breadcrumb items. Address,
        phone/fax, website, email and description are read from the
        ``.panel`` whose title contains 'Alamat', 'Telepon', 'Website',
        'Email' or 'Tentang' respectively.

        Missing breadcrumbs, panel titles, links or text nodes contribute
        empty strings instead of raising.

        Yields:
            dict: a single item whose values are all stripped strings.
        """
        crumbs = response.css('.breadcrumb li')
        # Second-to-last crumb is the category, last crumb is the name.
        category = ''
        if len(crumbs) >= 2:
            category = crumbs[-2].css('::text').get() or ''
        raw_name = ''
        if crumbs:
            raw_name = crumbs[-1].css('::text').get() or ''
        name = helpers.fix_title(raw_name)
        slug = helpers.get_slug(name)
        address = ''
        city = ''
        phone = ''
        fax = ''
        email = ''
        website = ''
        description = ''
        url = response.url or ''

        for panel in response.css('.panel'):
            # A panel without a title matches none of the branches below.
            panel_title = (
                panel.css('.col-xs-10.col-sm-11::text').get() or '').strip()
            if 'Alamat' in panel_title:
                address = ', '.join(
                    addr.get().strip()
                    for addr in panel.css('.panel-body::text'))
            elif 'Telepon' in panel_title:
                # First text node is the phone, the second (if any) the fax.
                phones = panel.css('.panel-body::text')
                if phones:
                    phone = phones[0].get().strip()
                    if len(phones) > 1:
                        fax = phones[1].get().strip()
            elif 'Website' in panel_title:
                website = (
                    panel.css('.panel-body a::attr(href)').get() or '').strip()
                # Links containing the spider's first allowed domain are
                # discarded.
                if self.allowed_domains[0] in website:
                    website = ''
            elif 'Email' in panel_title:
                email = panel.css('.panel-body a::text').get() or ''
            elif 'Tentang' in panel_title:
                description = ' '.join(
                    desc.get().strip()
                    for desc in panel.css('.panel-body::text')).strip()
                if not description:
                    # Fall back to the first paragraph of at least 200
                    # characters.
                    for desc in panel.css('.panel-body p::text'):
                        desc = desc.get().strip()
                        if len(desc) >= 200:
                            description = desc
                            break

        yield {
            'category': category.strip(),
            'name': name.strip(),
            'slug': slug.strip(),
            'address': address.strip(),
            'city': city.strip(),
            'phone': phone.strip(),
            'fax': fax.strip(),
            'email': email.strip(),
            'website': website.strip(),
            'description': description.strip(),
            'url': url.strip(),
        }