Example #1
0
def testHmlParser(xmlFileName=None,
                  outputDirectory=None,
                  alignSequences=False):
    """Run the HML parsing and validation steps on one file and print results.

    Args:
        xmlFileName: Path of the HML/XML file to read.
        outputDirectory: Directory handed to ``hmlObject.tobiotype`` and to
            ``ParseXml.extrapolateConsensusFromVariants``.
        alignSequences: Forwarded unchanged to
            ``ParseXml.extrapolateConsensusFromVariants``.

    Returns:
        None. Every intermediate result is printed to stdout.
    """
    print('Testing the HML Parser with filename:' + str(xmlFileName))
    # Use a context manager so the file handle is closed deterministically,
    # even when read() raises.
    with open(xmlFileName, 'r') as xmlFile:
        xmlText = xmlFile.read()

    hmlObject = ParseXml.parseXmlFromText(xmlText=xmlText)
    sampleIds = ParseXml.getSampleIDs(hml=hmlObject)
    hmlId = ParseXml.getHmlid(xmlText=xmlText)
    glStrings = ParseXml.getGlStrings(hml=hmlObject)
    print('I found this HMLID:' + str(hmlId))
    print('I found these SampleIDs:' + str(sampleIds))
    print('I found this glStrings:' + str(glStrings))

    glStringValidity, glStringValidationFeedback = Validation.validateGlStrings(
        glStrings=glStrings)
    print('glstringValidity:' + str(glStringValidity))
    print('glStringValidationFeedback:' + str(glStringValidationFeedback))

    # Write some data from the HML to file (These are named based on sample ID)
    hmlObject.tobiotype(outputDirectory, dtype='fasta', by='subject')
    # The XML directory is resolved relative to the current working directory.
    xmlDirectory = join(getcwd(), 'XmlValidator/xml')

    isValid, validationResults = ParseXml.extrapolateConsensusFromVariants(
        hml=hmlObject,
        outputDirectory=outputDirectory,
        xmlDirectory=xmlDirectory,
        alignSequences=alignSequences)
    print('IsValid:' + str(isValid))
    print('validationResults:' + str(validationResults))
Example #2
0
 def writeToFile(self, folder, index, spec=False, keepPriceList=False):
     """Write this object's JSON to ``<folder>/<index>_<name>.json``.

     The index is zero-padded to six digits and the name part is
     ``Validator.RevalidName(self.name)``.

     folder        -- directory part of the output path.
     index         -- integer used as the numeric file-name prefix.
     spec          -- when equal to True the JSON goes through ``codecs.open``
                      as ``unicode(...).encode("utf8")``; otherwise it goes
                      through ``io.open`` after ``decode('unicode-escape')``.
     keepPriceList -- forwarded unchanged to ``self.toJSON``.

     Any failure is printed and then re-raised to the caller.
     """
     try:
         paddedIndex = '%0*d' % (6, index)
         pathPrefix = folder + "/" + paddedIndex + "_"
         outputFile = pathPrefix + Validator.RevalidName(self.name) + ".json"
         if not (spec == True):
             with io.open(outputFile, 'w', encoding='utf-8') as target:
                 # toJSON is only called once the file has been opened.
                 payload = self.toJSON(keepPriceList).decode('unicode-escape')
                 target.write(payload)
         else:
             with codecs.open(outputFile, "w", encoding='utf-8') as target:
                 payload = unicode(self.toJSON(keepPriceList)).encode("utf8")
                 target.write(payload)
     except BaseException as ex:
         print('Error when write json file: ', ex)
         raise
Example #3
0
 def writeToFile(self, folder, index, filename, spec=False):
     """Write this object's JSON to ``<folder>/<index>_<filename>.json``.

     folder   -- directory part of the output path.
     index    -- integer prefix, zero-padded to seven digits.
     filename -- base name; cleaned with ``Validator.RevalidName`` when it is
                 not None, then cut with ``Util.SubString`` if it is longer
                 than ``260 - len(folder) - 12`` characters.
     spec     -- when equal to True the JSON goes through ``codecs.open`` as
                 ``unicode(...).encode("utf8")``; otherwise it goes through
                 ``io.open`` after ``decode('unicode-escape')``.

     Any failure is printed and then re-raised to the caller.
     """
     try:
         cleanName = filename
         if cleanName != None:
             cleanName = Validator.RevalidName(cleanName)
         paddedIndex = '%0*d' % (7, index)
         nameLength = len(cleanName)
         # Length budget left for the name once the folder and 12 further
         # characters are subtracted from 260.
         nameLimit = 260 - len(folder) - 12
         if nameLength > nameLimit:
             cleanName = Util.SubString(cleanName, 260 - len(folder) - 12)
         outputFile = folder + "/" + paddedIndex + "_" + cleanName + ".json"
         # The JSON is produced before the output file is opened.
         jObject = self.toJSON()
         if not (spec == True):
             with io.open(outputFile, 'w', encoding='utf-8') as target:
                 target.write(jObject.decode('unicode-escape'))
         else:
             with codecs.open(outputFile, "w", encoding='utf-8') as target:
                 target.write(unicode(jObject).encode("utf8"))
     except BaseException as ex:
         print('Error when write json file: ', ex)
         raise
 def doWork(self):
     self.outFile = self.folder + '/' + self._chain_ + '_' + Validation.RevalidName(self.__name__) + '_Venues.csv'
     self.phoneCodeList = Util.getPhoneCodeList()
     self.__getListCities()
     
     if len(self.listCities) > 0:
         self.listCities = list(set(self.listCities))
         self.__getListVenues()
         if len(self._lstVenues) > 0: 
             listWrite2File = []               
             for i in range(len(self._lstVenues)):
                 try:
                     ven = self.__VenueParser(self._lstVenues[i])
                     if ven != None:
                         listWrite2File.append(ven.toOrderDict(False))
                 except Exception,ex:
                     print "URL: " + self._lstVenues[i].scrape_page + ": " + ex.message
                     Util.log.error("URL: " + self._lstVenues[i].scrape_page + ": " + ex.message)                        
             
             Util.writelist2File(listWrite2File,self.outFile)
Example #5
0
 def __reValidInfo(self):
     """Pass this object's fields through the ``Validator`` helpers in place.

     Text fields are replaced by ``Validator.ReValidString`` results, URL
     fields by ``Validator.RevalidURL`` results and phone fields by
     ``Validator.ReValidPhone`` results.  The list-valued fields
     ``img_link``, ``hqdb_featured_ad_type`` and
     ``unidentified_phone_numbers`` are turned into a single bracketed,
     comma-separated string, or into None when they are empty.
     Returns nothing; all results are stored back on ``self``.
     """
     self.adid = Validator.ReValidString(self.adid)
     self.name = Validator.ReValidString(self.name)
     self.name_of_contact = Validator.ReValidString(self.name_of_contact)
     self.business_website = Validator.RevalidURL(self.business_website)
     self.areas_covered = Validator.ReValidString(self.areas_covered)
     self.formatted_address = Validator.ReValidString(
         self.formatted_address)
     self.description = Validator.ReValidString(self.description)
     # Non-empty image list -> "[url1, url2, ...]" string.
     if self.img_link != None and len(self.img_link) > 0:
         self.img_link = [Validator.RevalidURL(x) for x in self.img_link]
         self.img_link = '[' + ', '.join(self.img_link) + ']'
     # Empty image list -> None.  When the branch above ran, img_link is now
     # a string of at least two characters, so this test is false.
     if self.img_link != None and len(self.img_link) == 0:
         self.img_link = None
     # Ad types are lower-cased, cleaned and double-quoted before joining.
     if self.hqdb_featured_ad_type != None and len(
             self.hqdb_featured_ad_type) > 0:
         self.hqdb_featured_ad_type = [
             '"' + Validator.ReValidString(x.lower()) + '"'
             for x in self.hqdb_featured_ad_type
         ]
         self.hqdb_featured_ad_type = '[' + ', '.join(
             self.hqdb_featured_ad_type) + ']'
     elif self.hqdb_featured_ad_type != None and len(
             self.hqdb_featured_ad_type) == 0:
         self.hqdb_featured_ad_type = None
     self.hqdb_nr_reviews = Validator.ReValidString(self.hqdb_nr_reviews)
     self.hqdb_review_score = Validator.ReValidString(
         self.hqdb_review_score)
     self.accreditations = Validator.ReValidString(self.accreditations)
     self.scrape_page = Validator.RevalidURL(self.scrape_page)
     self.category = Validator.ReValidString(self.category)
     self.subcategory = Validator.ReValidString(self.subcategory)
     self.street = Validator.ReValidString(self.street)
     self.city = Validator.ReValidString(self.city)
     # The zipcode is cleaned with ReValidPhone; its second argument is 'pl'
     # when the country is 'pl' and 'phone' otherwise.  The country is read
     # here before it is cleaned a few lines below.
     # NOTE: the local name `type` shadows the builtin inside this method.
     type = 'phone'
     if self.country == 'pl':
         type = 'pl'
     self.zipcode = Validator.ReValidPhone(self.zipcode, type)
     self.country = Validator.ReValidString(self.country)
     self.office_number = Validator.ReValidPhone(self.office_number)
     self.mobile_number = Validator.ReValidPhone(self.mobile_number)
     self.office_number2 = Validator.ReValidPhone(self.office_number2)
     self.mobile_number2 = Validator.ReValidPhone(self.mobile_number2)
     # Non-empty phone list -> "[n1, n2, ...]" string; empty list -> None.
     if self.unidentified_phone_numbers != None and len(
             self.unidentified_phone_numbers) > 0:
         self.unidentified_phone_numbers = [
             Validator.ReValidPhone(x).strip()
             for x in self.unidentified_phone_numbers
         ]
         self.unidentified_phone_numbers = '[' + ", ".join(
             self.unidentified_phone_numbers) + ']'
     elif self.unidentified_phone_numbers != None and len(
             self.unidentified_phone_numbers) == 0:
         self.unidentified_phone_numbers = None
     self.latitude = Validator.ReValidString(self.latitude)
     self.longitude = Validator.ReValidString(self.longitude)
     # The e-mail is cleaned as a string first, then passed to RevalidEmail.
     self.business_email = Validator.RevalidEmail(
         Validator.ReValidString(self.business_email))
     self.yelp_page = Validator.RevalidURL(self.yelp_page)
     #self.facebook = Validator.RevalidURL(self.facebook)
     self.twitter = Validator.RevalidURL(self.twitter)
     #self.instagram = Validator.RevalidURL(self.instagram)
     self.venue_images = Validator.ReValidString(self.venue_images)
     # The coordinates are only checked when is_get_by_address is False; a
     # failed check is logged and both coordinates are cleared.
     # NOTE(review): the log message concatenates latitude/longitude
     # directly, so it presumably expects both to be strings here - confirm.
     if self.is_get_by_address == False:
         if Validator.ValidateGeoCode(self.formatted_address, self.country,
                                      self.latitude,
                                      self.longitude) == False:
             Util.log.invalid(
                 'GEO code', self.scrape_page + ': invalid GEO code (' +
                 self.latitude + ',' + self.longitude + ')')
             self.latitude = None
             self.longitude = None
     self.opening_hours_raw = Validator.ReValidString(
         self.opening_hours_raw)
Example #6
0
 def __reValidInfo(self):
     """Replace each text field with its ``Validator.ReValidString`` result.

     The fields are processed in this order: service_category, service,
     price, description.  Returns nothing; the values are stored on ``self``.
     """
     textFields = ('service_category', 'service', 'price', 'description')
     for fieldName in textFields:
         cleaned = Validator.ReValidString(getattr(self, fieldName))
         setattr(self, fieldName, cleaned)
Example #7
0
    def __reValidInfoJSON(self):
        """Clean and check this object's fields in place for JSON output.

        Text fields are replaced by ``Validator.ReValidString`` results.
        URL and e-mail fields are checked with ``Validator.RevalidURL`` /
        ``Validator.RevalidEmail``; a field whose check compares equal to
        False is set to None (some of these are also logged).  Phone fields
        go through ``Validator.ReValidPhone`` and then
        ``Validator.ValidPhone``.

        Returns False when ``img_link`` or ``pricelist_link`` is set but is
        not a list, and True otherwise.
        """
        # Spaces in URLs are percent-encoded before any check.
        self.scrape_page = self.scrape_page.replace(' ', '%20')
        self.adid = Validator.ReValidString(self.adid)
        self.name = Validator.ReValidString(self.name)
        self.name_of_contact = Validator.ReValidString(self.name_of_contact)
        if self.business_website != None:
            self.business_website = self.business_website.replace(' ', '%20')
            if Validator.RevalidURL(self.business_website) == False:
                Util.log.running_logger.error('{0}: {1}: {2}'.format(
                    self.scrape_page, 'Invalid URL', self.business_website))
                self.business_website = None
        self.areas_covered = Validator.ReValidString(self.areas_covered)
        self.formatted_address = Validator.ReValidString(
            self.formatted_address)
        self.description = Validator.ReValidString(self.description)
        # Keep only the image links that pass the URL check; rejected links
        # are logged.  A non-list img_link aborts the whole method.
        if self.img_link != None and isinstance(self.img_link, list) == True:
            img_temp = []
            for img in self.img_link:
                img = img.replace(' ', '%20')
                if Validator.RevalidURL(img) == False:
                    Util.log.running_logger.error('{0}: {1}: {2}'.format(
                        self.scrape_page, 'Invalid Img_Link', img))
                else:
                    img_temp.append(img)
            self.img_link = img_temp
        elif self.img_link != None:
            print 'img_link: is not a list'
            return False
        self.hqdb_featured_ad_type = Validator.ReValidString(
            self.hqdb_featured_ad_type)
        self.hqdb_nr_reviews = Validator.ReValidString(self.hqdb_nr_reviews)
        self.hqdb_review_score = Validator.ReValidString(
            self.hqdb_review_score)
        self.hqdb_ad_posted = Validator.ReValidString(self.hqdb_ad_posted)
        self.accreditations = Validator.ReValidString(self.accreditations)
        self.category = Validator.ReValidString(self.category)
        self.subcategory = Validator.ReValidString(self.subcategory)
        self.street = Validator.ReValidString(self.street)
        self.city = Validator.ReValidString(self.city)
        # The zipcode is cleaned with ReValidPhone; its second argument is
        # 'pl' when the (not yet cleaned) country is 'pl', else 'phone'.
        # NOTE: the local name `type` shadows the builtin inside this method.
        type = 'phone'
        if self.country == 'pl':
            type = 'pl'
        self.zipcode = Validator.ReValidPhone(self.zipcode, type)
        self.country = Validator.ReValidString(self.country)
        # NOTE(review): getFullAddress() presumably rebuilds
        # formatted_address from the parts cleaned above, which would explain
        # the second ReValidString pass - confirm against its definition.
        self.getFullAddress()
        self.formatted_address = Validator.ReValidString(
            self.formatted_address)
        if self.formatted_address != None:
            self.formatted_address = self.formatted_address.replace(',,', ',')

        # Each phone field is cleaned, then validated with the country and
        # the scrape page as extra arguments.
        self.office_number = Validator.ReValidPhone(self.office_number)
        self.office_number = Validator.ValidPhone(self.office_number,
                                                  self.country,
                                                  self.scrape_page)

        self.mobile_number = Validator.ReValidPhone(self.mobile_number)
        self.mobile_number = Validator.ValidPhone(self.mobile_number,
                                                  self.country,
                                                  self.scrape_page)

        self.office_number2 = Validator.ReValidPhone(self.office_number2)
        self.office_number2 = Validator.ValidPhone(self.office_number2,
                                                   self.country,
                                                   self.scrape_page)
        # Promote the secondary office number when the primary one is None.
        if self.office_number2 != None and self.office_number == None:
            self.office_number = self.office_number2
            self.office_number2 = None

        self.mobile_number2 = Validator.ReValidPhone(self.mobile_number2)
        self.mobile_number2 = Validator.ValidPhone(self.mobile_number2,
                                                   self.country,
                                                   self.scrape_page)
        # Promote the secondary mobile number when the primary one is None.
        if self.mobile_number2 != None and self.mobile_number == None:
            self.mobile_number = self.mobile_number2
            self.mobile_number2 = None

        # Clean, validate and drop the None results; an empty list becomes
        # None.
        if self.unidentified_phone_numbers != None and len(
                self.unidentified_phone_numbers) > 0:
            self.unidentified_phone_numbers = [
                Validator.ReValidPhone(x).strip()
                for x in self.unidentified_phone_numbers
            ]
            self.unidentified_phone_numbers = [
                Validator.ValidPhone(x, self.country, self.scrape_page)
                for x in self.unidentified_phone_numbers
            ]
            self.unidentified_phone_numbers = [
                x for x in self.unidentified_phone_numbers if x != None
            ]
        elif self.unidentified_phone_numbers != None and len(
                self.unidentified_phone_numbers) == 0:
            self.unidentified_phone_numbers = None
        self.latitude = Validator.ReValidString(self.latitude)
        self.longitude = Validator.ReValidString(self.longitude)
        if self.business_email != None:
            if Validator.RevalidEmail(self.business_email) == False:
                Util.log.running_logger.error('{0}: {1}: {2}'.format(
                    self.scrape_page, 'Invalid Email', self.business_email))
                self.business_email = None
        # yelp_page and twitter are cleared without logging when rejected.
        if self.yelp_page != None and Validator.RevalidURL(
                self.yelp_page) == False:
            self.yelp_page = None
        #if self.facebook != None and Validator.RevalidURL(self.facebook) == False:
        #    self.facebook = None

        if self.twitter != None and Validator.RevalidURL(
                self.twitter) == False:
            self.twitter = None
        #if self.instagram != None and Validator.RevalidURL(self.instagram) == False:
        #    self.instagram = None
        self.venue_images = Validator.ReValidString(self.venue_images)
        if self.venue_images != None:
            self.venue_images = self.venue_images.replace(' ', '%20')
            if Validator.RevalidURL(self.venue_images) == False:
                Util.log.running_logger.error('{0}: {1}: {2}'.format(
                    self.scrape_page, "Invalid Venue_Image: ",
                    self.venue_images))
                self.venue_images = None
        # The coordinates are only checked when is_get_by_address is False;
        # both are cleared when the check compares equal to False.
        if self.is_get_by_address == False:
            if Validator.ValidateGeoCode(self.formatted_address, self.country,
                                         self.latitude, self.longitude,
                                         self.scrape_page) == False:
                self.latitude = None
                self.longitude = None
        self.opening_hours_raw = Validator.ReValidString(
            self.opening_hours_raw)
        # pricelist_link: a non-list value aborts with False, an empty list
        # becomes None, otherwise spaces in each entry are percent-encoded.
        if self.pricelist_link != None and isinstance(self.pricelist_link,
                                                      list) == False:
            Util.log.running_logger.error(
                '[PriceListLink]: {0} is not List'.format(self.pricelist_link))
            return False
        elif self.pricelist_link != None and len(self.pricelist_link) == 0:
            self.pricelist_link = None
        elif self.pricelist_link != None:
            self.pricelist_link = [
                x.replace(' ', '%20') for x in self.pricelist_link
            ]
        return True
 def __VenueParser(self,ven):        
     """Fill ``ven`` with details scraped from its ``scrape_page``.

     The page is fetched through ``Util.getRequestsXML`` with an XPath that
     selects the header, image tab, map tab, seller section, description
     paragraph, category list and the script holding the phone token.
     Fields that can be set on ``ven``: website, category,
     formatted_address, img_link, latitude, longitude, name_of_contact,
     hqdb_ad_posted, description, mobile_number and office_number.

     Returns the same ``ven`` object, also when the fetch returned None.
     """
     url = ven.scrape_page
     print 'Scrapping: ' + url.encode('utf8').encode('string-escape')
     xpathVenue = '//header[@class="clearfix space-mbs"]|//div[@id="vip-tabs-images"]|//div[@id="vip-tabs-map"]|//section[@itemtype="http://schema.org/Person" and @data-sticky-header-target="reply.box.2"]|//p[@class="ad-description" and @itemprop="description"]|//ul[@class="inline-list-slash media-body"]|//script[contains(text(),"revealSellerTelephoneNumberToken")]'   
     # NOTE(review): the result is used through .xpath()/.find(), so it is
     # presumably an lxml-style element wrapping the matched nodes - confirm.
     xmlVenue = Util.getRequestsXML(url,xpathVenue)
     if xmlVenue != None:
         # Website: prefer the link's href, fall back to its text.
         website = xmlVenue.xpath('.//a[@class="truncate-line"]')
         if len(website) > 0:
             website = website[0]
             if website.get('href') != None:
                 ven.website = website.get('href')
             elif website.text != None:
                 ven.website = website.text.replace('\r','').replace('\n','').strip()
         # Category: the last non-blank text node under the <ul>.
         # NOTE(review): ul[-1] raises IndexError if every text node is blank.
         ul = xmlVenue.find('./ul')
         if ul != None and len(ul) > 0:
             ul = ul.xpath('.//text()')
             ul = [x.replace('\r','').replace('\n','').strip() for x in ul if x.replace('\r','').replace('\n','').strip() != '']
             ven.category = ul[-1]
         # Address: all text inside the header's <strong>.
         header = xmlVenue.find('./header')
         if header != None and header.find('./strong') != None:
             ven.formatted_address = "".join(header.find('./strong').itertext())
         # Images: use 'src' when it is non-blank; use 'data-lazy' only when
         # 'src' is missing altogether.
         listImage = xmlVenue.find('./div[@id="vip-tabs-images"]')
         if listImage != None:
             listImage = listImage.xpath('.//img')
             if len(listImage) > 0:
                 ven.img_link = []
                 for img in listImage:
                     if img.get('src') != None and img.get('src').strip() != '':
                         ven.img_link.append(img.get('src'))                            
                     elif img.get('src') == None and img.get('data-lazy') != None and img.get('data-lazy').strip() != '':
                         ven.img_link.append(img.get('data-lazy')) 
         # Coordinates come from the 'data-googlemap' attribute: latitude is
         # the text between 'latitude:' and the last ',', longitude is the
         # text after the last ':'.
         # NOTE: the local name `map` shadows the builtin inside this method.
         map = xmlVenue.find('./div[@id="vip-tabs-map"]')   
         if map != None and map.find('./div[@class="googlemap"]') != None and map.find('./div[@class="googlemap"]').get('data-googlemap') != None:
             map = map.find('./div[@class="googlemap"]').get('data-googlemap')
             lat = map[map.find('latitude:') + len('latitude:'):map.rfind(',')]                      
             lng = map[map.rfind(':') + 1:]
             try:
                 # The float() calls only check that both values are numeric;
                 # the original strings are what gets stored.
                 float1 = float(lat)
                 float2 = float(lng)
                 ven.latitude = lat
                 ven.longitude = lng
             except:
                 # Any failure is ignored and the coordinates are left as-is.
                 ''
         # Contact name ('/' replaced by '-') and ad posting text.
         section = xmlVenue.find('./section')
         if section != None:
             section = section.find('.//div[@class="media space-man h-underline-s"]/div[@class="media-body"]')                   
             if section != None:
                 h2 = section.find('./h2')
                 if h2 != None and h2.text != None:
                     ven.name_of_contact = Validation.ReValidString(h2.text.replace('/','-'))
                 span = section.find('./span')
                 if span != None and span.text != None:
                     ven.hqdb_ad_posted = span.text
         # Description: the non-blank text nodes joined with " | ".
         p = xmlVenue.find('./p')
         if p != None:
             p = p.xpath('.//text()')
             p = [x.replace('\r','').replace('\n','').strip() for x in p if x.replace('\r','').replace('\n','').strip() != '']
             ven.description = " | ".join(p)
         # Phone: cut the token out of the script text (from just after
         # 'revealSellerTelephoneNumberToken": "' up to the next '"') and
         # pass it with the ad id to __GetPhone.
         script = xmlVenue.find('./script')
         if script != None and script.text != None:
             script = script.text
             script = script[script.find('revealSellerTelephoneNumberToken": "') + len('revealSellerTelephoneNumberToken": "'):]
             script = script[0:script.find('"')]
             phoneJson = self.__GetPhone(ven.adid,script)
             if phoneJson != None and phoneJson.get('data') != None:
                 phone = phoneJson.get('data')
                 phone = Util.removesingleSpace(phone)                    
                 # A leading '0044' is rewritten to '+44'.
                 if phone.startswith('0044'):
                     phone = '+44' + phone[4:]
                 # With '+44' read as '0', numbers starting 06/07 fill
                 # mobile_number; all others fill office_number.  Neither
                 # field is overwritten when it already has a value.
                 if phone.replace('+44','0').startswith('06') or phone.replace('+44','0').startswith('07'):
                     if ven.mobile_number == None:
                         ven.mobile_number = phone                            
                 else:
                     if ven.office_number == None:
                         ven.office_number = phone
                         # Office numbers starting with '00000' are blanked.
                         if ven.office_number.startswith('00000'):
                             ven.office_number = ''                            
     return ven
Example #9
0
 def validate(self):
     """Run the ``Validator`` checks over this venue's fields.

     Checks city/zipcode, country, coordinates, e-mail, website, all four
     phone fields, the unidentified phone numbers, the Facebook and Yelp
     links, and every entry of ``pricelist_link`` when it is not None.
     Return values of the ``Validator`` calls are not used here.
     """
     Util.log.info("----------------------- Validating Venue: " + self.name)
     Validator.CityZipcode(self.city, self.zipcode)
     Validator.Country(self.country)
     Validator.LatitudeLongitute(self.latitude, self.longitude, self.city,
                                 self.zipcode)
     Validator.Email(self.business_email)
     Validator.Link(self.business_website)
     Validator.PhoneNumber(self.office_number, self.country)
     Validator.PhoneNumber(self.mobile_number, self.country)
     Validator.PhoneNumber(self.office_number2, self.country)
     # Fixed copy-paste slip: office_number2 was validated twice and
     # mobile_number2 was never checked.
     Validator.PhoneNumber(self.mobile_number2, self.country)
     Validator.PhoneNumber(self.unidentified_phone_numbers, self.country)
     Validator.Link(self.facebook_page)
     Validator.Link(self.yelp_page)
     if self.pricelist_link is not None:
         for link in self.pricelist_link:
             Validator.Link(link)
Example #10
0
 def __reValidInfo(self, keepPriceList=False):
     """Pass this venue's fields through the ``Validator`` helpers in place.

     keepPriceList -- when it equals False and ``self.services`` is empty,
                      ``pricelist_link`` is set to None.

     Text fields are replaced by ``Validator.ReValidString`` results and
     phone fields by ``Validator.ReValidPhone`` results.  When the geo-code
     check compares equal to False the event is logged and both coordinates
     are set to None.  Returns nothing.
     """
     for fieldName in ('name', 'scrape_page', 'street', 'city'):
         setattr(self, fieldName,
                 Validator.ReValidString(getattr(self, fieldName)))

     # The zipcode mode is picked from the country before it is cleaned.
     zipMode = 'pl' if self.country == 'pl' else 'phone'
     self.zipcode = Validator.ReValidPhone(self.zipcode, zipMode)
     self.country = Validator.ReValidString(self.country)

     self.getFullAddress()
     for fieldName in ('formatted_address', 'latitude', 'longitude',
                       'business_email', 'business_website'):
         setattr(self, fieldName,
                 Validator.ReValidString(getattr(self, fieldName)))

     for fieldName in ('office_number', 'mobile_number', 'office_number2',
                       'mobile_number2'):
         setattr(self, fieldName,
                 Validator.ReValidPhone(getattr(self, fieldName)))

     # Non-empty list: clean and strip each entry.  Empty list: None.
     extraNumbers = self.unidentified_phone_numbers
     if extraNumbers != None:
         if len(extraNumbers) > 0:
             self.unidentified_phone_numbers = [
                 Validator.ReValidPhone(number).strip()
                 for number in extraNumbers
             ]
         else:
             self.unidentified_phone_numbers = None

     for fieldName in ('opening_hours_raw', 'facebook_page', 'yelp_page',
                       'description', 'pass_rate'):
         setattr(self, fieldName,
                 Validator.ReValidString(getattr(self, fieldName)))

     geoCheck = Validator.ValidateGeoCode(self.formatted_address,
                                          self.country, self.latitude,
                                          self.longitude)
     if geoCheck == False:
         detail = (self.name + ': invalid GEO code (' + self.latitude + ',' +
                   self.longitude + ')')
         Util.log.invalid('GEO code', detail)
         self.latitude = None
         self.longitude = None

     if len(self.services) <= 0 and keepPriceList == False:
         self.pricelist_link = None
Example #11
0
def hml_parser_handler(event, context):
    """AWS Lambda entry point: validate the GL strings of an uploaded HML file.

    The S3 bucket and key are read from the JSON found at
    ``event['Records'][0]['Sns']['Message']``.  The object is fetched, its
    upload record is looked up through ``IhiwRestAccess``, and when the
    upload type is ``'HML'`` every non-trivial ``glstring`` element text is
    validated and the result is stored with
    ``IhiwRestAccess.setValidationStatus``.

    Args:
        event: Lambda event carrying the SNS record described above.
        context: Lambda context object; not used.

    Returns:
        None on the normal paths (including a missing upload record or a
        non-HML file); ``str(e)`` when an exception was caught.
    """
    print('I found the schema validation handler.')
    # This is the AWS Lambda handler function.
    xmlKey = None
    try:
        # Sleep 1 second, enough time to make sure the file is available.
        sleep(1)
        print('This is the event:' + str(event)[0:50])

        snsMessage = json.loads(event['Records'][0]['Sns']['Message'])
        s3Record = snsMessage['Records'][0]['s3']

        bucket = s3Record['bucket']['name']
        xmlKey = urllib.parse.unquote_plus(s3Record['object']['key'], encoding='utf-8')
        # TODO: Rather than read text, I can probably use the pyHML Parser from the aws object.
        xmlText = s3.get_object(Bucket=bucket, Key=xmlKey)["Body"].read()

        # Determine file extension; splitext yields an empty extension when
        # the key has none.
        fileExtension = os.path.splitext(str(xmlKey).upper())[1].replace('.', '')
        print('This file has the extension:' + fileExtension)

        # Get access stuff from the REST Endpoints
        url = IhiwRestAccess.getUrl()
        token = IhiwRestAccess.getToken(url=url)

        hmlUploadObject = IhiwRestAccess.getUploadByFilename(token=token, url=url, fileName=xmlKey)
        if (hmlUploadObject is None
                or 'type' not in hmlUploadObject.keys()
                or hmlUploadObject['type'] is None):
            print('Could not find the Upload object for upload ' + str(xmlKey) + '\nI will not continue.' )
            return None

        fileType = hmlUploadObject['type']
        if fileType != 'HML':
            print('This is not an HML file (file type=' + str(fileType) + ') I will not parse it.')
            return None

        print('This is an HML file, I will parse and validate it.')

        # Walk every element and collect the text of the HML glstring nodes.
        glStringTag = '{http://schemas.nmdp.org/spec/hml/1.0.1}glstring'
        glStrings = []
        documentRoot = ElementTree.fromstring(xmlText)
        for node in documentRoot.iter("*"):
            if str(node.tag) != str(glStringTag):
                print('Not glStringNode: ' + str(node))
                continue

            print('glStringNode:' + str(node))
            nodeText = node.text
            # Only texts longer than one character after stripping are kept.
            if nodeText is not None and len(nodeText.strip()) > 1:
                glStrings.append(nodeText.strip())
                print('added glString:' + str(nodeText.strip()))

        isGlStringsValid, glStringValidationFeedback = Validation.validateGlStrings(glStrings=glStrings)
        IhiwRestAccess.setValidationStatus(
            uploadFileName=xmlKey,
            isValid=isGlStringsValid,
            validationFeedback=glStringValidationFeedback,
            url=url,
            token=token,
            validatorType='GLSTRING')

    except Exception as e:
        print('Exception:\n' + str(e) + '\n' + str(exc_info()))
        if xmlKey is None:
            print('!!!!Failed setting the upload status because I could not identify the name of the xml file.')
        else:
            # The key is known, so record the failure against the upload.
            url = IhiwRestAccess.getUrl()
            token = IhiwRestAccess.getToken(url=url)
            validationStatus = 'Exception Parsing HML file:' + str(e)
            print('I will try to set the status.')
            IhiwRestAccess.setValidationStatus(
                uploadFileName=xmlKey,
                isValid=False,
                validationFeedback=validationStatus,
                url=url,
                token=token,
                validatorType='GLSTRING')
        return str(e)