def testHmlParser(xmlFileName=None, outputDirectory=None, alignSequences=False):
    """Parse one HML file, print what was found and run the validators on it.

    Args:
        xmlFileName: Path of the HML/XML file to read.
        outputDirectory: Directory handed to ``hmlObject.tobiotype`` and to
            ``ParseXml.extrapolateConsensusFromVariants``.
        alignSequences: Forwarded unchanged to
            ``ParseXml.extrapolateConsensusFromVariants``.

    Returns:
        None. Every result is reported with ``print``.
    """
    print('Testing the HML Parser with filename:' + str(xmlFileName))

    # Read through a context manager so the handle is closed immediately
    # instead of being left open until garbage collection.
    with open(xmlFileName, 'r') as xmlFile:
        xmlText = xmlFile.read()
    #print('xmlText:\n' + str(xmlText))

    hmlObject = ParseXml.parseXmlFromText(xmlText=xmlText)
    sampleIds = ParseXml.getSampleIDs(hml=hmlObject)
    hmlId = ParseXml.getHmlid(xmlText=xmlText)
    glStrings = ParseXml.getGlStrings(hml=hmlObject)
    print('I found this HMLID:' + str(hmlId))
    print('I found these SampleIDs:' + str(sampleIds))
    print('I found this glStrings:' + str(glStrings))

    glStringValidity, glStringValidationFeedback = Validation.validateGlStrings(
        glStrings=glStrings)
    print('glstringValidity:' + str(glStringValidity))
    print('glStringValidationFeedback:' + str(glStringValidationFeedback))

    # Write some data from the HML to file (These are named based on sample ID)
    hmlObject.tobiotype(outputDirectory, dtype='fasta', by='subject')

    xmlDirectory = join(getcwd(), 'XmlValidator/xml')
    isValid, validationResults = ParseXml.extrapolateConsensusFromVariants(
        hml=hmlObject,
        outputDirectory=outputDirectory,
        xmlDirectory=xmlDirectory,
        alignSequences=alignSequences)
    print('IsValid:' + str(isValid))
    print('validationResults:' + str(validationResults))
def writeToFile(self, folder, index, spec=False, keepPriceList=False):
    """Serialise this object with ``toJSON`` and write it to a ``.json`` file.

    The file is created in ``folder`` and named
    ``<index zero-padded to 6 digits>_<Validator.RevalidName(self.name)>.json``.

    Args:
        folder: Directory part of the output path (joined with "/").
        index: Integer used as the zero-padded file name prefix.
        spec: When equal to ``True`` the file is written through
            ``codecs.open``; otherwise through ``io.open``.
        keepPriceList: Forwarded unchanged to ``self.toJSON``.

    Raises:
        BaseException: Whatever is raised while building the path or writing
            is printed and then re-raised unchanged.
    """
    try:
        prefix = '%06d' % (index,)
        outputFile = (folder + "/" + prefix + "_" +
                      Validator.RevalidName(self.name) + ".json")
        if spec == True:
            with codecs.open(outputFile, "w", encoding='utf-8') as handle:
                payload = unicode(self.toJSON(keepPriceList)).encode("utf8")
                handle.write(payload)
        else:
            with io.open(outputFile, 'w', encoding='utf-8') as handle:
                payload = self.toJSON(keepPriceList).decode('unicode-escape')
                handle.write(payload)
    except BaseException as ex:
        # Report the failure, then let the caller see the original exception.
        print('Error when write json file: ', ex)
        raise
def writeToFile(self, folder, index, filename, spec=False):
    """Write ``self.toJSON()`` to ``<folder>/<index>_<filename>.json``.

    Args:
        folder: Directory part of the output path (joined with "/").
        index: Integer, zero-padded to 7 digits and used as file name prefix.
        filename: Base name of the file; it is passed through
            ``Validator.RevalidName`` and shortened with ``Util.SubString``
            when it is longer than ``260 - len(folder) - 12`` characters.
        spec: When equal to ``True`` the file is written through
            ``codecs.open``; otherwise through ``io.open``.

    Raises:
        BaseException: Whatever is raised inside the body is printed and then
            re-raised unchanged.
    """
    try:
        # NOTE(review): the original indentation was lost. This layout assumes
        # only the RevalidName call is guarded, in which case a None filename
        # fails at len(filename) below - confirm against the original file.
        if filename != None:
            filename = Validator.RevalidName(filename)
        index = '%0*d' % (7, index)
        # Presumably keeps the complete path below a 260 character limit
        # (12 characters reserved for prefix/extension) - TODO confirm.
        if len(filename) > 260 - len(folder) - 12:
            filename = Util.SubString(filename, 260 - len(folder) - 12)
        outputFile = folder + "/" + index + "_" + filename + ".json"
        jObject = self.toJSON()
        if spec == True:
            with codecs.open(outputFile, "w", encoding='utf-8') as f:
                f.write(unicode(jObject).encode("utf8"))
        else:
            with io.open(outputFile, 'w', encoding='utf-8') as f:
                f.write(jObject.decode('unicode-escape'))
    except BaseException as ex:
        # Report the failure, then let the caller see the original exception.
        print('Error when write json file: ', ex)
        raise
def doWork(self):
    """Scrape every venue of this chain and write the rows to a CSV file.

    Steps, in order: build ``self.outFile``, load the phone code list, collect
    the city list, de-duplicate it, collect the venue list, parse each venue
    and hand the parsed rows to ``Util.writelist2File``.

    A venue whose parsing raises is reported (printed and logged through
    ``Util.log.error``) and skipped; the remaining venues are still processed.
    Nothing is written when the city list or the venue list is empty.
    """
    self.outFile = (self.folder + '/' + self._chain_ + '_' +
                    Validation.RevalidName(self.__name__) + '_Venues.csv')
    self.phoneCodeList = Util.getPhoneCodeList()
    self.__getListCities()
    # NOTE(review): the original indentation was lost; this layout assumes the
    # venue collection only runs when at least one city was found - confirm.
    if len(self.listCities) > 0:
        self.listCities = list(set(self.listCities))
        self.__getListVenues()
        if len(self._lstVenues) > 0:
            listWrite2File = []
            for venue in self._lstVenues:
                try:
                    ven = self.__VenueParser(venue)
                    if ven != None:
                        listWrite2File.append(ven.toOrderDict(False))
                except Exception as ex:
                    # ``message`` is missing on Python 3 and may be empty or a
                    # non-string on Python 2; %-formatting copes with all of
                    # these, so the handler itself cannot abort the loop with
                    # a TypeError or AttributeError.
                    detail = getattr(ex, 'message', '') or ex
                    report = 'URL: %s: %s' % (venue.scrape_page, detail)
                    print(report)
                    Util.log.error(report)
            Util.writelist2File(listWrite2File, self.outFile)
def __reValidInfo(self):
    """Normalise the scraped fields of this record in place.

    Text fields go through ``Validator.ReValidString``, URLs through
    ``Validator.RevalidURL``, phone-like fields through
    ``Validator.ReValidPhone`` and the e-mail through
    ``Validator.RevalidEmail``. The list fields ``img_link``,
    ``hqdb_featured_ad_type`` and ``unidentified_phone_numbers`` are turned
    into a single ``'[a, b, ...]'`` string, or into ``None`` when empty.

    When ``is_get_by_address`` equals ``False`` and
    ``Validator.ValidateGeoCode`` rejects the coordinates, the rejection is
    logged and ``latitude``/``longitude`` are reset to ``None``.
    """
    self.adid = Validator.ReValidString(self.adid)
    self.name = Validator.ReValidString(self.name)
    self.name_of_contact = Validator.ReValidString(self.name_of_contact)
    self.business_website = Validator.RevalidURL(self.business_website)
    self.areas_covered = Validator.ReValidString(self.areas_covered)
    self.formatted_address = Validator.ReValidString(self.formatted_address)
    self.description = Validator.ReValidString(self.description)

    if self.img_link != None and len(self.img_link) > 0:
        cleanedLinks = [Validator.RevalidURL(x) for x in self.img_link]
        self.img_link = '[' + ', '.join(cleanedLinks) + ']'
    elif self.img_link != None and len(self.img_link) == 0:
        self.img_link = None

    if (self.hqdb_featured_ad_type != None
            and len(self.hqdb_featured_ad_type) > 0):
        quotedTypes = [
            '"' + Validator.ReValidString(x.lower()) + '"'
            for x in self.hqdb_featured_ad_type
        ]
        self.hqdb_featured_ad_type = '[' + ', '.join(quotedTypes) + ']'
    elif (self.hqdb_featured_ad_type != None
            and len(self.hqdb_featured_ad_type) == 0):
        self.hqdb_featured_ad_type = None

    self.hqdb_nr_reviews = Validator.ReValidString(self.hqdb_nr_reviews)
    self.hqdb_review_score = Validator.ReValidString(self.hqdb_review_score)
    self.accreditations = Validator.ReValidString(self.accreditations)
    self.scrape_page = Validator.RevalidURL(self.scrape_page)
    self.category = Validator.ReValidString(self.category)
    self.subcategory = Validator.ReValidString(self.subcategory)
    self.street = Validator.ReValidString(self.street)
    self.city = Validator.ReValidString(self.city)

    # The zipcode is cleaned by the phone cleaner; 'pl' selects a different
    # mode for records whose country is 'pl'. The country is compared before
    # it is cleaned itself, as in the original statement order.
    zipcodeMode = 'pl' if self.country == 'pl' else 'phone'
    self.zipcode = Validator.ReValidPhone(self.zipcode, zipcodeMode)
    self.country = Validator.ReValidString(self.country)

    self.office_number = Validator.ReValidPhone(self.office_number)
    self.mobile_number = Validator.ReValidPhone(self.mobile_number)
    self.office_number2 = Validator.ReValidPhone(self.office_number2)
    self.mobile_number2 = Validator.ReValidPhone(self.mobile_number2)

    if (self.unidentified_phone_numbers != None
            and len(self.unidentified_phone_numbers) > 0):
        cleanedPhones = [
            Validator.ReValidPhone(x).strip()
            for x in self.unidentified_phone_numbers
        ]
        self.unidentified_phone_numbers = '[' + ", ".join(cleanedPhones) + ']'
    elif (self.unidentified_phone_numbers != None
            and len(self.unidentified_phone_numbers) == 0):
        self.unidentified_phone_numbers = None

    self.latitude = Validator.ReValidString(self.latitude)
    self.longitude = Validator.ReValidString(self.longitude)
    self.business_email = Validator.RevalidEmail(
        Validator.ReValidString(self.business_email))
    self.yelp_page = Validator.RevalidURL(self.yelp_page)
    #self.facebook = Validator.RevalidURL(self.facebook)
    self.twitter = Validator.RevalidURL(self.twitter)
    #self.instagram = Validator.RevalidURL(self.instagram)
    self.venue_images = Validator.ReValidString(self.venue_images)

    if self.is_get_by_address == False:
        if Validator.ValidateGeoCode(self.formatted_address, self.country,
                                     self.latitude, self.longitude) == False:
            # %-formatting instead of "+" so that a None latitude, longitude
            # or scrape_page cannot raise TypeError while logging; the text
            # is unchanged whenever all three are strings.
            Util.log.invalid(
                'GEO code',
                '%s: invalid GEO code (%s,%s)' % (
                    self.scrape_page, self.latitude, self.longitude))
            self.latitude = None
            self.longitude = None

    self.opening_hours_raw = Validator.ReValidString(self.opening_hours_raw)
def __reValidInfo(self):
    """Clean the text fields of this service entry in place.

    ``service_category``, ``service``, ``price`` and ``description`` are each
    replaced, in that order, by the result of ``Validator.ReValidString``.
    """
    for fieldName in ('service_category', 'service', 'price', 'description'):
        cleaned = Validator.ReValidString(getattr(self, fieldName))
        setattr(self, fieldName, cleaned)
def __reValidInfoJSON(self):
    """Clean and validate the fields of this record for JSON output.

    Fields are normalised in place. Values rejected by the ``Validator``
    checks (website, image links, e-mail, yelp/twitter pages, venue image,
    coordinates, phone numbers) are logged where the code does so and then
    dropped or set to ``None``.

    Returns:
        ``False`` when ``img_link`` or ``pricelist_link`` is set but is not a
        list; ``True`` otherwise. On the ``False`` paths the fields that come
        later in the method are left untouched.
    """
    # Spaces are percent-encoded before any URL check.
    self.scrape_page = self.scrape_page.replace(' ', '%20')
    self.adid = Validator.ReValidString(self.adid)
    self.name = Validator.ReValidString(self.name)
    self.name_of_contact = Validator.ReValidString(self.name_of_contact)
    if self.business_website != None:
        self.business_website = self.business_website.replace(' ', '%20')
        # Here RevalidURL is used as a check: a False result drops the value.
        if Validator.RevalidURL(self.business_website) == False:
            Util.log.running_logger.error('{0}: {1}: {2}'.format(
                self.scrape_page, 'Invalid URL', self.business_website))
            self.business_website = None
    self.areas_covered = Validator.ReValidString(self.areas_covered)
    self.formatted_address = Validator.ReValidString(
        self.formatted_address)
    self.description = Validator.ReValidString(self.description)
    if self.img_link != None and isinstance(self.img_link, list) == True:
        # Keep only the image links that pass the URL check.
        img_temp = []
        for img in self.img_link:
            img = img.replace(' ', '%20')
            if Validator.RevalidURL(img) == False:
                Util.log.running_logger.error('{0}: {1}: {2}'.format(
                    self.scrape_page, 'Invalid Img_Link', img))
            else:
                img_temp.append(img)
        self.img_link = img_temp
    elif self.img_link != None:
        # A non-list img_link aborts the whole clean-up.
        print 'img_link: is not a list'
        return False
    self.hqdb_featured_ad_type = Validator.ReValidString(
        self.hqdb_featured_ad_type)
    self.hqdb_nr_reviews = Validator.ReValidString(self.hqdb_nr_reviews)
    self.hqdb_review_score = Validator.ReValidString(
        self.hqdb_review_score)
    self.hqdb_ad_posted = Validator.ReValidString(self.hqdb_ad_posted)
    self.accreditations = Validator.ReValidString(self.accreditations)
    self.category = Validator.ReValidString(self.category)
    self.subcategory = Validator.ReValidString(self.subcategory)
    self.street = Validator.ReValidString(self.street)
    self.city = Validator.ReValidString(self.city)
    # The zipcode goes through the phone cleaner; 'pl' selects a different
    # mode for records whose country is 'pl'.
    type = 'phone'
    if self.country == 'pl':
        type = 'pl'
    self.zipcode = Validator.ReValidPhone(self.zipcode, type)
    self.country = Validator.ReValidString(self.country)
    # NOTE(review): getFullAddress presumably rebuilds formatted_address from
    # the parts cleaned above, which is why it is cleaned a second time -
    # confirm against its definition.
    self.getFullAddress()
    self.formatted_address = Validator.ReValidString(
        self.formatted_address)
    if self.formatted_address != None:
        self.formatted_address = self.formatted_address.replace(',,', ',')
    # Each phone field is first cleaned, then validated against the country.
    self.office_number = Validator.ReValidPhone(self.office_number)
    self.office_number = Validator.ValidPhone(self.office_number,
                                              self.country, self.scrape_page)
    self.mobile_number = Validator.ReValidPhone(self.mobile_number)
    self.mobile_number = Validator.ValidPhone(self.mobile_number,
                                              self.country, self.scrape_page)
    self.office_number2 = Validator.ReValidPhone(self.office_number2)
    self.office_number2 = Validator.ValidPhone(self.office_number2,
                                               self.country, self.scrape_page)
    # Promote the secondary number when the primary slot ended up empty.
    if self.office_number2 != None and self.office_number == None:
        self.office_number = self.office_number2
        self.office_number2 = None
    self.mobile_number2 = Validator.ReValidPhone(self.mobile_number2)
    self.mobile_number2 = Validator.ValidPhone(self.mobile_number2,
                                               self.country, self.scrape_page)
    if self.mobile_number2 != None and self.mobile_number == None:
        self.mobile_number = self.mobile_number2
        self.mobile_number2 = None
    if self.unidentified_phone_numbers != None and len(
            self.unidentified_phone_numbers) > 0:
        # Clean, validate, then drop the entries for which ValidPhone
        # returned None.
        self.unidentified_phone_numbers = [
            Validator.ReValidPhone(x).strip()
            for x in self.unidentified_phone_numbers
        ]
        self.unidentified_phone_numbers = [
            Validator.ValidPhone(x, self.country, self.scrape_page)
            for x in self.unidentified_phone_numbers
        ]
        self.unidentified_phone_numbers = [
            x for x in self.unidentified_phone_numbers if x != None
        ]
    elif self.unidentified_phone_numbers != None and len(
            self.unidentified_phone_numbers) == 0:
        self.unidentified_phone_numbers = None
    self.latitude = Validator.ReValidString(self.latitude)
    self.longitude = Validator.ReValidString(self.longitude)
    if self.business_email != None:
        if Validator.RevalidEmail(self.business_email) == False:
            Util.log.running_logger.error('{0}: {1}: {2}'.format(
                self.scrape_page, 'Invalid Email', self.business_email))
            self.business_email = None
    # Invalid yelp/twitter pages are dropped without a log entry.
    if self.yelp_page != None and Validator.RevalidURL(
            self.yelp_page) == False:
        self.yelp_page = None
    #if self.facebook != None and Validator.RevalidURL(self.facebook) == False:
    #    self.facebook = None
    if self.twitter != None and Validator.RevalidURL(
            self.twitter) == False:
        self.twitter = None
    #if self.instagram != None and Validator.RevalidURL(self.instagram) == False:
    #    self.instagram = None
    self.venue_images = Validator.ReValidString(self.venue_images)
    if self.venue_images != None:
        self.venue_images = self.venue_images.replace(' ', '%20')
        if Validator.RevalidURL(self.venue_images) == False:
            Util.log.running_logger.error('{0}: {1}: {2}'.format(
                self.scrape_page, "Invalid Venue_Image: ", self.venue_images))
            self.venue_images = None
    # Coordinates are only checked when is_get_by_address equals False.
    if self.is_get_by_address == False:
        if Validator.ValidateGeoCode(self.formatted_address, self.country,
                                     self.latitude, self.longitude,
                                     self.scrape_page) == False:
            self.latitude = None
            self.longitude = None
    self.opening_hours_raw = Validator.ReValidString(
        self.opening_hours_raw)
    if self.pricelist_link != None and isinstance(self.pricelist_link,
                                                  list) == False:
        Util.log.running_logger.error(
            '[PriceListLink]: {0} is not List'.format(self.pricelist_link))
        return False
    elif self.pricelist_link != None and len(self.pricelist_link) == 0:
        self.pricelist_link = None
    elif self.pricelist_link != None:
        self.pricelist_link = [
            x.replace(' ', '%20') for x in self.pricelist_link
        ]
    return True
def __VenueParser(self,ven):
    """Fill ``ven`` with the details scraped from its ``scrape_page``.

    The page is fetched with ``Util.getRequestsXML`` using one combined XPath.
    From the returned tree the method reads the website, category, formatted
    address, image links, coordinates, contact name, posting date and
    description, and finally a phone number obtained through
    ``self.__GetPhone``.

    Args:
        ven: Venue object; ``scrape_page`` and ``adid`` are read from it and
            the scraped values are written back onto it.

    Returns:
        ``ven`` after it has been filled in.
        NOTE(review): the original indentation was lost; this layout assumes
        the ``return`` sits inside the ``xmlVenue != None`` branch, so ``None``
        is returned when the page could not be fetched - confirm.
    """
    url = ven.scrape_page
    print 'Scrapping: ' + url.encode('utf8').encode('string-escape')
    # One union XPath selecting every page fragment that is read below.
    xpathVenue = '//header[@class="clearfix space-mbs"]|//div[@id="vip-tabs-images"]|//div[@id="vip-tabs-map"]|//section[@itemtype="http://schema.org/Person" and @data-sticky-header-target="reply.box.2"]|//p[@class="ad-description" and @itemprop="description"]|//ul[@class="inline-list-slash media-body"]|//script[contains(text(),"revealSellerTelephoneNumberToken")]'
    xmlVenue = Util.getRequestsXML(url,xpathVenue)
    if xmlVenue != None:
        # Website: prefer the link target, fall back to the link text.
        website = xmlVenue.xpath('.//a[@class="truncate-line"]')
        if len(website) > 0:
            website = website[0]
            if website.get('href') != None:
                ven.website = website.get('href')
            elif website.text != None:
                ven.website = website.text.replace('\r','').replace('\n','').strip()
        # Category: last non-empty text fragment found under the <ul>.
        ul = xmlVenue.find('./ul')
        if ul != None and len(ul) > 0:
            ul = ul.xpath('.//text()')
            ul = [x.replace('\r','').replace('\n','').strip() for x in ul if x.replace('\r','').replace('\n','').strip() != '']
            ven.category = ul[-1]
        # Address: all text inside the header's <strong> element.
        header = xmlVenue.find('./header')
        if header != None and header.find('./strong') != None:
            ven.formatted_address = "".join(header.find('./strong').itertext())
        # Images: use 'src' when non-blank; 'data-lazy' only when 'src' is
        # missing altogether.
        listImage = xmlVenue.find('./div[@id="vip-tabs-images"]')
        if listImage != None:
            listImage = listImage.xpath('.//img')
            if len(listImage) > 0:
                ven.img_link = []
                for img in listImage:
                    if img.get('src') != None and img.get('src').strip() != '':
                        ven.img_link.append(img.get('src'))
                    elif img.get('src') == None and img.get('data-lazy') != None and img.get('data-lazy').strip() != '':
                        ven.img_link.append(img.get('data-lazy'))
        # Coordinates come from the 'data-googlemap' attribute: latitude is
        # the text between 'latitude:' and the last comma, longitude the text
        # after the last colon.
        map = xmlVenue.find('./div[@id="vip-tabs-map"]')
        if map != None and map.find('./div[@class="googlemap"]') != None and map.find('./div[@class="googlemap"]').get('data-googlemap') != None:
            map = map.find('./div[@class="googlemap"]').get('data-googlemap')
            lat = map[map.find('latitude:') + len('latitude:'):map.rfind(',')]
            lng = map[map.rfind(':') + 1:]
            try:
                # The float() calls only check that both strings are numeric;
                # the original strings are what gets stored.
                float1 = float(lat)
                float2 = float(lng)
                ven.latitude = lat
                ven.longitude = lng
            except:
                # Any failure is ignored and the coordinates stay unset.
                ''
        # Contact block: name from <h2>, posting date text from <span>.
        section = xmlVenue.find('./section')
        if section != None:
            section = section.find('.//div[@class="media space-man h-underline-s"]/div[@class="media-body"]')
            if section != None:
                h2 = section.find('./h2')
                if h2 != None and h2.text != None:
                    ven.name_of_contact = Validation.ReValidString(h2.text.replace('/','-'))
                span = section.find('./span')
                if span != None and span.text != None:
                    ven.hqdb_ad_posted = span.text
        # Description: non-empty text fragments joined with " | ".
        p = xmlVenue.find('./p')
        if p != None:
            p = p.xpath('.//text()')
            p = [x.replace('\r','').replace('\n','').strip() for x in p if x.replace('\r','').replace('\n','').strip() != '']
            ven.description = " | ".join(p)
        # Phone: cut the reveal token out of the inline script (the text
        # between 'revealSellerTelephoneNumberToken": "' and the next quote)
        # and pass it to __GetPhone together with the ad id.
        script = xmlVenue.find('./script')
        if script != None and script.text != None:
            script = script.text
            script = script[script.find('revealSellerTelephoneNumberToken": "') + len('revealSellerTelephoneNumberToken": "'):]
            script = script[0:script.find('"')]
            phoneJson = self.__GetPhone(ven.adid,script)
            if phoneJson != None and phoneJson.get('data') != None:
                phone = phoneJson.get('data')
                phone = Util.removesingleSpace(phone)
                # Rewrite the 0044 prefix as +44.
                if phone.startswith('0044'):
                    phone = '+44' + phone[4:]
                # With +44 replaced by 0, numbers starting with 06 or 07 are
                # stored as mobile, all others as office. An already set
                # number is never overwritten.
                # NOTE(review): the nesting of this if/else chain was
                # reconstructed from a flattened line - confirm.
                if phone.replace('+44','0').startswith('06') or phone.replace('+44','0').startswith('07'):
                    if ven.mobile_number == None:
                        ven.mobile_number = phone
                else:
                    if ven.office_number == None:
                        ven.office_number = phone
                    # Office numbers starting with '00000' are blanked.
                    if ven.office_number.startswith('00000'):
                        ven.office_number = ''
        return ven
def validate(self):
    """Run every ``Validator`` check on this venue.

    The checks cover city/zipcode, country, coordinates, e-mail, website,
    the four dedicated phone fields, the unidentified phone numbers, the
    facebook and yelp pages and every entry of ``pricelist_link``. Nothing is
    returned by this method; the return values of the ``Validator`` calls are
    not used here.
    """
    Util.log.info("----------------------- Validating Venue: " + self.name)
    Validator.CityZipcode(self.city, self.zipcode)
    Validator.Country(self.country)
    Validator.LatitudeLongitute(self.latitude, self.longitude, self.city,
                                self.zipcode)
    Validator.Email(self.business_email)
    Validator.Link(self.business_website)

    # Each of the four dedicated phone fields is checked exactly once.
    Validator.PhoneNumber(self.office_number, self.country)
    Validator.PhoneNumber(self.mobile_number, self.country)
    Validator.PhoneNumber(self.office_number2, self.country)
    Validator.PhoneNumber(self.mobile_number2, self.country)
    Validator.PhoneNumber(self.unidentified_phone_numbers, self.country)

    Validator.Link(self.facebook_page)
    Validator.Link(self.yelp_page)
    if self.pricelist_link != None:
        for link in self.pricelist_link:
            Validator.Link(link)
def __reValidInfo(self, keepPriceList=False):
    """Normalise the fields of this venue in place.

    Text fields go through ``Validator.ReValidString`` and phone-like fields
    (including the zipcode) through ``Validator.ReValidPhone``. Coordinates
    rejected by ``Validator.ValidateGeoCode`` are logged and reset to
    ``None``.

    Args:
        keepPriceList: When equal to ``False`` and ``self.services`` is empty,
            ``pricelist_link`` is reset to ``None``.
    """
    self.name = Validator.ReValidString(self.name)
    self.scrape_page = Validator.ReValidString(self.scrape_page)
    self.street = Validator.ReValidString(self.street)
    self.city = Validator.ReValidString(self.city)

    # The zipcode is cleaned by the phone cleaner; 'pl' selects a different
    # mode for venues whose country is 'pl'. The country is compared before
    # it is cleaned itself, as in the original statement order.
    zipcodeMode = 'pl' if self.country == 'pl' else 'phone'
    self.zipcode = Validator.ReValidPhone(self.zipcode, zipcodeMode)
    self.country = Validator.ReValidString(self.country)

    self.getFullAddress()
    self.formatted_address = Validator.ReValidString(self.formatted_address)
    self.latitude = Validator.ReValidString(self.latitude)
    self.longitude = Validator.ReValidString(self.longitude)
    self.business_email = Validator.ReValidString(self.business_email)
    self.business_website = Validator.ReValidString(self.business_website)

    self.office_number = Validator.ReValidPhone(self.office_number)
    self.mobile_number = Validator.ReValidPhone(self.mobile_number)
    self.office_number2 = Validator.ReValidPhone(self.office_number2)
    self.mobile_number2 = Validator.ReValidPhone(self.mobile_number2)
    if (self.unidentified_phone_numbers != None
            and len(self.unidentified_phone_numbers) > 0):
        self.unidentified_phone_numbers = [
            Validator.ReValidPhone(x).strip()
            for x in self.unidentified_phone_numbers
        ]
    elif (self.unidentified_phone_numbers != None
            and len(self.unidentified_phone_numbers) == 0):
        self.unidentified_phone_numbers = None

    self.opening_hours_raw = Validator.ReValidString(self.opening_hours_raw)
    self.facebook_page = Validator.ReValidString(self.facebook_page)
    self.yelp_page = Validator.ReValidString(self.yelp_page)
    self.description = Validator.ReValidString(self.description)
    self.pass_rate = Validator.ReValidString(self.pass_rate)

    if Validator.ValidateGeoCode(self.formatted_address, self.country,
                                 self.latitude, self.longitude) == False:
        # %-formatting instead of "+" so that a None name, latitude or
        # longitude cannot raise TypeError while logging; the text is
        # unchanged whenever all three are strings.
        Util.log.invalid(
            'GEO code',
            '%s: invalid GEO code (%s,%s)' % (
                self.name, self.latitude, self.longitude))
        self.latitude = None
        self.longitude = None

    if len(self.services) <= 0 and keepPriceList == False:
        self.pricelist_link = None
def hml_parser_handler(event, context):
    """AWS Lambda handler: validate the GL strings of an uploaded HML file.

    The S3 bucket and key are taken from the SNS message inside ``event``.
    The object is downloaded, its upload record is looked up through
    ``IhiwRestAccess`` and, when the record's type is ``'HML'``, every
    ``glstring`` element is collected, validated and the outcome is stored
    with ``IhiwRestAccess.setValidationStatus``.

    Args:
        event: Lambda event carrying the SNS record.
        context: Lambda context object; not used.

    Returns:
        ``None`` when processing finished or no typed upload record was
        found; ``str(e)`` when an exception ``e`` was caught.
    """
    print('I found the schema validation handler.')
    # This is the AWS Lambda handler function.
    xmlKey = None
    try:
        # Sleep 1 second, enough time to make sure the file is available.
        sleep(1)
        print('This is the event:' + str(event)[0:50])

        content = json.loads(event['Records'][0]['Sns']['Message'])
        bucket = content['Records'][0]['s3']['bucket']['name']
        rawKey = content['Records'][0]['s3']['object']['key']
        xmlKey = urllib.parse.unquote_plus(rawKey, encoding='utf-8')
        xmlFileObject = s3.get_object(Bucket=bucket, Key=xmlKey)
        # TODO: Rather than read text, I can probably use the pyHML Parser from the aws object.
        xmlText = xmlFileObject["Body"].read()

        # Determine file extension (empty when the key has none).
        extensionWithDot = os.path.splitext(str(xmlKey).upper())[1]
        fileExtension = extensionWithDot.replace('.', '')
        print('This file has the extension:' + fileExtension)

        # Get access stuff from the REST Endpoints
        url = IhiwRestAccess.getUrl()
        token = IhiwRestAccess.getToken(url=url)
        hmlUploadObject = IhiwRestAccess.getUploadByFilename(
            token=token, url=url, fileName=xmlKey)

        hasUploadType = (hmlUploadObject is not None
                         and 'type' in hmlUploadObject.keys()
                         and hmlUploadObject['type'] is not None)
        if not hasUploadType:
            print('Could not find the Upload object for upload ' + str(xmlKey)
                  + '\nI will not continue.')
            return None

        fileType = hmlUploadObject['type']
        if fileType != 'HML':
            print('This is not an HML file (file type=' + str(fileType)
                  + ') I will not parse it.')
        else:
            print('This is an HML file, I will parse and validate it.')
            glStringTag = '{http://schemas.nmdp.org/spec/hml/1.0.1}glstring'
            glStrings = []
            documentRoot = ElementTree.fromstring(xmlText)
            # Walk every element; only glstring elements contribute.
            for node in documentRoot.iter("*"):
                if str(node.tag) != glStringTag:
                    print('Not glStringNode: ' + str(node))
                    continue
                print('glStringNode:' + str(node))
                nodeText = node.text
                # Text of one character or less is skipped.
                if nodeText is not None and len(nodeText.strip()) > 1:
                    glStrings.append(nodeText.strip())
                    print('added glString:' + str(nodeText.strip()))

            isGlStringsValid, glStringValidationFeedback = \
                Validation.validateGlStrings(glStrings=glStrings)
            IhiwRestAccess.setValidationStatus(
                uploadFileName=xmlKey,
                isValid=isGlStringsValid,
                validationFeedback=glStringValidationFeedback,
                url=url,
                token=token,
                validatorType='GLSTRING')

    except Exception as e:
        print('Exception:\n' + str(e) + '\n' + str(exc_info()))
        if xmlKey is None:
            print('!!!!Failed setting the upload status because I could not identify the name of the xml file.')
        else:
            # The file name is known, so record the failure on the upload.
            url = IhiwRestAccess.getUrl()
            token = IhiwRestAccess.getToken(url=url)
            validationStatus = 'Exception Parsing HML file:' + str(e)
            print('I will try to set the status.')
            IhiwRestAccess.setValidationStatus(
                uploadFileName=xmlKey,
                isValid=False,
                validationFeedback=validationStatus,
                url=url,
                token=token,
                validatorType='GLSTRING')
        return str(e)