def check_function_def_above_main(self, code):
    """Flag a function *definition* that appears above main().

    Adds a DEFINITION_ABOVE_MAIN error (carrying the offending function's
    name) when `code` contains a function definition -- not a mere
    prototype -- while the checker is still above main() in the file.
    """
    prototype = check_if_function_prototype(code)
    function = check_if_function(code)

    # Once the line containing "int main" is reached there is nothing
    # above main left to flag.
    inside = Literal("int main")
    if len(inside.searchString(code)):
        return
    elif function and not prototype and self.outside_main:
        # Capture "<return-type> <name>" so the error can name the function.
        # Raw string: \s and \w are regex escapes, not string escapes.
        function_regex = re.compile(r"^\s*(\w+)\s+(\w+)")
        match = function_regex.search(code)
        function_name = match.group(2) if match else "NOT_FOUND"
        self.add_error(label="DEFINITION_ABOVE_MAIN",
                       data={'function': function_name})
def check_main_prefix(self, code):
    """Check that a definition of main() uses an accepted parameter list.

    Return value for main is optional in C++11, so only the part from
    "main(" onward is validated here.
    """
    # Only lines that actually open a main definition ("main(...){") are checked.
    parser = Literal("main") + Literal("(") + SkipTo(Literal(")")) + Literal(")") + Literal("{")
    if len(parser.searchString(code)):
        main_prefix = Literal("main") + Literal("(")
        full_use = "int" + Word(alphanums) + "," + "char*" + Word(alphanums) + "[" + "]" + ")"
        # 3 options for main() syntax: main(), main(void), main(int argc, char* argv[])
        if not len((main_prefix + Literal(")")).searchString(code)) and \
                not len((main_prefix + Literal("void") + Literal(")")).searchString(code)) and \
                not len((main_prefix + full_use).searchString(code)):
            # Keyword form for consistency with every other add_error call
            # in this checker.
            self.add_error(label="MAIN_SYNTAX")
def check_main_syntax(self, code):
    """Verify that a definition of main() uses an accepted signature.

    Return value for main is optional in C++11.
    """
    int_main_open = Literal("int") + Literal("main") + Literal("(")

    # Bail out unless this line actually defines main.
    if not len((int_main_open + SkipTo(Literal(")")) + Literal(")")).searchString(code)):
        return

    # The three accepted parameter lists for main().
    empty_args = Literal(")")
    void_args = Literal("void") + Literal(")")
    argc_argv = (Literal("int") + "argc" + "," + Optional("const") +
                 "char" + "*" + "argv" + "[" + "]" + ")")

    for accepted in (empty_args, void_args, argc_argv):
        if len((int_main_open + accepted).searchString(code)):
            return
    self.add_error(label="MAIN_SYNTAX")
def get_log_formats(config):
    """
    Parse config for log_format directives
    :return: iterator over ('format name', 'format string') tuple of found directives
    """
    # Grammar: log_format <name> <format-part>... ;
    directive_parser = Literal('log_format') + parameter + Group(OneOrMore(parameter)) + semicolon
    directive_parser.ignore(pythonStyleComment)

    for tokens in directive_parser.searchString(config).asList():
        # tokens layout: ['log_format', <name>, [<part>, <part>, ...]]
        yield tokens[1], ''.join(tokens[2])
def check_main_syntax(self, code):
    """Check that main() is defined with one of the accepted signatures.

    Return value for main is optional in C++11.
    """
    prefix = Literal("int") + Literal("main") + Literal("(")
    definition = prefix + SkipTo(Literal(")")) + Literal(")")
    if not len(definition.searchString(code)):
        return  # no definition of main on this line

    # Accepted forms: main(), main(void), main(int argc, [const] char* argv[])
    argc_argv = (Literal("int") + "argc" + "," + Optional("const") + "char" +
                 "*" + "argv" + "[" + "]" + ")")
    accepted_forms = [
        prefix + Literal(")"),
        prefix + Literal("void") + Literal(")"),
        prefix + argc_argv,
    ]
    if not any(len(form.searchString(code)) for form in accepted_forms):
        self.add_error(label="MAIN_SYNTAX")
def check_non_const_global(self, code):
    """Flag non-const global variable definitions.

    Once "int main" is seen, `self.outside_main` is cleared and the check
    is disabled for the remainder of the file.
    """
    inside = Literal("int main")
    if len(inside.searchString(code)):
        self.outside_main = False
    elif self.outside_main:
        function = check_if_function(code)
        # "<type> <name> = <value>;" -- a definition with an initializer.
        # (Fixed: the original had a duplicated "variables = variables ="
        # assignment and non-raw regex strings.)
        variables = re.compile(r"^(?:\w|_)+\s+(?:\w|_|\[|\])+\s*=\s*.+;")
        # Lines that open declarations rather than define variables.
        keywords = re.compile(r"^\s*(?:using|class|struct)")
        constants = re.compile(r"^\s*(?:static\s+)?const")
        if not function and variables.search(code) and \
                not keywords.search(code) and \
                not constants.search(code):
            self.add_error(label="NON_CONST_GLOBAL")
def check_non_const_global(self, code):
    """Flag non-const global variable definitions.

    Once "int main" is seen, `self.outside_main` is cleared and the check
    is disabled for the remainder of the file.
    """
    inside = Literal("int main")
    if len(inside.searchString(code)):
        self.outside_main = False
    elif self.outside_main:
        function = check_if_function(code)
        # "<type> <name> = <value>;" -- a definition with an initializer.
        # (Fixed: removed the duplicated "variables = variables =" assignment.)
        variables = re.compile(
            r"^(?:\w|_)+\s+(?:\w|_|\[|\])+\s*=\s*.+;")
        # Lines that open declarations rather than define variables.
        keywords = re.compile(r"^\s*(?:using|class|struct)")
        constants = re.compile(r"^\s*(?:static\s+)?const")
        if not function and variables.search(code) and \
                not keywords.search(code) and \
                not constants.search(code):
            self.add_error(label="NON_CONST_GLOBAL")
def get_access_logs(config):
    """
    Parse config for access_log directives
    :return: iterator over ('path', 'format name') tuple of found directives
    """
    # Grammar: access_log <path> [format-or-params...] ;
    directive_parser = Literal("access_log") + ZeroOrMore(parameter) + semicolon
    directive_parser.ignore(pythonStyleComment)

    for tokens in directive_parser.searchString(config).asList():
        log_path = tokens[1]

        # Disabled logs and syslog sinks carry no file to process.
        if log_path == 'off' or log_path.startswith('syslog:'):
            continue

        # A second argument that is not a key=value parameter names the
        # log format; otherwise nginx's default 'combined' applies.
        if len(tokens) > 2 and '=' not in tokens[2]:
            yield log_path, tokens[2]
        else:
            yield log_path, 'combined'
################# print("Example of an extractor") print("----------------------") # simple grammar to match #define's ident = Word(alphas, alphanums+"_") macroDef = Literal("#define") + ident.setResultsName("name") + "=" + restOfLine.setResultsName("value") for t,s,e in macroDef.scanString( testData ): print(t.name,":", t.value) # or a quick way to make a dictionary of the names and values # (return only key and value tokens, and construct dict from key-value pairs) # - empty ahead of restOfLine advances past leading whitespace, does implicit lstrip during parsing macroDef = Suppress("#define") + ident + Suppress("=") + empty + restOfLine macros = dict(list(macroDef.searchString(testData))) print("macros =", macros) print() ################# print("Examples of a transformer") print("----------------------") # convert C++ namespaces to mangled C-compatible names scopedIdent = ident + OneOrMore( Literal("::").suppress() + ident ) scopedIdent.setParseAction(lambda t: "_".join(t)) print("(replace namespace-scoped names with C-compatible names)") print(scopedIdent.transformString( testData ))
def parse_file(self):
    """Parses an existing namelist file and creates a deck of cards to hold
    the data. After this is executed, you need to call the ``load_model()``
    method to extract the variables from this data structure."""

    # Read the whole namelist file into memory line by line.
    infile = open(self.filename, "r")
    data = infile.readlines()
    infile.close()

    # Lots of numerical tokens for recognizing various kinds of numbers
    digits = Word(nums)
    dot = "."
    sign = oneOf("+ -")
    # Fortran accepts both E and D as exponent markers.
    ee = CaselessLiteral("E") | CaselessLiteral("D")

    # ToInteger/ToFloat/ToBool are project-defined pyparsing wrappers --
    # presumably they attach parse actions converting matched text to
    # Python values; confirm against their definitions.
    num_int = ToInteger(Combine(Optional(sign) + digits))
    num_float = ToFloat(
        Combine(
            Optional(sign)
            + ((digits + dot + Optional(digits)) | (dot + digits))
            + Optional(ee + Optional(sign) + digits)
        )
    )

    # special case for a float written like "3e5"
    mixed_exp = ToFloat(Combine(digits + ee + Optional(sign) + digits))

    # I don't suppose we need these, but just in case (plus it's easy)
    nan = ToFloat(oneOf("NaN Inf -Inf"))

    numval = num_float | mixed_exp | num_int | nan
    strval = QuotedString(quoteChar='"') | QuotedString(quoteChar="'")
    b_list = "T TRUE True true F FALSE False false .TRUE. .FALSE. .T. .F."
    boolval = ToBool(oneOf(b_list))
    fieldval = Word(alphanums)

    # Tokens for parsing a line of data
    numstr_token = numval + ZeroOrMore(Suppress(",") + numval) | strval
    data_token = numstr_token | boolval
    index_token = Suppress("(") + num_int + Suppress(")")

    # One "card": name[(index)] = [dim*]value[*dim]
    card_token = Group(
        fieldval("name")
        + Optional(index_token("index"))
        + Suppress("=")
        + Optional(num_int("dimension") + Suppress("*"))
        + data_token("value")
        + Optional(Suppress("*") + num_int("dimension"))
    )
    multi_card_token = card_token + ZeroOrMore(Suppress(",") + card_token)
    array_continuation_token = numstr_token.setResultsName("value")
    # 2D array assignment like X(1,1) = 3,4,5 (the first index is discarded;
    # only the second is kept as "index").
    array2D_token = (
        fieldval("name")
        + Suppress("(")
        + Suppress(num_int)
        + Suppress(",")
        + num_int("index")
        + Suppress(")")
        + Suppress("=")
        + numval
        + ZeroOrMore(Suppress(",") + numval)
    )

    # Tokens for parsing the group head and tail
    group_end_token = Literal("/") | Literal("$END") | Literal("$end") | Literal("&END") | Literal("&end")
    group_name_token = (
        (Literal("$") | Literal("&"))
        + Word(alphanums).setResultsName("name")
        + Optional(multi_card_token)
        + Optional(group_end_token)
    )

    # Comment Token
    comment_token = Literal("!")

    # Loop through each line and parse.
    current_group = None
    for line in data:
        line_base = line
        line = line.strip()

        # blank line: do nothing
        if not line:
            continue

        if current_group:

            # Skip comment cards
            if comment_token.searchString(line):
                pass

            # Process ordinary cards
            elif multi_card_token.searchString(line):
                cards = multi_card_token.parseString(line)
                for card in cards:
                    name, value = _process_card_info(card)
                    self.cards[-1].append(Card(name, value))

            # Catch 2D arrays like -> X(1,1) = 3,4,5
            elif array2D_token.searchString(line):
                card = array2D_token.parseString(line)
                name = card[0]
                index = card[1]
                value = array(card[2:])
                if index > 1:
                    # Row 2+: stack onto the previous card's array value.
                    old_value = self.cards[-1][-1].value
                    new_value = vstack((old_value, value))
                    self.cards[-1][-1].value = new_value
                else:
                    self.cards[-1].append(Card(name, value))

            # Arrays can be continued on subsequent lines
            # The value of the most recent card must be turned into an
            # array and appended
            elif array_continuation_token.searchString(line):
                card = array_continuation_token.parseString(line)
                if len(card) > 1:
                    element = array(card[0:])
                else:
                    element = card.value
                if isinstance(self.cards[-1][-1].value, ndarray):
                    new_value = append(self.cards[-1][-1].value, element)
                else:
                    new_value = array([self.cards[-1][-1].value, element])
                self.cards[-1][-1].value = new_value

            # Lastly, look for the group footer
            elif group_end_token.searchString(line):
                current_group = None

            # Everything else must be a pure comment

            # Group ending '/' can also conclude a data line.
            if line[-1] == "/":
                current_group = None

        else:
            group_name = group_name_token.searchString(line)

            # Group Header
            if group_name:
                group_name = group_name_token.parseString(line)
                current_group = group_name.name
                self.add_group(current_group)

                # Sometimes, variable definitions are included on the
                # same line as the namelist header
                if len(group_name) > 2:
                    cards = group_name[2:]
                    for card in cards:
                        # Sometimes an end card is on the same line.
                        if group_end_token.searchString(card):
                            current_group = None
                        else:
                            name, value = _process_card_info(card)
                            self.cards[-1].append(Card(name, value))

            # If there is an ungrouped card at the start, take it as the
            # title for the analysis
            elif len(self.cards) == 0 and self.title == "":
                self.title = line

            # All other ungrouped cards are saved as free-form (card-less)
            # groups.
            # Note that we can't lstrip because column spacing might be
            # important.
            else:
                self.add_group(line_base.rstrip())
def parse_file(self):
    """Parses an existing namelist file and creates a deck of cards to hold
    the data. After this is executed, you need to call the ``load_model()``
    method to extract the variables from this data structure."""

    # Read the whole namelist file into memory line by line.
    infile = open(self.filename, 'r')
    data = infile.readlines()
    infile.close()

    # Lots of numerical tokens for recognizing various kinds of numbers
    digits = Word(nums)
    dot = "."
    sign = oneOf("+ -")
    # Fortran accepts both E and D as exponent markers.
    ee = CaselessLiteral('E') | CaselessLiteral('D')

    # ToInteger/ToFloat/ToBool are project-defined pyparsing wrappers --
    # presumably they attach parse actions converting matched text to
    # Python values; confirm against their definitions.
    num_int = ToInteger(Combine(Optional(sign) + digits))
    num_float = ToFloat(
        Combine(
            Optional(sign) +
            ((digits + dot + Optional(digits)) | (dot + digits)) +
            Optional(ee + Optional(sign) + digits)))

    # special case for a float written like "3e5"
    mixed_exp = ToFloat(Combine(digits + ee + Optional(sign) + digits))

    # I don't suppose we need these, but just in case (plus it's easy)
    nan = ToFloat(oneOf("NaN Inf -Inf"))

    numval = num_float | mixed_exp | num_int | nan
    strval = QuotedString(quoteChar='"') | QuotedString(quoteChar="'")
    b_list = "T TRUE True true F FALSE False false .TRUE. .FALSE. .T. .F."
    boolval = ToBool(oneOf(b_list))
    fieldval = Word(alphanums)

    # Tokens for parsing a line of data
    numstr_token = numval + ZeroOrMore(Suppress(',') + numval) \
        | strval
    data_token = numstr_token | boolval
    index_token = Suppress('(') + num_int + Suppress(')')

    # One "card": name[(index)] = [dim*]value[*dim]
    card_token = Group(fieldval("name") +
                       Optional(index_token("index")) +
                       Suppress('=') +
                       Optional(num_int("dimension") + Suppress('*')) +
                       data_token("value") +
                       Optional(Suppress('*') + num_int("dimension")))
    multi_card_token = (card_token + ZeroOrMore(Suppress(',') + card_token))
    array_continuation_token = numstr_token.setResultsName("value")
    # 2D array assignment like X(1,1) = 3,4,5 (the first index is discarded;
    # only the second is kept as "index").
    array2D_token = fieldval("name") + Suppress("(") + \
        Suppress(num_int) + Suppress(',') + \
        num_int("index") + Suppress(')') + \
        Suppress('=') + numval + \
        ZeroOrMore(Suppress(',') + numval)

    # Tokens for parsing the group head and tail
    group_end_token = Literal("/") | \
        Literal("$END") | Literal("$end") | \
        Literal("&END") | Literal("&end")
    group_name_token = (Literal("$") | Literal("&")) + \
        Word(alphanums).setResultsName("name") + \
        Optional(multi_card_token) + \
        Optional(group_end_token)

    # Comment Token
    comment_token = Literal("!")

    # Loop through each line and parse.
    current_group = None
    for line in data:
        line_base = line
        line = line.strip()

        # blank line: do nothing
        if not line:
            continue

        if current_group:

            # Skip comment cards
            if comment_token.searchString(line):
                pass

            # Process ordinary cards
            elif multi_card_token.searchString(line):
                cards = multi_card_token.parseString(line)
                for card in cards:
                    name, value = _process_card_info(card)
                    self.cards[-1].append(Card(name, value))

            # Catch 2D arrays like -> X(1,1) = 3,4,5
            elif array2D_token.searchString(line):
                card = array2D_token.parseString(line)
                name = card[0]
                index = card[1]
                value = array(card[2:])
                if index > 1:
                    # Row 2+: stack onto the previous card's array value.
                    old_value = self.cards[-1][-1].value
                    new_value = vstack((old_value, value))
                    self.cards[-1][-1].value = new_value
                else:
                    self.cards[-1].append(Card(name, value))

            # Arrays can be continued on subsequent lines
            # The value of the most recent card must be turned into an
            # array and appended
            elif array_continuation_token.searchString(line):
                card = array_continuation_token.parseString(line)
                if len(card) > 1:
                    element = array(card[0:])
                else:
                    element = card.value
                if isinstance(self.cards[-1][-1].value, ndarray):
                    new_value = append(self.cards[-1][-1].value, element)
                else:
                    new_value = array([self.cards[-1][-1].value, element])
                self.cards[-1][-1].value = new_value

            # Lastly, look for the group footer
            elif group_end_token.searchString(line):
                current_group = None

            # Everything else must be a pure comment
            else:
                print "Comment ignored: %s" % line.rstrip('\n')

            # Group ending '/' can also conclude a data line.
            if line[-1] == '/':
                current_group = None
                #print self.cards[-1][-1].name, self.cards[-1][-1].value

        else:
            group_name = group_name_token.searchString(line)

            # Group Header
            if group_name:
                group_name = group_name_token.parseString(line)
                current_group = group_name.name
                self.add_group(current_group)

                # Sometimes, variable definitions are included on the
                # same line as the namelist header
                if len(group_name) > 2:
                    cards = group_name[2:]
                    for card in cards:
                        # Sometimes an end card is on the same line.
                        if group_end_token.searchString(card):
                            current_group = None
                        else:
                            name, value = _process_card_info(card)
                            self.cards[-1].append(Card(name, value))

            # If there is an ungrouped card at the start, take it as the
            # title for the analysis
            elif len(self.cards) == 0 and self.title == '':
                self.title = line

            # All other ungrouped cards are saved as free-form (card-less)
            # groups.
            # Note that we can't lstrip because column spacing might be
            # important.
            else:
                self.add_group(line_base.rstrip())
def restscrape(resturl, filenamersc, filenamerevsc):
    """Scrape one Yelp restaurant page (Python 2).

    Appends one row of restaurant data to the CSV at `filenamersc` and one
    row per displayed review to the CSV at `filenamerevsc`. Returns early
    for unrated/anomalous pages and retries itself when Yelp serves an
    alternate page layout.
    """
    # Random delay so repeated requests don't hammer the server.
    time.sleep(randint(2,8))
    # Read the url
    response = urllib2.urlopen(resturl)
    soup = BeautifulSoup(response.read())
    response.close()
    # Check if it is rated
    if soup.find(itemprop="ratingValue") == None:
        return
    # Anamoly
    if soup.find(class_="container no-reviews") != None:
        return
    # Check if it is not the alternate version
    # NOTE(review): this retries by unbounded recursion -- if Yelp keeps
    # serving the alternate layout this could hit the recursion limit.
    if soup.find(id="mapbox") != None:
        print "alt version"
        restscrape(resturl, filenamersc, filenamerevsc)
        return
    # Check if it is not an alternate version
    if soup.find(class_="friend-count miniOrange") == None:
        print "alt version rev"
        restscrape(resturl, filenamersc, filenamerevsc)
        return

    # ======================= RESTAURANT INFO =======================

    # Key Yelp information
    title = soup.find(property="og:title").get("content").encode('utf-8')
    latitude = soup.find(property="place:location:latitude").get("content")
    longitude = soup.find(property="place:location:longitude").get("content")
    rating = soup.find(itemprop="ratingValue").get("content")
    reviewCount = soup.find(itemprop="reviewCount").get_text()
    if soup.find(id="cat_display") != None:
        categories = soup.find(id="cat_display").get_text().strip()
        # Collapse internal runs of whitespace to single spaces.
        categories = ' '.join(categories.split())
    else:
        categories = "None"
    # A non-default business avatar means the listing has photos.
    if soup.find(class_="photo-box-img")['src'] != "http://s3-media1.ak.yelpcdn.com/assets/2/www/img/5f69f303f17c/default_avatars/business_medium_square.png":
        photos = "Has photos"
    else:
        photos = "None"
    if soup.find(id="bizUrl") != None:
        URL = soup.find(id="bizUrl").get_text().strip().encode('utf-8')
    else:
        URL = "None"

    # Get Neighborhoods
    # Particularly special code because it has to be stripped from javascript script
    # Automatically strip quotes from quoted strings
    # quotedString matches single or double quotes
    neighborhood = ""
    quotedString.setParseAction(removeQuotes)
    # Define a pattern to extract the neighborhoods: entry
    neighborhoodsSpec = Literal('\"neighborhoods\":') + '[' + delimitedList(quotedString)('neighborhoods') + ']'
    for hoods in neighborhoodsSpec.searchString(soup):
        neighborhood = str(hoods.neighborhoods)

    # Yelp Interaction/Information
    if soup.find(class_="yelp-menu") != None:
        menu = "Has menu"
    else:
        menu = "None"
    if soup.find(id="opentable-reservation-actions") != None:
        reservable = "Reservable"
    else:
        reservable = "None"
    if soup.find(class_="media-story offer-detail") != None:
        deal = "Has deal"
    else:
        deal = "None"
    if soup.find(id="delivery-address-form") != None:
        yelpDelivery = "Delivery system"
    else:
        yelpDelivery = "None"
    if soup.find(id="bizSlide") != None:
        slides = "Has slides"
    else:
        slides = "None"

    # Restaurant status
    if soup.find(id="bizSupporter") != None:
        sponsor = "Sponsors"
    else:
        sponsor = "None"
    if soup.find(id="bizClaim") != None:
        claim = "Unclaimed"
    else:
        claim = "None"
    # NOTE(review): absence of the grey style is read as "Elites present"
    # -- looks inverted at first glance; confirm against the page markup.
    if soup.find(style="color:#999999;") == None:
        eliteReviews = "Has Elites"
    else:
        eliteReviews = "None"

    # Restaurant attributes from attributes section
    # Attributes self-explanatory -- each follows the same pattern:
    # attribute present -> grab its <dd> text, absent -> "None".
    if soup.find(class_="attr-transit") != None:
        transit = soup.find(class_="attr-transit").get_text().strip()
    else:
        transit = "None"
    if soup.find(class_="attr-BusinessHours") != None:
        hours = soup.find('dd', class_="attr-BusinessHours").get_text()
    else:
        hours = "None"
    if soup.find(class_="attr-RestaurantsAttire") != None:
        attire = soup.find('dd', class_="attr-RestaurantsAttire").get_text()
    else:
        attire = "None"
    if soup.find(class_="attr-BusinessAcceptsCreditCards") != None:
        creditCards = soup.find('dd', class_="attr-BusinessAcceptsCreditCards").get_text()
    else:
        creditCards = "None"
    if soup.find(class_="attr-BusinessParking") != None:
        parking = soup.find('dd', class_="attr-BusinessParking").get_text()
    else:
        parking = "None"
    if soup.find(class_="attr-RestaurantsPriceRange2") != None:
        price = soup.find('dd', class_="attr-RestaurantsPriceRange2").get_text().strip()
    else:
        price = "None"
    if soup.find(class_="attr-RestaurantsGoodForGroups") != None:
        groups = soup.find('dd', class_="attr-RestaurantsGoodForGroups").get_text()
    else:
        groups = "None"
    if soup.find(class_="attr-GoodForKids") != None:
        kids = soup.find('dd', class_="attr-GoodForKids").get_text()
    else:
        kids = "None"
    if soup.find(class_="attr-RestaurantsReservations") != None:
        reservations = soup.find('dd', class_="attr-RestaurantsReservations").get_text()
    else:
        reservations = "None"
    if soup.find(class_="attr-RestaurantsDelivery") != None:
        delivery = soup.find('dd', class_="attr-RestaurantsDelivery").get_text()
    else:
        delivery = "None"
    if soup.find(class_="attr-RestaurantsTakeOut") != None:
        takeout = soup.find('dd', class_="attr-RestaurantsTakeOut").get_text()
    else:
        takeout = "None"
    if soup.find(class_="attr-RestaurantsTableService") != None:
        service = soup.find('dd', class_="attr-RestaurantsTableService").get_text()
    else:
        service = "None"
    if soup.find(class_="attr-OutdoorSeating") != None:
        outdoorSeating = soup.find('dd', class_="attr-OutdoorSeating").get_text()
    else:
        outdoorSeating = "None"
    if soup.find(class_="attr-WiFi") != None:
        wifi = soup.find('dd', class_="attr-WiFi").get_text()
    else:
        wifi = "None"
    if soup.find(class_="attr-GoodForMeal") != None:
        meals = soup.find('dd', class_="attr-GoodForMeal").get_text()
    else:
        meals = "None"
    if soup.find(class_="attr-BestNights") != None:
        bestNights = soup.find('dd', class_="attr-BestNights").get_text()
    else:
        bestNights = "None"
    if soup.find(class_="attr-HappyHour") != None:
        happyHour = soup.find('dd', class_="attr-HappyHour").get_text()
    else:
        happyHour = "None"
    if soup.find(class_="attr-Alcohol") != None:
        alcohol = soup.find('dd', class_="attr-Alcohol").get_text()
    else:
        alcohol = "None"
    if soup.find(class_="attr-Smoking") != None:
        smoking = soup.find('dd', class_="attr-Smoking").get_text()
    else:
        smoking = "None"
    if soup.find(class_="attr-CoatCheck") != None:
        coatCheck = soup.find('dd', class_="attr-CoatCheck").get_text()
    else:
        coatCheck = "None"
    if soup.find(class_="attr-NoiseLevel") != None:
        noise = soup.find('dd', class_="attr-NoiseLevel").get_text()
    else:
        noise = "None"
    if soup.find(class_="attr-GoodForDancing") != None:
        goodForDancing = soup.find('dd', class_="attr-GoodForDancing").get_text()
    else:
        goodForDancing = "None"
    if soup.find(class_="attr-Ambience") != None:
        ambience = soup.find('dd', class_="attr-Ambience").get_text()
    else:
        ambience = "None"
    if soup.find(class_="attr-HasTV") != None:
        tv = soup.find('dd', class_="attr-HasTV").get_text()
    else:
        tv = "None"
    if soup.find(class_="attr-Caters") != None:
        caters = soup.find('dd', class_="attr-Caters").get_text()
    else:
        caters = "None"
    if soup.find(class_="attr-WheelchairAccessible") != None:
        wheelchairAccessible = soup.find('dd', class_="attr-WheelchairAccessible").get_text()
    else:
        wheelchairAccessible = "None"
    if soup.find(class_="attr-DogsAllowed") != None:
        dogsAllowed = soup.find('dd', class_="attr-DogsAllowed").get_text()
    else:
        dogsAllowed = "None"

    # NOTE(review): dogsAllowed is collected above but never written to
    # the CSV row below -- confirm whether that is intentional.
    with open(filenamersc, "ab") as filer:
        fr = csv.writer(filer)
        # Writing to CSV
        fr.writerow([resturl, title, latitude, longitude, rating, reviewCount, categories, photos, URL, neighborhood, menu, reservable, yelpDelivery, slides, sponsor, claim, eliteReviews, transit, hours, attire, creditCards, parking, price, groups, kids, reservations, deal, delivery, takeout, service, outdoorSeating, wifi, meals, bestNights, happyHour, alcohol, smoking, coatCheck, noise, goodForDancing, ambience, tv, caters, wheelchairAccessible])

    # ========================== REVIEWS ==========================

    # Parsing top 40 Reviews
    reviews = soup.findAll(itemprop="review")
    for review in reviews:
        # Get user data
        if review.find(title="User is Elite") != None:
            eliteStatus = "Elite"
        else:
            eliteStatus = "None"
        # The counters carry trailing label text, hence the slicing.
        friendCount = review.find(class_="friend-count miniOrange").get_text()[:-8].strip()
        reviewCount = review.find(class_="review-count miniOrange").get_text()[:-8].strip()
        # A non-default user avatar means the reviewer has a photo.
        if review.find(class_="photo-box-img")['src'] != "http://s3-media4.ak.yelpcdn.com/assets/2/www/img/78074914700f/default_avatars/user_small_square.png":
            userPhoto = "Has photo"
        else:
            userPhoto = "None"
        reviewInfo = review.find(class_="reviewer_info").get_text().encode('utf-8')
        # Get review data
        reviewRating = review.find(itemprop="ratingValue").get("content")
        publish = review.find(itemprop="datePublished").get("content")
        description = review.find(itemprop="description").get_text().encode('utf-8')
        # Get review attributes
        if review.find(class_="i-wrap ig-wrap-common i-camera-common-wrap badge photo-count") != None:
            reviewPix = review.find(class_="i-wrap ig-wrap-common i-camera-common-wrap badge photo-count").get_text()[:-6].strip()
        else:
            reviewPix = "None"
        if review.find(class_="i-wrap ig-wrap-common i-opentable-badge-common-wrap badge opentable-badge-marker") != None:
            reviewSeated = "Seated"
        else:
            reviewSeated = "None"
        if review.find(class_="i ig-common i-deal-price-tag-common") != None:
            reviewDeal = "Purchased Deal"
        else:
            reviewDeal = "None"
        if review.find(class_="i-wrap ig-wrap-common i-checkin-burst-blue-small-common-wrap badge checkin checkin-irregular") != None:
            reviewCheckIn = review.find(class_="i-wrap ig-wrap-common i-checkin-burst-blue-small-common-wrap badge checkin checkin-irregular").get_text()[:-14].strip()
        else:
            reviewCheckIn = "None"
        # Special Qype users lack stats
        if review.find(class_="count"):
            usefulfunnycool = review.findAll(class_="count")
            # Get useful, funny, cool statistics
            if usefulfunnycool[0].get_text() != "":
                useful = usefulfunnycool[0].get_text()
            else:
                useful = 0
            if usefulfunnycool[1].get_text() != "":
                funny = usefulfunnycool[1].get_text()
            else:
                funny = 0
            if usefulfunnycool[2].get_text() != "":
                cool = usefulfunnycool[2].get_text()
            else:
                cool = 0
        else:
            useful = 0
            funny = 0
            cool = 0

        # One CSV row per review (file reopened each iteration).
        with open(filenamerevsc, "ab") as filerev:
            frev = csv.writer(filerev)
            # Writing to CSV
            frev.writerow([resturl, eliteStatus, friendCount, reviewCount, userPhoto, reviewInfo, reviewRating, publish, description, reviewPix, reviewSeated, reviewDeal, reviewCheckIn, useful, funny, cool])
################# print("Example of an extractor") print("----------------------") # simple grammar to match #define's ident = Word(alphas, alphanums + "_") macroDef = Literal("#define") + ident.setResultsName( "name") + "=" + restOfLine.setResultsName("value") for t, s, e in macroDef.scanString(testData): print(t.name, ":", t.value) # or a quick way to make a dictionary of the names and values # (return only key and value tokens, and construct dict from key-value pairs) # - empty ahead of restOfLine advances past leading whitespace, does implicit lstrip during parsing macroDef = Suppress("#define") + ident + Suppress("=") + empty + restOfLine macros = dict(list(macroDef.searchString(testData))) print("macros =", macros) print() ################# print("Examples of a transformer") print("----------------------") # convert C++ namespaces to mangled C-compatible names scopedIdent = ident + OneOrMore(Literal("::").suppress() + ident) scopedIdent.setParseAction(lambda t: "_".join(t)) print("(replace namespace-scoped names with C-compatible names)") print(scopedIdent.transformString(testData))
def check_while_true(self, code):
    """Flag infinite-loop headers spelled while(true) or while(1)."""
    # Either literal spelling of an always-true loop condition.
    always_true = Literal("true") | Literal("1")
    loop_header = Literal("while") + Literal("(") + always_true + Literal(")")
    if len(loop_header.searchString(code)):
        self.add_error(label="WHILE_TRUE")
def check_while_true(self, code):
    """Flag infinite-loop headers spelled while(true) -- or while(1).

    Generalized to also match the numeric spelling `while (1)`, matching
    the sibling implementation of this check elsewhere in the project.
    Strictly backward-compatible: every previously flagged line is still
    flagged.
    """
    keyword = Literal("true") | Literal("1")
    statement_parser = Literal("while") + Literal("(") + keyword + Literal(")")
    if len(statement_parser.searchString(code)):
        self.add_error(label="WHILE_TRUE")
def restscrape(resturl, filenamersc, filenamerevsc):
    """Scrape one Yelp restaurant page and its visible reviews (Python 2).

    Appends one summary row to the CSV at `filenamersc` and one row per
    review to the CSV at `filenamerevsc`. Returns nothing; pages without
    a rating, or flagged as having no reviews, are skipped entirely.

    :param resturl: URL of the restaurant's Yelp page
    :param filenamersc: path of the restaurant-summary CSV (appended to)
    :param filenamerevsc: path of the per-review CSV (appended to)

    NOTE(review): selectors/strings target a specific historical Yelp
    page layout — presumably long since changed; verify before reuse.
    """
    # Crude rate limiting: random delay before each request.
    time.sleep(randint(2, 8))
    # Read the url
    response = urllib2.urlopen(resturl)
    soup = BeautifulSoup(response.read())
    response.close()
    # Check if it is rated
    if soup.find(itemprop="ratingValue") == None:
        return
    # Anamoly: page explicitly marked as having no reviews.
    if soup.find(class_="container no-reviews") != None:
        return
    # Check if it is not the alternate version.
    # NOTE(review): retries by recursing on the same URL with no depth
    # limit — a persistently "alternate" page would recurse until the
    # interpreter's recursion limit.
    if soup.find(id="mapbox") != None:
        print "alt version"
        restscrape(resturl, filenamersc, filenamerevsc)
        return
    # Check if it is not an alternate version (review markup missing).
    if soup.find(class_="friend-count miniOrange") == None:
        print "alt version rev"
        restscrape(resturl, filenamersc, filenamerevsc)
        return

    # ----------------------- RESTAURANT INFO -----------------------
    # Key Yelp information
    title = soup.find(property="og:title").get("content").encode('utf-8')
    latitude = soup.find(property="place:location:latitude").get("content")
    longitude = soup.find(property="place:location:longitude").get("content")
    rating = soup.find(itemprop="ratingValue").get("content")
    reviewCount = soup.find(itemprop="reviewCount").get_text()
    # Categories: collapse internal whitespace to single spaces.
    if soup.find(id="cat_display") != None:
        categories = soup.find(id="cat_display").get_text().strip()
        categories = ' '.join(categories.split())
    else:
        categories = "None"
    # A non-default avatar URL means the business has uploaded photos.
    if soup.find(
            class_="photo-box-img"
    )['src'] != "http://s3-media1.ak.yelpcdn.com/assets/2/www/img/5f69f303f17c/default_avatars/business_medium_square.png":
        photos = "Has photos"
    else:
        photos = "None"
    if soup.find(id="bizUrl") != None:
        URL = soup.find(id="bizUrl").get_text().strip().encode('utf-8')
    else:
        URL = "None"
    # Get Neighborhoods
    # Particularly special code because it has to be stripped from javascript script
    # Automatically strip quotes from quoted strings
    # quotedString matches single or double quotes
    # NOTE(review): setParseAction mutates the shared module-level
    # `quotedString` object on every call.
    neighborhood = ""
    quotedString.setParseAction(removeQuotes)
    # Define a pattern to extract the neighborhoods: entry
    neighborhoodsSpec = Literal('\"neighborhoods\":') + '[' + delimitedList(
        quotedString)('neighborhoods') + ']'
    for hoods in neighborhoodsSpec.searchString(soup):
        neighborhood = str(hoods.neighborhoods)
    # Yelp Interaction/Information: presence/absence of page widgets.
    if soup.find(class_="yelp-menu") != None:
        menu = "Has menu"
    else:
        menu = "None"
    if soup.find(id="opentable-reservation-actions") != None:
        reservable = "Reservable"
    else:
        reservable = "None"
    if soup.find(class_="media-story offer-detail") != None:
        deal = "Has deal"
    else:
        deal = "None"
    if soup.find(id="delivery-address-form") != None:
        yelpDelivery = "Delivery system"
    else:
        yelpDelivery = "None"
    if soup.find(id="bizSlide") != None:
        slides = "Has slides"
    else:
        slides = "None"
    # Restaurant status
    if soup.find(id="bizSupporter") != None:
        sponsor = "Sponsors"
    else:
        sponsor = "None"
    if soup.find(id="bizClaim") != None:
        claim = "Unclaimed"
    else:
        claim = "None"
    # Inverted check: the grey style marks "no Elite reviews yet".
    if soup.find(style="color:#999999;") == None:
        eliteReviews = "Has Elites"
    else:
        eliteReviews = "None"
    # Restaurant attributes from attributes section
    # Attributes self-explanatory; each falls back to the string "None".
    if soup.find(class_="attr-transit") != None:
        transit = soup.find(class_="attr-transit").get_text().strip()
    else:
        transit = "None"
    if soup.find(class_="attr-BusinessHours") != None:
        hours = soup.find('dd', class_="attr-BusinessHours").get_text()
    else:
        hours = "None"
    if soup.find(class_="attr-RestaurantsAttire") != None:
        attire = soup.find('dd', class_="attr-RestaurantsAttire").get_text()
    else:
        attire = "None"
    if soup.find(class_="attr-BusinessAcceptsCreditCards") != None:
        creditCards = soup.find(
            'dd', class_="attr-BusinessAcceptsCreditCards").get_text()
    else:
        creditCards = "None"
    if soup.find(class_="attr-BusinessParking") != None:
        parking = soup.find('dd', class_="attr-BusinessParking").get_text()
    else:
        parking = "None"
    if soup.find(class_="attr-RestaurantsPriceRange2") != None:
        price = soup.find(
            'dd', class_="attr-RestaurantsPriceRange2").get_text().strip()
    else:
        price = "None"
    if soup.find(class_="attr-RestaurantsGoodForGroups") != None:
        groups = soup.find('dd', class_="attr-RestaurantsGoodForGroups").get_text()
    else:
        groups = "None"
    if soup.find(class_="attr-GoodForKids") != None:
        kids = soup.find('dd', class_="attr-GoodForKids").get_text()
    else:
        kids = "None"
    if soup.find(class_="attr-RestaurantsReservations") != None:
        reservations = soup.find(
            'dd', class_="attr-RestaurantsReservations").get_text()
    else:
        reservations = "None"
    if soup.find(class_="attr-RestaurantsDelivery") != None:
        delivery = soup.find('dd', class_="attr-RestaurantsDelivery").get_text()
    else:
        delivery = "None"
    if soup.find(class_="attr-RestaurantsTakeOut") != None:
        takeout = soup.find('dd', class_="attr-RestaurantsTakeOut").get_text()
    else:
        takeout = "None"
    if soup.find(class_="attr-RestaurantsTableService") != None:
        service = soup.find('dd', class_="attr-RestaurantsTableService").get_text()
    else:
        service = "None"
    if soup.find(class_="attr-OutdoorSeating") != None:
        outdoorSeating = soup.find('dd', class_="attr-OutdoorSeating").get_text()
    else:
        outdoorSeating = "None"
    if soup.find(class_="attr-WiFi") != None:
        wifi = soup.find('dd', class_="attr-WiFi").get_text()
    else:
        wifi = "None"
    if soup.find(class_="attr-GoodForMeal") != None:
        meals = soup.find('dd', class_="attr-GoodForMeal").get_text()
    else:
        meals = "None"
    if soup.find(class_="attr-BestNights") != None:
        bestNights = soup.find('dd', class_="attr-BestNights").get_text()
    else:
        bestNights = "None"
    if soup.find(class_="attr-HappyHour") != None:
        happyHour = soup.find('dd', class_="attr-HappyHour").get_text()
    else:
        happyHour = "None"
    if soup.find(class_="attr-Alcohol") != None:
        alcohol = soup.find('dd', class_="attr-Alcohol").get_text()
    else:
        alcohol = "None"
    if soup.find(class_="attr-Smoking") != None:
        smoking = soup.find('dd', class_="attr-Smoking").get_text()
    else:
        smoking = "None"
    if soup.find(class_="attr-CoatCheck") != None:
        coatCheck = soup.find('dd', class_="attr-CoatCheck").get_text()
    else:
        coatCheck = "None"
    if soup.find(class_="attr-NoiseLevel") != None:
        noise = soup.find('dd', class_="attr-NoiseLevel").get_text()
    else:
        noise = "None"
    if soup.find(class_="attr-GoodForDancing") != None:
        goodForDancing = soup.find('dd', class_="attr-GoodForDancing").get_text()
    else:
        goodForDancing = "None"
    if soup.find(class_="attr-Ambience") != None:
        ambience = soup.find('dd', class_="attr-Ambience").get_text()
    else:
        ambience = "None"
    if soup.find(class_="attr-HasTV") != None:
        tv = soup.find('dd', class_="attr-HasTV").get_text()
    else:
        tv = "None"
    if soup.find(class_="attr-Caters") != None:
        caters = soup.find('dd', class_="attr-Caters").get_text()
    else:
        caters = "None"
    if soup.find(class_="attr-WheelchairAccessible") != None:
        wheelchairAccessible = soup.find(
            'dd', class_="attr-WheelchairAccessible").get_text()
    else:
        wheelchairAccessible = "None"
    if soup.find(class_="attr-DogsAllowed") != None:
        dogsAllowed = soup.find('dd', class_="attr-DogsAllowed").get_text()
    else:
        dogsAllowed = "None"
    # Append the single restaurant-summary row ("ab" = append, binary).
    # NOTE(review): `dogsAllowed` is scraped but never written out.
    with open(filenamersc, "ab") as filer:
        fr = csv.writer(filer)
        # Writing to CSV
        fr.writerow([
            resturl, title, latitude, longitude, rating, reviewCount,
            categories, photos, URL, neighborhood, menu, reservable,
            yelpDelivery, slides, sponsor, claim, eliteReviews, transit,
            hours, attire, creditCards, parking, price, groups, kids,
            reservations, deal, delivery, takeout, service, outdoorSeating,
            wifi, meals, bestNights, happyHour, alcohol, smoking, coatCheck,
            noise, goodForDancing, ambience, tv, caters,
            wheelchairAccessible
        ])

    # --------------------------- REVIEWS ---------------------------
    # Parsing top 40 Reviews
    # NOTE(review): `reviewCount` is reused below as a per-review field,
    # clobbering the restaurant-level count (already written above).
    reviews = soup.findAll(itemprop="review")
    for review in reviews:
        # Get user data
        if review.find(title="User is Elite") != None:
            eliteStatus = "Elite"
        else:
            eliteStatus = "None"
        # [:-8] trims a trailing label from the count text.
        friendCount = review.find(
            class_="friend-count miniOrange").get_text()[:-8].strip()
        reviewCount = review.find(
            class_="review-count miniOrange").get_text()[:-8].strip()
        # Non-default avatar URL means the reviewer has a photo.
        if review.find(
                class_="photo-box-img"
        )['src'] != "http://s3-media4.ak.yelpcdn.com/assets/2/www/img/78074914700f/default_avatars/user_small_square.png":
            userPhoto = "Has photo"
        else:
            userPhoto = "None"
        reviewInfo = review.find(
            class_="reviewer_info").get_text().encode('utf-8')
        # Get review data
        reviewRating = review.find(itemprop="ratingValue").get("content")
        publish = review.find(itemprop="datePublished").get("content")
        description = review.find(
            itemprop="description").get_text().encode('utf-8')
        # Get review attributes (badges attached to the review).
        if review.find(
                class_=
                "i-wrap ig-wrap-common i-camera-common-wrap badge photo-count"
        ) != None:
            reviewPix = review.find(
                class_=
                "i-wrap ig-wrap-common i-camera-common-wrap badge photo-count"
            ).get_text()[:-6].strip()
        else:
            reviewPix = "None"
        if review.find(
                class_=
                "i-wrap ig-wrap-common i-opentable-badge-common-wrap badge opentable-badge-marker"
        ) != None:
            reviewSeated = "Seated"
        else:
            reviewSeated = "None"
        if review.find(class_="i ig-common i-deal-price-tag-common") != None:
            reviewDeal = "Purchased Deal"
        else:
            reviewDeal = "None"
        if review.find(
                class_=
                "i-wrap ig-wrap-common i-checkin-burst-blue-small-common-wrap badge checkin checkin-irregular"
        ) != None:
            reviewCheckIn = review.find(
                class_=
                "i-wrap ig-wrap-common i-checkin-burst-blue-small-common-wrap badge checkin checkin-irregular"
            ).get_text()[:-14].strip()
        else:
            reviewCheckIn = "None"
        # Special Qype users lack stats
        if review.find(class_="count"):
            usefulfunnycool = review.findAll(class_="count")
            # Get useful, funny, cool statistics (blank text means zero).
            if usefulfunnycool[0].get_text() != "":
                useful = usefulfunnycool[0].get_text()
            else:
                useful = 0
            if usefulfunnycool[1].get_text() != "":
                funny = usefulfunnycool[1].get_text()
            else:
                funny = 0
            if usefulfunnycool[2].get_text() != "":
                cool = usefulfunnycool[2].get_text()
            else:
                cool = 0
        else:
            useful = 0
            funny = 0
            cool = 0
        # Append one row per review ("ab" = append, binary).
        with open(filenamerevsc, "ab") as filerev:
            frev = csv.writer(filerev)
            # Writing to CSV
            frev.writerow([
                resturl, eliteStatus, friendCount, reviewCount, userPhoto,
                reviewInfo, reviewRating, publish, description, reviewPix,
                reviewSeated, reviewDeal, reviewCheckIn, useful, funny, cool
            ])