def check_function_def_above_main(self, code):
    """Flag function definitions that appear above main().

    Adds a DEFINITION_ABOVE_MAIN error when `code` is a function
    definition (not a mere prototype) and we have not yet passed
    "int main" (tracked via self.outside_main).
    """
    prototype = check_if_function_prototype(code)
    function = check_if_function(code)
    # Reaching "int main" means nothing after this point is "above main".
    inside = Literal("int main")
    if len(inside.searchString(code)):
        return
    elif function and not prototype and self.outside_main:
        # Raw string fixes the invalid escape sequences (\s, \w) of the
        # original pattern; group 2 captures the function name after the
        # return type.
        function_regex = re.compile(r"^\s*(\w+)\s+(\w+)")
        match = function_regex.search(code)
        function_name = match.group(2) if match else "NOT_FOUND"
        self.add_error(label="DEFINITION_ABOVE_MAIN", data={'function': function_name})
 def check_main_prefix(self, code):
     """Check that a 'main(...) {' definition uses an accepted parameter list.

     Adds a MAIN_SYNTAX error when main's parameters are not one of:
     (), (void), or (int argc, char* argv[]).
     """
     # Return value for main is optional in C++11, so match from "main(" on.
     parser = Literal("main")+Literal("(")+SkipTo(Literal(")"))+Literal(")")+Literal("{")
     if len(parser.searchString(code)):
         main_prefix = Literal("main")+Literal("(")
         # "int argc, char* argv[])" form; identifier spellings may vary.
         full_use = "int"+Word(alphanums)+","+"char*"+Word(alphanums)+"["+"]"+")"
         # 3 options for main() syntax
         if not len((main_prefix+Literal(")")).searchString(code)) and \
            not len((main_prefix+Literal("void")+Literal(")")).searchString(code)) and \
            not len((main_prefix+full_use).searchString(code)):
             self.add_error("MAIN_SYNTAX")
def check_main_syntax(self, code):
    """Report MAIN_SYNTAX unless main() uses one of the accepted signatures."""
    # Return value for main is optional in C++11
    signature = Literal("int") + Literal("main") + Literal("(") + SkipTo(Literal(")")) + Literal(")")
    if not len(signature.searchString(code)):
        return
    prefix = Literal("int") + Literal("main") + Literal("(")
    argv_form = Literal("int") + "argc" + "," + Optional("const") + "char" + "*" + "argv" + "[" + "]" + ")"
    # The three accepted spellings of main()'s parameter list.
    accepted = (
        prefix + Literal(")"),
        prefix + Literal("void") + Literal(")"),
        prefix + argv_form,
    )
    if not any(len(option.searchString(code)) for option in accepted):
        self.add_error(label="MAIN_SYNTAX")
Beispiel #4
0
def get_log_formats(config):
    """
    Parse config for log_format directives
    :return: iterator over ('format name', 'format string') tuple of found directives
    """
    # Grammar: log_format <name> <format parts...> ;
    directive_parser = Literal('log_format') + parameter + Group(OneOrMore(parameter)) + semicolon
    directive_parser.ignore(pythonStyleComment)

    for tokens in directive_parser.searchString(config).asList():
        # tokens[1] is the format name, tokens[2] the grouped format parts
        yield tokens[1], ''.join(tokens[2])
Beispiel #5
0
def get_log_formats(config):
    """
    Parse config for log_format directives
    :return: iterator over ('format name', 'format string') tuple of found directives
    """
    # log_format <name> <quoted parts...> ;
    grammar = Literal('log_format') + parameter + Group(OneOrMore(parameter)) + semicolon
    grammar.ignore(pythonStyleComment)

    matches = grammar.searchString(config).asList()
    for match in matches:
        fmt_name, fmt_parts = match[1], match[2]
        yield fmt_name, ''.join(fmt_parts)
Beispiel #6
0
def check_main_syntax(self, code):
    """Add a MAIN_SYNTAX error unless main() has an accepted parameter list."""
    # Return value for main is optional in C++11
    main_def = (Literal("int") + Literal("main") + Literal("(")
                + SkipTo(Literal(")")) + Literal(")"))
    if len(main_def.searchString(code)):
        opening = Literal("int") + Literal("main") + Literal("(")
        argc_argv = (Literal("int") + "argc" + "," + Optional("const")
                     + "char" + "*" + "argv" + "[" + "]" + ")")
        # The three legal spellings: (), (void), (int argc, char* argv[])
        empty_ok = len((opening + Literal(")")).searchString(code))
        void_ok = len((opening + Literal("void") + Literal(")")).searchString(code))
        full_ok = len((opening + argc_argv).searchString(code))
        if not (empty_ok or void_ok or full_ok):
            self.add_error(label="MAIN_SYNTAX")
def check_non_const_global(self, code):
    """Flag non-const global variable definitions above main().

    Tracks scope via self.outside_main: once "int main" is seen we are no
    longer at global scope. Adds a NON_CONST_GLOBAL error for assignments
    at global scope that are not functions, using/class/struct lines, or
    const/static-const definitions.
    """
    inside = Literal("int main")
    if len(inside.searchString(code)):
        self.outside_main = False

    if self.outside_main:
        function = check_if_function(code)
        # Fixed the duplicated "variables = variables =" assignment and
        # made the patterns raw strings to avoid invalid escape sequences.
        variables = re.compile(r"^(?:\w|_)+\s+(?:\w|_|\[|\])+\s*=\s*.+;")
        keywords = re.compile(r"^\s*(?:using|class|struct)")
        constants = re.compile(r"^\s*(?:static\s+)?const")
        if not function and variables.search(code) and \
                not keywords.search(code) and \
                not constants.search(code):
            self.add_error(label="NON_CONST_GLOBAL")
Beispiel #8
0
def check_non_const_global(self, code):
    """Flag non-const global variable definitions above main().

    Once "int main" appears, self.outside_main is cleared and no further
    lines are treated as global scope. Otherwise, adds NON_CONST_GLOBAL
    for global assignments that are not functions, using/class/struct
    lines, or (static) const definitions.
    """
    inside = Literal("int main")
    if len(inside.searchString(code)):
        self.outside_main = False

    elif self.outside_main:
        function = check_if_function(code)
        # Fixed the duplicated "variables = variables =" assignment.
        variables = re.compile(
            r"^(?:\w|_)+\s+(?:\w|_|\[|\])+\s*=\s*.+;")
        keywords = re.compile(r"^\s*(?:using|class|struct)")
        constants = re.compile(r"^\s*(?:static\s+)?const")
        if not function and variables.search(code) and \
                not keywords.search(code) and \
                not constants.search(code):
            self.add_error(label="NON_CONST_GLOBAL")
Beispiel #9
0
def get_access_logs(config):
    """
    Parse config for access_log directives
    :return: iterator over ('path', 'format name') tuple of found directives
    """
    # access_log <path> [format-or-options...] ;
    grammar = Literal("access_log") + ZeroOrMore(parameter) + semicolon
    grammar.ignore(pythonStyleComment)

    for tokens in grammar.searchString(config).asList():
        target = tokens[1]
        if target == 'off' or target.startswith('syslog:'):
            # logging disabled or sent to syslog -- no file to report
            continue

        # An explicit format name follows the path unless the next token
        # is a key=value option; otherwise nginx's default name applies.
        fmt = 'combined'
        if len(tokens) > 2 and '=' not in tokens[2]:
            fmt = tokens[2]

        yield target, fmt
Beispiel #10
0
def get_access_logs(config):
    """
    Parse config for access_log directives
    :return: iterator over ('path', 'format name') tuple of found directives
    """
    # Grammar: access_log <path> [name-or-options...] ;
    directive_parser = Literal("access_log") + ZeroOrMore(parameter) + semicolon
    directive_parser.ignore(pythonStyleComment)

    for entry in directive_parser.searchString(config).asList():
        log_path = entry[1]
        # 'off' disables the log; syslog: targets are not local files.
        if log_path == 'off' or log_path.startswith('syslog:'):
            continue

        if len(entry) > 2 and '=' not in entry[2]:
            # Explicit format name given after the path.
            yield log_path, entry[2]
        else:
            # Fall back to nginx's built-in default format.
            yield log_path, 'combined'
Beispiel #11
0
#################
print("Example of an extractor")
print("----------------------")

# simple grammar to match #define's
ident = Word(alphas, alphanums+"_")
macroDef = Literal("#define") + ident.setResultsName("name") + "=" + restOfLine.setResultsName("value")
# scanString yields (tokens, start, end) triples for every match in testData
for t,s,e in macroDef.scanString( testData ):
    print(t.name,":", t.value)

# or a quick way to make a dictionary of the names and values
# (return only key and value tokens, and construct dict from key-value pairs)
# - empty ahead of restOfLine advances past leading whitespace, does implicit lstrip during parsing
macroDef = Suppress("#define") + ident + Suppress("=") + empty + restOfLine
macros = dict(list(macroDef.searchString(testData)))
print("macros =", macros)
print()


#################
print("Examples of a transformer")
print("----------------------")

# convert C++ namespaces to mangled C-compatible names
# e.g. "a::b::c" becomes "a_b_c" via the joining parse action
scopedIdent = ident + OneOrMore( Literal("::").suppress() + ident )
scopedIdent.setParseAction(lambda t: "_".join(t))

print("(replace namespace-scoped names with C-compatible names)")
print(scopedIdent.transformString( testData ))
    
Beispiel #12
0
    def parse_file(self):
        """Parses an existing namelist file and creates a deck of cards to
        hold the data. After this is executed, you need to call the ``load_model()``
        method to extract the variables from this data structure."""

        # Context manager guarantees the handle is closed even if
        # readlines() raises (the original left it open on error).
        with open(self.filename, "r") as infile:
            data = infile.readlines()

        # Lots of numerical tokens for recognizing various kinds of numbers
        digits = Word(nums)
        dot = "."
        sign = oneOf("+ -")
        ee = CaselessLiteral("E") | CaselessLiteral("D")

        num_int = ToInteger(Combine(Optional(sign) + digits))

        num_float = ToFloat(
            Combine(
                Optional(sign)
                + ((digits + dot + Optional(digits)) | (dot + digits))
                + Optional(ee + Optional(sign) + digits)
            )
        )

        # special case for a float written like "3e5"
        mixed_exp = ToFloat(Combine(digits + ee + Optional(sign) + digits))

        # I don't suppose we need these, but just in case (plus it's easy)
        nan = ToFloat(oneOf("NaN Inf -Inf"))

        numval = num_float | mixed_exp | num_int | nan
        strval = QuotedString(quoteChar='"') | QuotedString(quoteChar="'")
        b_list = "T TRUE True true F FALSE False false .TRUE. .FALSE. .T. .F."
        boolval = ToBool(oneOf(b_list))
        fieldval = Word(alphanums)

        # Tokens for parsing a line of data
        numstr_token = numval + ZeroOrMore(Suppress(",") + numval) | strval
        data_token = numstr_token | boolval
        index_token = Suppress("(") + num_int + Suppress(")")

        # One "name [(index)] = [dim*] value [*dim]" assignment.
        card_token = Group(
            fieldval("name")
            + Optional(index_token("index"))
            + Suppress("=")
            + Optional(num_int("dimension") + Suppress("*"))
            + data_token("value")
            + Optional(Suppress("*") + num_int("dimension"))
        )
        multi_card_token = card_token + ZeroOrMore(Suppress(",") + card_token)
        array_continuation_token = numstr_token.setResultsName("value")
        # 2D array rows like: X(1,1) = 3,4,5  (first subscript is discarded)
        array2D_token = (
            fieldval("name")
            + Suppress("(")
            + Suppress(num_int)
            + Suppress(",")
            + num_int("index")
            + Suppress(")")
            + Suppress("=")
            + numval
            + ZeroOrMore(Suppress(",") + numval)
        )

        # Tokens for parsing the group head and tail
        group_end_token = Literal("/") | Literal("$END") | Literal("$end") | Literal("&END") | Literal("&end")
        group_name_token = (
            (Literal("$") | Literal("&"))
            + Word(alphanums).setResultsName("name")
            + Optional(multi_card_token)
            + Optional(group_end_token)
        )

        # Comment Token
        comment_token = Literal("!")

        # Loop through each line and parse.

        current_group = None
        for line in data:
            line_base = line
            line = line.strip()

            # blank line: do nothing
            if not line:
                continue

            if current_group:

                # Skip comment cards
                if comment_token.searchString(line):
                    pass

                # Process ordinary cards
                elif multi_card_token.searchString(line):
                    cards = multi_card_token.parseString(line)

                    for card in cards:
                        name, value = _process_card_info(card)
                        self.cards[-1].append(Card(name, value))

                # Catch 2D arrays like -> X(1,1) = 3,4,5
                elif array2D_token.searchString(line):
                    card = array2D_token.parseString(line)

                    name = card[0]
                    index = card[1]
                    value = array(card[2:])

                    # index > 1 means a continuation row: stack it under the
                    # previous card's value instead of starting a new card.
                    if index > 1:
                        old_value = self.cards[-1][-1].value
                        new_value = vstack((old_value, value))
                        self.cards[-1][-1].value = new_value
                    else:
                        self.cards[-1].append(Card(name, value))

                # Arrays can be continued on subsequent lines
                # The value of the most recent card must be turned into an
                # array and appended
                elif array_continuation_token.searchString(line):
                    card = array_continuation_token.parseString(line)

                    if len(card) > 1:
                        element = array(card[0:])
                    else:
                        element = card.value

                    if isinstance(self.cards[-1][-1].value, ndarray):
                        new_value = append(self.cards[-1][-1].value, element)
                    else:
                        new_value = array([self.cards[-1][-1].value, element])

                    self.cards[-1][-1].value = new_value

                # Lastly, look for the group footer
                elif group_end_token.searchString(line):
                    current_group = None

                # Everything else must be a pure comment

                # Group ending '/' can also conclude a data line.
                if line[-1] == "/":
                    current_group = None

            else:
                group_name = group_name_token.searchString(line)

                # Group Header
                if group_name:
                    group_name = group_name_token.parseString(line)
                    current_group = group_name.name
                    self.add_group(current_group)

                    # Sometimes, variable definitions are included on the
                    # same line as the namelist header
                    if len(group_name) > 2:
                        cards = group_name[2:]

                        for card in cards:
                            # Sometimes an end card is on the same line.
                            if group_end_token.searchString(card):
                                current_group = None
                            else:
                                name, value = _process_card_info(card)
                                self.cards[-1].append(Card(name, value))

                # If there is an ungrouped card at the start, take it as the
                # title for the analysis
                elif len(self.cards) == 0 and self.title == "":
                    self.title = line

                # All other ungrouped cards are saved as free-form (card-less)
                # groups.
                # Note that we can't lstrip because column spacing might be
                # important.
                else:
                    self.add_group(line_base.rstrip())
Beispiel #13
0
    def parse_file(self):
        """Parses an existing namelist file and creates a deck of cards to
        hold the data. After this is executed, you need to call the ``load_model()``
        method to extract the variables from this data structure."""

        # NOTE(review): a with-statement would guarantee closure if
        # readlines() raises; left as-is here.
        infile = open(self.filename, 'r')
        data = infile.readlines()
        infile.close()

        # Lots of numerical tokens for recognizing various kinds of numbers
        digits = Word(nums)
        dot = "."
        sign = oneOf("+ -")
        ee = CaselessLiteral('E') | CaselessLiteral('D')

        num_int = ToInteger(Combine(Optional(sign) + digits))

        num_float = ToFloat(
            Combine(
                Optional(sign) +
                ((digits + dot + Optional(digits)) | (dot + digits)) +
                Optional(ee + Optional(sign) + digits)))

        # special case for a float written like "3e5"
        mixed_exp = ToFloat(Combine(digits + ee + Optional(sign) + digits))

        # I don't suppose we need these, but just in case (plus it's easy)
        nan = ToFloat(oneOf("NaN Inf -Inf"))

        numval = num_float | mixed_exp | num_int | nan
        strval = QuotedString(quoteChar='"') | QuotedString(quoteChar="'")
        b_list = "T TRUE True true F FALSE False false .TRUE. .FALSE. .T. .F."
        boolval = ToBool(oneOf(b_list))
        fieldval = Word(alphanums)

        # Tokens for parsing a line of data
        numstr_token = numval + ZeroOrMore(Suppress(',') + numval) \
                   | strval
        data_token = numstr_token | boolval
        index_token = Suppress('(') + num_int + Suppress(')')

        # One "name [(index)] = [dim*] value [*dim]" assignment.
        card_token = Group(
            fieldval("name") + Optional(index_token("index")) + Suppress('=') +
            Optional(num_int("dimension") + Suppress('*')) +
            data_token("value") +
            Optional(Suppress('*') + num_int("dimension")))
        multi_card_token = (card_token +
                            ZeroOrMore(Suppress(',') + card_token))
        array_continuation_token = numstr_token.setResultsName("value")
        # 2D array rows like: X(1,1) = 3,4,5 (first subscript is discarded)
        array2D_token = fieldval("name") + Suppress("(") + \
                        Suppress(num_int) + Suppress(',') + \
                        num_int("index") + Suppress(')') + \
                        Suppress('=') + numval + \
                        ZeroOrMore(Suppress(',') + numval)

        # Tokens for parsing the group head and tail
        group_end_token = Literal("/") | \
                          Literal("$END") | Literal("$end") | \
                          Literal("&END") | Literal("&end")
        group_name_token = (Literal("$") | Literal("&")) + \
                           Word(alphanums).setResultsName("name") + \
                           Optional(multi_card_token) + \
                           Optional(group_end_token)

        # Comment Token
        comment_token = Literal("!")

        # Loop through each line and parse.

        current_group = None
        for line in data:
            line_base = line
            line = line.strip()

            # blank line: do nothing
            if not line:
                continue

            if current_group:

                # Skip comment cards
                if comment_token.searchString(line):
                    pass

                # Process ordinary cards
                elif multi_card_token.searchString(line):
                    cards = multi_card_token.parseString(line)

                    for card in cards:
                        name, value = _process_card_info(card)
                        self.cards[-1].append(Card(name, value))

                # Catch 2D arrays like -> X(1,1) = 3,4,5
                elif array2D_token.searchString(line):
                    card = array2D_token.parseString(line)

                    name = card[0]
                    index = card[1]
                    value = array(card[2:])

                    # index > 1 means a continuation row: stack it under the
                    # previous card's value instead of starting a new card.
                    if index > 1:
                        old_value = self.cards[-1][-1].value
                        new_value = vstack((old_value, value))
                        self.cards[-1][-1].value = new_value
                    else:
                        self.cards[-1].append(Card(name, value))

                # Arrays can be continued on subsequent lines
                # The value of the most recent card must be turned into an
                # array and appended
                elif array_continuation_token.searchString(line):
                    card = array_continuation_token.parseString(line)

                    if len(card) > 1:
                        element = array(card[0:])
                    else:
                        element = card.value

                    if isinstance(self.cards[-1][-1].value, ndarray):
                        new_value = append(self.cards[-1][-1].value, element)
                    else:
                        new_value = array([self.cards[-1][-1].value, element])

                    self.cards[-1][-1].value = new_value

                # Lastly, look for the group footer
                elif group_end_token.searchString(line):
                    current_group = None

                # Everything else must be a pure comment
                else:
                    print "Comment ignored: %s" % line.rstrip('\n')

                # Group ending '/' can also conclude a data line.
                if line[-1] == '/':
                    current_group = None

                #print self.cards[-1][-1].name, self.cards[-1][-1].value
            else:
                group_name = group_name_token.searchString(line)

                # Group Header
                if group_name:
                    group_name = group_name_token.parseString(line)
                    current_group = group_name.name
                    self.add_group(current_group)

                    # Sometimes, variable definitions are included on the
                    # same line as the namelist header
                    if len(group_name) > 2:
                        cards = group_name[2:]

                        for card in cards:
                            # Sometimes an end card is on the same line.
                            if group_end_token.searchString(card):
                                current_group = None
                            else:
                                name, value = _process_card_info(card)
                                self.cards[-1].append(Card(name, value))

                # If there is an ungrouped card at the start, take it as the
                # title for the analysis
                elif len(self.cards) == 0 and self.title == '':
                    self.title = line

                # All other ungrouped cards are saved as free-form (card-less)
                # groups.
                # Note that we can't lstrip because column spacing might be
                # important.
                else:
                    self.add_group(line_base.rstrip())
def restscrape(resturl, filenamersc, filenamerevsc):
    """Scrape a Yelp restaurant page (Python 2 / BeautifulSoup).

    Appends one row of restaurant data to ``filenamersc`` and one row per
    displayed review to ``filenamerevsc``. When an alternate page layout
    is detected, re-invokes itself to retry the fetch.
    """

    # Random delay between requests to avoid hammering the server.
    time.sleep(randint(2,8))
    # Read the url
    response = urllib2.urlopen(resturl)
    soup = BeautifulSoup(response.read())
    response.close()


    # Check if it is rated; unrated pages carry no usable data.
    if soup.find(itemprop="ratingValue") == None:
        return

    # Anomaly: listing exists but has no reviews at all.
    if soup.find(class_="container no-reviews") != None:
        return

    # Check if it is not the alternate version
    if soup.find(id="mapbox") != None:
        print "alt version"
        restscrape(resturl, filenamersc, filenamerevsc)
        return

    # Check if it is not an alternate version
    if soup.find(class_="friend-count miniOrange") == None:
        print "alt version rev"
        restscrape(resturl, filenamersc, filenamerevsc)
        return

#### ##    ## ########  #######  
 ##  ###   ## ##       ##     ## 
 ##  ####  ## ##       ##     ## 
 ##  ## ## ## ######   ##     ## 
 ##  ##  #### ##       ##     ## 
 ##  ##   ### ##       ##     ## 
#### ##    ## ##        #######  

    # Key Yelp information
    title = soup.find(property="og:title").get("content").encode('utf-8')
    latitude = soup.find(property="place:location:latitude").get("content")
    longitude = soup.find(property="place:location:longitude").get("content")
    rating = soup.find(itemprop="ratingValue").get("content")
    reviewCount = soup.find(itemprop="reviewCount").get_text()

    if soup.find(id="cat_display") != None:
        categories = soup.find(id="cat_display").get_text().strip()
        # Collapse internal whitespace runs to single spaces.
        categories = ' '.join(categories.split())
    else:
        categories = "None"

    # Default avatar URL means the business uploaded no photos.
    if soup.find(class_="photo-box-img")['src'] != "http://s3-media1.ak.yelpcdn.com/assets/2/www/img/5f69f303f17c/default_avatars/business_medium_square.png":
        photos = "Has photos"
    else:
        photos = "None"

    if soup.find(id="bizUrl") != None:
         URL = soup.find(id="bizUrl").get_text().strip().encode('utf-8')
    else:
        URL = "None"

    # Get Neighborhoods
    # Particularly special code because it has to be stripped from javascript script
    # Automatically strip quotes from quoted strings
    # quotedString matches single or double quotes
    neighborhood = ""
    quotedString.setParseAction(removeQuotes)

    # Define a pattern to extract the neighborhoods: entry
    neighborhoodsSpec = Literal('\"neighborhoods\":') + '[' + delimitedList(quotedString)('neighborhoods') + ']'

    for hoods in neighborhoodsSpec.searchString(soup):
        neighborhood = str(hoods.neighborhoods)


    # Yelp Interaction/Information
    if soup.find(class_="yelp-menu") != None:
        menu = "Has menu"
    else:
        menu = "None"

    if soup.find(id="opentable-reservation-actions") != None:
        reservable = "Reservable"
    else:
        reservable = "None"

    if soup.find(class_="media-story offer-detail") != None:
        deal = "Has deal"
    else:
        deal = "None"

    if soup.find(id="delivery-address-form") != None:
        yelpDelivery = "Delivery system"
    else:
        yelpDelivery = "None"

    if soup.find(id="bizSlide") != None:
        slides = "Has slides"
    else:
        slides = "None"


    # Restaurant status
    if soup.find(id="bizSupporter") != None:
        sponsor = "Sponsors"
    else:
        sponsor = "None"

    if soup.find(id="bizClaim") != None:
        claim = "Unclaimed"
    else:
        claim = "None"

    # Grey styling marks pages without Elite reviewers.
    if soup.find(style="color:#999999;") == None:
        eliteReviews = "Has Elites"
    else:
        eliteReviews = "None"


    # Restaurant attributes from attributes section
    # Attributes self-explanatory
    if soup.find(class_="attr-transit") != None:
        transit = soup.find(class_="attr-transit").get_text().strip()
    else:
        transit = "None"

    if soup.find(class_="attr-BusinessHours") != None:
        hours = soup.find('dd', class_="attr-BusinessHours").get_text()
    else:
        hours = "None"

    if soup.find(class_="attr-RestaurantsAttire") != None:
        attire = soup.find('dd', class_="attr-RestaurantsAttire").get_text()
    else:
        attire = "None"

    if soup.find(class_="attr-BusinessAcceptsCreditCards") != None:
        creditCards = soup.find('dd', class_="attr-BusinessAcceptsCreditCards").get_text()
    else:
        creditCards = "None"

    if soup.find(class_="attr-BusinessParking") != None:
        parking = soup.find('dd', class_="attr-BusinessParking").get_text()
    else:
        parking = "None"

    if soup.find(class_="attr-RestaurantsPriceRange2") != None:
        price = soup.find('dd', class_="attr-RestaurantsPriceRange2").get_text().strip()
    else:
        price = "None"

    if soup.find(class_="attr-RestaurantsGoodForGroups") != None:
        groups = soup.find('dd', class_="attr-RestaurantsGoodForGroups").get_text()
    else:
        groups = "None"

    if soup.find(class_="attr-GoodForKids") != None:
        kids = soup.find('dd', class_="attr-GoodForKids").get_text()
    else:
        kids = "None"

    if soup.find(class_="attr-RestaurantsReservations") != None:
        reservations = soup.find('dd', class_="attr-RestaurantsReservations").get_text()
    else:
        reservations = "None"

    if soup.find(class_="attr-RestaurantsDelivery") != None:
        delivery = soup.find('dd', class_="attr-RestaurantsDelivery").get_text()
    else:
        delivery = "None"

    if soup.find(class_="attr-RestaurantsTakeOut") != None:
        takeout = soup.find('dd', class_="attr-RestaurantsTakeOut").get_text()
    else:
        takeout = "None"

    if soup.find(class_="attr-RestaurantsTableService") != None:
        service = soup.find('dd', class_="attr-RestaurantsTableService").get_text()
    else:
        service = "None"

    if soup.find(class_="attr-OutdoorSeating") != None:
        outdoorSeating = soup.find('dd', class_="attr-OutdoorSeating").get_text()
    else:
        outdoorSeating = "None"

    if soup.find(class_="attr-WiFi") != None:
        wifi = soup.find('dd', class_="attr-WiFi").get_text()
    else:
        wifi = "None"

    if soup.find(class_="attr-GoodForMeal") != None:
        meals = soup.find('dd', class_="attr-GoodForMeal").get_text()
    else:
        meals = "None"

    if soup.find(class_="attr-BestNights") != None:
        bestNights = soup.find('dd', class_="attr-BestNights").get_text()
    else:
        bestNights = "None"

    if soup.find(class_="attr-HappyHour") != None:
        happyHour = soup.find('dd', class_="attr-HappyHour").get_text()
    else:
        happyHour = "None"

    if soup.find(class_="attr-Alcohol") != None:
        alcohol = soup.find('dd', class_="attr-Alcohol").get_text()
    else:
        alcohol = "None"

    if soup.find(class_="attr-Smoking") != None:
        smoking = soup.find('dd', class_="attr-Smoking").get_text()
    else:
        smoking = "None"

    if soup.find(class_="attr-CoatCheck") != None:
        coatCheck = soup.find('dd', class_="attr-CoatCheck").get_text()
    else:
        coatCheck = "None"

    if soup.find(class_="attr-NoiseLevel") != None:
        noise = soup.find('dd', class_="attr-NoiseLevel").get_text()
    else:
        noise = "None"

    if soup.find(class_="attr-GoodForDancing") != None:
        goodForDancing = soup.find('dd', class_="attr-GoodForDancing").get_text()
    else:
        goodForDancing = "None"

    if soup.find(class_="attr-Ambience") != None:
        ambience = soup.find('dd', class_="attr-Ambience").get_text()
    else:
        ambience = "None"

    if soup.find(class_="attr-HasTV") != None:
        tv = soup.find('dd', class_="attr-HasTV").get_text()
    else:
        tv = "None"

    if soup.find(class_="attr-Caters") != None:
        caters = soup.find('dd', class_="attr-Caters").get_text()
    else:
        caters = "None"

    if soup.find(class_="attr-WheelchairAccessible") != None:
        wheelchairAccessible = soup.find('dd', class_="attr-WheelchairAccessible").get_text()
    else:
        wheelchairAccessible = "None"

    if soup.find(class_="attr-DogsAllowed") != None:
        dogsAllowed = soup.find('dd', class_="attr-DogsAllowed").get_text()
    else:
        dogsAllowed = "None"


    # Append ("ab" = binary append) one row of restaurant data.
    with open(filenamersc, "ab") as filer:
        fr = csv.writer(filer)
        # Writing to CSV
        fr.writerow([resturl, title, latitude, longitude, rating, reviewCount, categories, photos, URL, neighborhood, menu, reservable, yelpDelivery, slides, sponsor, claim, eliteReviews, transit, hours, attire, creditCards, parking, price, groups, kids, reservations, deal, delivery, takeout, service, outdoorSeating, wifi, meals, bestNights, happyHour, alcohol, smoking, coatCheck, noise, goodForDancing, ambience, tv, caters, wheelchairAccessible])

########  ######## ##     ## #### ######## ##      ##  ######  
##     ## ##       ##     ##  ##  ##       ##  ##  ## ##    ## 
##     ## ##       ##     ##  ##  ##       ##  ##  ## ##       
########  ######   ##     ##  ##  ######   ##  ##  ##  ######  
##   ##   ##        ##   ##   ##  ##       ##  ##  ##       ## 
##    ##  ##         ## ##    ##  ##       ##  ##  ## ##    ## 
##     ## ########    ###    #### ########  ###  ###   ######  

    # Parsing top 40 Reviews
    reviews = soup.findAll(itemprop="review")
    for review in reviews:

        # Get user data
        if review.find(title="User is Elite") != None:
            eliteStatus = "Elite"
        else:
            eliteStatus = "None"

        friendCount = review.find(class_="friend-count miniOrange").get_text()[:-8].strip()
        reviewCount = review.find(class_="review-count miniOrange").get_text()[:-8].strip()

        # Default avatar URL means the reviewer uploaded no photo.
        if review.find(class_="photo-box-img")['src'] != "http://s3-media4.ak.yelpcdn.com/assets/2/www/img/78074914700f/default_avatars/user_small_square.png":
            userPhoto = "Has photo"
        else:
            userPhoto = "None"

        reviewInfo = review.find(class_="reviewer_info").get_text().encode('utf-8')


        # Get review data
        reviewRating = review.find(itemprop="ratingValue").get("content")
        publish = review.find(itemprop="datePublished").get("content")
        description = review.find(itemprop="description").get_text().encode('utf-8')


        # Get review attributes
        if review.find(class_="i-wrap ig-wrap-common i-camera-common-wrap badge photo-count") != None:
            reviewPix = review.find(class_="i-wrap ig-wrap-common i-camera-common-wrap badge photo-count").get_text()[:-6].strip()
        else:
            reviewPix = "None"

        if review.find(class_="i-wrap ig-wrap-common i-opentable-badge-common-wrap badge opentable-badge-marker") != None:
            reviewSeated = "Seated"
        else:
            reviewSeated = "None"

        if review.find(class_="i ig-common i-deal-price-tag-common") != None:
            reviewDeal = "Purchased Deal"
        else:
            reviewDeal = "None"

        if review.find(class_="i-wrap ig-wrap-common i-checkin-burst-blue-small-common-wrap badge checkin checkin-irregular") != None:
            reviewCheckIn = review.find(class_="i-wrap ig-wrap-common i-checkin-burst-blue-small-common-wrap badge checkin checkin-irregular").get_text()[:-14].strip()
        else:
            reviewCheckIn = "None"


        # Special Qype users lack stats
        if review.find(class_="count"):
            usefulfunnycool = review.findAll(class_="count")
            # Get useful, funny, cool statistics
            if usefulfunnycool[0].get_text() != "":
                useful = usefulfunnycool[0].get_text()
            else:
                useful = 0

            if usefulfunnycool[1].get_text() != "":
                funny = usefulfunnycool[1].get_text()
            else:
                funny = 0

            if usefulfunnycool[2].get_text() != "":
                cool = usefulfunnycool[2].get_text()
            else:
                cool = 0
        else:
            useful = 0
            funny = 0
            cool = 0

        # Append one row per review.
        with open(filenamerevsc, "ab") as filerev:
            frev = csv.writer(filerev)
            # Writing to CSV
            frev.writerow([resturl, eliteStatus, friendCount, reviewCount, userPhoto, reviewInfo, reviewRating, publish, description, reviewPix, reviewSeated, reviewDeal, reviewCheckIn, useful, funny, cool])
Beispiel #15
0
#################
print("Example of an extractor")
print("----------------------")

# simple grammar to match #define's
ident = Word(alphas, alphanums + "_")
macroDef = Literal("#define") + ident.setResultsName(
    "name") + "=" + restOfLine.setResultsName("value")
# scanString yields (tokens, start, end) triples for every match in testData
for t, s, e in macroDef.scanString(testData):
    print(t.name, ":", t.value)

# or a quick way to make a dictionary of the names and values
# (return only key and value tokens, and construct dict from key-value pairs)
# - empty ahead of restOfLine advances past leading whitespace, does implicit lstrip during parsing
macroDef = Suppress("#define") + ident + Suppress("=") + empty + restOfLine
macros = dict(list(macroDef.searchString(testData)))
print("macros =", macros)
print()

#################
print("Examples of a transformer")
print("----------------------")

# convert C++ namespaces to mangled C-compatible names
# e.g. "a::b::c" becomes "a_b_c" via the joining parse action
scopedIdent = ident + OneOrMore(Literal("::").suppress() + ident)
scopedIdent.setParseAction(lambda t: "_".join(t))

print("(replace namespace-scoped names with C-compatible names)")
print(scopedIdent.transformString(testData))

Beispiel #16
0
def check_while_true(self, code):
    """Flag infinite-loop headers written as `while (true)` or `while (1)`."""
    loop_constant = Literal("true") | Literal("1")
    grammar = Literal("while") + Literal("(") + loop_constant + Literal(")")
    # searchString returns an empty (falsy) result list when nothing matches.
    if grammar.searchString(code):
        self.add_error(label="WHILE_TRUE")
Beispiel #17
0
def check_while_true(self, code):
    """Flag loops written literally as `while (true)`."""
    while_true = (Literal("while") + Literal("(")
                  + Literal("true") + Literal(")"))
    matches = while_true.searchString(code)
    if matches:
        self.add_error(label="WHILE_TRUE")
def _flag(tag, label):
    """Return `label` when `tag` was found, the string "None" otherwise."""
    return label if tag is not None else "None"


def _attr_text(soup, cls, use_dd=True, strip=False):
    """Read the text of one restaurant-attribute row, or "None" if absent.

    Mirrors the original lookup exactly: presence is checked on ANY element
    carrying class `cls`, but (when use_dd) the text is read from the <dd>
    with that class.
    NOTE(review): if the class exists somewhere but not on a <dd>,
    soup.find('dd', ...) returns None and .get_text() raises AttributeError —
    the same latent crash the original code had; confirm whether Yelp ever
    serves such a layout before tightening this.
    """
    if soup.find(class_=cls) is None:
        return "None"
    target = soup.find('dd', class_=cls) if use_dd else soup.find(class_=cls)
    text = target.get_text()
    return text.strip() if strip else text


def _vote_count(tag):
    """Vote-widget text, or 0 when the widget is empty (Qype-imported users)."""
    text = tag.get_text()
    return text if text != "" else 0


def restscrape(resturl, filenamersc, filenamerevsc):
    """Scrape one Yelp restaurant page.

    Appends one row of restaurant data to `filenamersc` and one row per
    review shown on the page to `filenamerevsc` (both CSV, append mode).
    Returns early for unrated or review-less pages; retries recursively
    when Yelp serves an alternate page layout.
    """
    # Randomized delay so the crawl looks less bot-like.
    time.sleep(randint(2, 8))
    # Read the url
    response = urllib2.urlopen(resturl)
    soup = BeautifulSoup(response.read())
    response.close()

    # Skip pages with no rating at all.
    if soup.find(itemprop="ratingValue") is None:
        return

    # Anomaly: listed but zero reviews.
    if soup.find(class_="container no-reviews") is not None:
        return

    # Yelp sometimes serves alternate layouts; retry until the normal one
    # comes back.  NOTE(review): this recursion is unbounded — if Yelp keeps
    # serving the alternate page the function never terminates; consider a
    # retry cap.
    if soup.find(id="mapbox") is not None:
        print("alt version")
        restscrape(resturl, filenamersc, filenamerevsc)
        return

    if soup.find(class_="friend-count miniOrange") is None:
        print("alt version rev")
        restscrape(resturl, filenamersc, filenamerevsc)
        return

#### ##    ## ########  #######
##  ###   ## ##       ##     ##
##  ####  ## ##       ##     ##
##  ## ## ## ######   ##     ##
##  ##  #### ##       ##     ##
##  ##   ### ##       ##     ##
#### ##    ## ##        #######

# Key Yelp information
    title = soup.find(property="og:title").get("content").encode('utf-8')
    latitude = soup.find(property="place:location:latitude").get("content")
    longitude = soup.find(property="place:location:longitude").get("content")
    rating = soup.find(itemprop="ratingValue").get("content")
    reviewCount = soup.find(itemprop="reviewCount").get_text()

    if soup.find(id="cat_display") is not None:
        categories = soup.find(id="cat_display").get_text().strip()
        # Collapse internal runs of whitespace to single spaces.
        categories = ' '.join(categories.split())
    else:
        categories = "None"

    # Anything other than the stock avatar counts as "has photos".
    if soup.find(
            class_="photo-box-img"
    )['src'] != "http://s3-media1.ak.yelpcdn.com/assets/2/www/img/5f69f303f17c/default_avatars/business_medium_square.png":
        photos = "Has photos"
    else:
        photos = "None"

    if soup.find(id="bizUrl") is not None:
        URL = soup.find(id="bizUrl").get_text().strip().encode('utf-8')
    else:
        URL = "None"

    # Get Neighborhoods
    # Particularly special code because it has to be stripped from a
    # javascript blob: match '"neighborhoods": [ "a", "b", ... ]' with
    # pyparsing.  quotedString + removeQuotes strips the quotes for us.
    neighborhood = ""
    quotedString.setParseAction(removeQuotes)
    neighborhoodsSpec = Literal('\"neighborhoods\":') + '[' + delimitedList(
        quotedString)('neighborhoods') + ']'
    for hoods in neighborhoodsSpec.searchString(soup):
        neighborhood = str(hoods.neighborhoods)

    # Yelp interaction/information flags.
    menu = _flag(soup.find(class_="yelp-menu"), "Has menu")
    reservable = _flag(soup.find(id="opentable-reservation-actions"),
                       "Reservable")
    deal = _flag(soup.find(class_="media-story offer-detail"), "Has deal")
    yelpDelivery = _flag(soup.find(id="delivery-address-form"),
                         "Delivery system")
    slides = _flag(soup.find(id="bizSlide"), "Has slides")

    # Restaurant status.
    sponsor = _flag(soup.find(id="bizSupporter"), "Sponsors")
    claim = _flag(soup.find(id="bizClaim"), "Unclaimed")
    # Inverted test: the grey style marks pages WITHOUT elite reviewers.
    if soup.find(style="color:#999999;") is None:
        eliteReviews = "Has Elites"
    else:
        eliteReviews = "None"

    # Restaurant attributes from the attributes section.  transit is read
    # from the element itself; every other attribute is read from its <dd>.
    transit = _attr_text(soup, "attr-transit", use_dd=False, strip=True)
    hours = _attr_text(soup, "attr-BusinessHours")
    attire = _attr_text(soup, "attr-RestaurantsAttire")
    creditCards = _attr_text(soup, "attr-BusinessAcceptsCreditCards")
    parking = _attr_text(soup, "attr-BusinessParking")
    price = _attr_text(soup, "attr-RestaurantsPriceRange2", strip=True)
    groups = _attr_text(soup, "attr-RestaurantsGoodForGroups")
    kids = _attr_text(soup, "attr-GoodForKids")
    reservations = _attr_text(soup, "attr-RestaurantsReservations")
    delivery = _attr_text(soup, "attr-RestaurantsDelivery")
    takeout = _attr_text(soup, "attr-RestaurantsTakeOut")
    service = _attr_text(soup, "attr-RestaurantsTableService")
    outdoorSeating = _attr_text(soup, "attr-OutdoorSeating")
    wifi = _attr_text(soup, "attr-WiFi")
    meals = _attr_text(soup, "attr-GoodForMeal")
    bestNights = _attr_text(soup, "attr-BestNights")
    happyHour = _attr_text(soup, "attr-HappyHour")
    alcohol = _attr_text(soup, "attr-Alcohol")
    smoking = _attr_text(soup, "attr-Smoking")
    coatCheck = _attr_text(soup, "attr-CoatCheck")
    noise = _attr_text(soup, "attr-NoiseLevel")
    goodForDancing = _attr_text(soup, "attr-GoodForDancing")
    ambience = _attr_text(soup, "attr-Ambience")
    tv = _attr_text(soup, "attr-HasTV")
    caters = _attr_text(soup, "attr-Caters")
    wheelchairAccessible = _attr_text(soup, "attr-WheelchairAccessible")
    # NOTE(review): dogsAllowed is scraped but was never included in the CSV
    # row below; kept out of the row to preserve the existing output schema.
    dogsAllowed = _attr_text(soup, "attr-DogsAllowed")

    with open(filenamersc, "ab") as filer:
        fr = csv.writer(filer)
        # Writing the restaurant row to CSV.
        fr.writerow([
            resturl, title, latitude, longitude, rating, reviewCount,
            categories, photos, URL, neighborhood, menu, reservable,
            yelpDelivery, slides, sponsor, claim, eliteReviews, transit, hours,
            attire, creditCards, parking, price, groups, kids, reservations,
            deal, delivery, takeout, service, outdoorSeating, wifi, meals,
            bestNights, happyHour, alcohol, smoking, coatCheck, noise,
            goodForDancing, ambience, tv, caters, wheelchairAccessible
        ])

########  ######## ##     ## #### ######## ##      ##  ######
##     ## ##       ##     ##  ##  ##       ##  ##  ## ##    ##
##     ## ##       ##     ##  ##  ##       ##  ##  ## ##
########  ######   ##     ##  ##  ######   ##  ##  ##  ######
##   ##   ##        ##   ##   ##  ##       ##  ##  ##       ##
##    ##  ##         ## ##    ##  ##       ##  ##  ## ##    ##
##     ## ########    ###    #### ########  ###  ###   ######

# Parsing top 40 Reviews
    for review in soup.findAll(itemprop="review"):

        # Get user data.
        eliteStatus = _flag(review.find(title="User is Elite"), "Elite")

        # get_text() carries an 8-character trailing label; slice it off.
        friendCount = review.find(
            class_="friend-count miniOrange").get_text()[:-8].strip()
        # NOTE(review): this rebinds the restaurant-level reviewCount; safe
        # only because the restaurant row is already written above.
        reviewCount = review.find(
            class_="review-count miniOrange").get_text()[:-8].strip()

        # Anything other than the stock avatar counts as "has photo".
        if review.find(
                class_="photo-box-img"
        )['src'] != "http://s3-media4.ak.yelpcdn.com/assets/2/www/img/78074914700f/default_avatars/user_small_square.png":
            userPhoto = "Has photo"
        else:
            userPhoto = "None"

        reviewInfo = review.find(
            class_="reviewer_info").get_text().encode('utf-8')

        # Get review data.
        reviewRating = review.find(itemprop="ratingValue").get("content")
        publish = review.find(itemprop="datePublished").get("content")
        description = review.find(
            itemprop="description").get_text().encode('utf-8')

        # Get review attributes (badges).
        pix_cls = "i-wrap ig-wrap-common i-camera-common-wrap badge photo-count"
        if review.find(class_=pix_cls) is not None:
            # Trailing 6-character label trimmed from the badge text.
            reviewPix = review.find(class_=pix_cls).get_text()[:-6].strip()
        else:
            reviewPix = "None"

        reviewSeated = _flag(
            review.find(class_="i-wrap ig-wrap-common "
                               "i-opentable-badge-common-wrap badge "
                               "opentable-badge-marker"),
            "Seated")
        reviewDeal = _flag(
            review.find(class_="i ig-common i-deal-price-tag-common"),
            "Purchased Deal")

        checkin_cls = ("i-wrap ig-wrap-common "
                       "i-checkin-burst-blue-small-common-wrap "
                       "badge checkin checkin-irregular")
        if review.find(class_=checkin_cls) is not None:
            # Trailing 14-character label trimmed from the badge text.
            reviewCheckIn = review.find(
                class_=checkin_cls).get_text()[:-14].strip()
        else:
            reviewCheckIn = "None"

        # Special Qype users lack useful/funny/cool stats entirely.
        if review.find(class_="count"):
            usefulfunnycool = review.findAll(class_="count")
            useful = _vote_count(usefulfunnycool[0])
            funny = _vote_count(usefulfunnycool[1])
            cool = _vote_count(usefulfunnycool[2])
        else:
            useful = 0
            funny = 0
            cool = 0

        with open(filenamerevsc, "ab") as filerev:
            frev = csv.writer(filerev)
            # Writing the review row to CSV.
            frev.writerow([
                resturl, eliteStatus, friendCount, reviewCount, userPhoto,
                reviewInfo, reviewRating, publish, description, reviewPix,
                reviewSeated, reviewDeal, reviewCheckIn, useful, funny, cool
            ])