def get_sportcoat_measurements(html_description, parse_strategy='default'):
    """Parse a sportcoat listing from seller balearic1.

    Locates the "Approximate Measurements" heading in the listing HTML, walks
    up five parents to the enclosing table, and reads measurement values from
    fixed positions in that table's stripped strings.

    Parameters
    ----------
    html_description : str
        HTML of the listing description.
    parse_strategy : str
        Defaults to 'default', the only strategy this function supports.

    Returns
    -------
    m : MeasurementsCollection instance

    Raises
    ------
    UnrecognizedTemplateHTML
        If the heading string is absent, or the table holds fewer strings
        than the strategy indexes into.
    UnrecognizedMeasurement
        If building a measurement raises TypeError.
    UnsupportedParsingStrategy
        If `parse_strategy` is not 'default'.
    """
    soup = BeautifulSoup(html_description, 'html.parser')

    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would turn a missing heading into an AttributeError
    # on None further down.
    heading = soup.find(string='Approximate Measurements')
    if heading is None:
        raise UnrecognizedTemplateHTML(
            'Unable to find "Approximate Measurements" string in HTML description',
            html_string=str(soup))

    data = (
        heading  # string itself
        .parent  # enclosing <h3>
        .parent  # enclosing <td>
        .parent  # enclosing <tr>
        .parent  # enclosing <tbody>
        .parent  # enclosing <table>
    )

    if parse_strategy != 'default':
        raise UnsupportedParsingStrategy(
            'Parsing strategy <{}> is not supported for this category'.format(
                parse_strategy))

    # (attribute, index into the table's stripped strings)
    layout = (
        ('chest_flat', 2),
        ('sleeve', 4),
        ('shoulders', 6),
        ('waist_flat', 8),
        ('length', 10),
    )
    strings = list(data.stripped_strings)
    m_list = []
    try:
        for attribute, index in layout:
            m_list.append(Msmt('jacket', attribute, str2int(strings[index])))
    # Per the original note: the Measurement class raises TypeError when it
    # is handed a non-int measurement value.
    except TypeError as e:
        raise UnrecognizedMeasurement((
            'Instantiation of Measurement class ({}) by parsing html raised a TypeError, '
            'indicating the parser identified a region that it expected would be a '
            'measurement value.').format(e),
            html_string=str(data))
    # The table had fewer strings than the fixed layout expects; report it
    # the same way the sibling parsers do.
    except IndexError:
        raise UnrecognizedTemplateHTML(
            'Default parsing strategy expected a differing number of measurements than '
            'what was provided by the template.',
            html_string=str(data))

    m = MeasurementsCollection(parse_strategy=parse_strategy,
                               parse_html=str(data),
                               measurements_list=m_list)
    return m
def get_sportcoat_measurements(measurements_table_soup, parse_strategy='default'):
    """Parse a sportcoat measurements table from seller balearic1.

    Reads measurement values from fixed positions in the table's stripped
    strings and wraps them in a MeasurementsCollection.

    Parameters
    ----------
    measurements_table_soup : BeautifulSoup instance
    parse_strategy='default' : str
        Only 'default' is supported.

    Returns
    -------
    m : MeasurementsCollection instance

    Raises
    ------
    ValueError
        If `measurements_table_soup` is not a BeautifulSoup object.
    UnrecognizedMeasurement
        If building a measurement raises TypeError.
    UnrecognizedTemplateHTML
        If the table holds fewer strings than the strategy indexes into.
    UnsupportedParsingStrategy
        If `parse_strategy` is not 'default'.
    """
    logger.debug('Attempting to parse as a sportcoat')

    if not isinstance(measurements_table_soup, BeautifulSoup):
        raise ValueError('Must be given a BeautifulSoup object')

    if parse_strategy != 'default':
        raise UnsupportedParsingStrategy(
            'Parsing strategy <{}> is not supported for this category'.format(
                parse_strategy))

    logger.debug('Using parse_strategy=%s', parse_strategy)

    # (attribute, index into the table's stripped strings)
    layout = (
        ('chest_flat', 2),
        ('sleeve', 4),
        ('shoulders', 6),
        ('waist_flat', 8),
        ('length', 10),
    )
    strings = list(measurements_table_soup.stripped_strings)
    m_list = []
    try:
        for attribute, index in layout:
            m_list.append(Msmt('jacket', attribute, str2int(strings[index])))
    # Per the original note: the Measurement class raises TypeError when it
    # is handed a non-int measurement value.
    except TypeError as e:
        raise UnrecognizedMeasurement((
            'Instantiation of Measurement class ({}) by parsing html raised a TypeError, '
            'indicating the parser identified a region that it expected would be a '
            'measurement value.').format(e),
            html_string=str(measurements_table_soup))
    except IndexError:
        # The original passed str(data) here, but no `data` exists in this
        # function, so the handler itself raised NameError.
        raise UnrecognizedTemplateHTML((
            'parse_strategy={} expected a differing number of measurements than '
            'what was provided by the template.').format(parse_strategy),
            html_string=str(measurements_table_soup))

    m = MeasurementsCollection(
        parse_strategy=parse_strategy,
        parse_html=str(measurements_table_soup),
        measurements_list=m_list)
    return m
def test_parse_fn_finds_raglan(parse_fn, measurements_table_soup_raglan):
    """Check that `parse_fn` reports raglan-style measurements for the raglan table.

    The parse result must contain measurements equal (by `__dict__`) to a
    19000 'sleeve_from_armpit' and a 0 'shoulders_raglan', and must contain
    no measurement whose attribute is 'shoulders' or 'sleeve'.
    """
    parsed = parse_fn(measurements_table_soup_raglan, 'default')
    found = parsed.measurements_list

    # The expected raglan measurements must be present.
    armpit_sleeve_hits = [
        candidate.__dict__ == Msmt('jacket', 'sleeve_from_armpit', 19000).__dict__
        for candidate in found
    ]
    assert any(armpit_sleeve_hits)

    raglan_shoulder_hits = [
        candidate.__dict__ == Msmt('jacket', 'shoulders_raglan', 0).__dict__
        for candidate in found
    ]
    assert any(raglan_shoulder_hits)

    # The regular shoulder and sleeve measurements must be absent.
    shoulder_hits = [candidate.attribute == 'shoulders' for candidate in found]
    assert not any(shoulder_hits)

    sleeve_hits = [candidate.attribute == 'sleeve' for candidate in found]
    assert not any(sleeve_hits)
def dehydrated_ParseResult():
    """Build a 'pant' ParseResult and return whatever its `json()` method yields.

    The result carries the 'default' parse strategy, an empty concerns list,
    the measurements-table HTML it was supposedly parsed from, and six pant
    measurements. Presumably used as a test fixture; confirm against callers.
    """
    table_html = '<table cellpadding="0" cellspacing="0" style="font-size: inherit; ">\n<tbody><tr><td class="head" colspan="2"><h3>Approximate Measurements</h3></td></tr>\n<tr>\n<td>Across Waist</td>\n<td>15”</td></tr><tr><td>Across Hips</td>\n<td>18"</td>\n</tr>\n<tr>\n<td>Inseam</td>\n<td>28.5”\xa0</td>\n</tr>\n<tr>\n<td>Cuff Height</td>\n<td>0"</td>\n</tr><tr>\n<td>Material underneath hem</td>\n<td>2.25"</td>\n</tr><tr>\n<td>Width of hem opening</td>\n<td>9"</td>\n</tr><tr>\n<td>Rise</td>\n<td>9.75"</td></tr>\n</tbody></table>'

    # (attribute, measurement value) in the order they appear in the result.
    pant_values = (
        ('waist_flat', 15000),
        ('hips_flat', 18000),
        ('inseam', 28500),
        ('cuff_height', 0),
        ('cuff_width', 9000),
        ('rise', 9750),
    )

    result = ParseResult()
    result.clothing_type = 'pant'
    result.meta = {
        'parse_strategy': 'default',
        'concerns': [],
        'parsed_html': table_html,
    }
    result.measurements = [
        Msmt(category='pant', attribute=name, measurement_value=value)
        for name, value in pant_values
    ]
    return result.json()
def get_sweater_measurements(measurements_table_soup, parse_strategy='default'):
    """Parse a sweater measurements table from seller balearic1.

    parse_strategy='default' searches the table text for "raglan" and
    "underarm" (case-insensitive). When both are present the sleeve is
    recorded as 'sleeve_from_armpit' and a zero 'shoulders_raglan' is added;
    when neither is present, regular 'sleeve' and 'shoulders' are recorded.

    Parameters
    ----------
    measurements_table_soup : BeautifulSoup instance
    parse_strategy='default' : str
        Only 'default' is supported.

    Returns
    -------
    m : MeasurementsCollection instance

    Raises
    ------
    ValueError
        If `measurements_table_soup` is not a BeautifulSoup object.
    UnrecognizedTemplateHTML
        If exactly one of "raglan"/"underarm" is found, or the table holds
        fewer strings than the strategy indexes into.
    UnrecognizedMeasurement
        If building a measurement raises TypeError.
    UnsupportedParsingStrategy
        If `parse_strategy` is not 'default'.
    """
    logger.debug('Attempting to parse as a sweater')

    if not isinstance(measurements_table_soup, BeautifulSoup):
        raise ValueError('Must be given a BeautifulSoup object')

    if parse_strategy != 'default':
        raise UnsupportedParsingStrategy(
            'Parsing strategy <{}> is not supported for this category'.format(parse_strategy))

    logger.debug('Using parse_strategy=%s', parse_strategy)

    sweater_uses_raglan = measurements_table_soup.find(
        string=re.compile('raglan', flags=re.IGNORECASE)) is not None
    sweater_uses_underarm = measurements_table_soup.find(
        string=re.compile('underarm', flags=re.IGNORECASE)) is not None

    # The template is only understood when both markers agree.
    if sweater_uses_raglan != sweater_uses_underarm:
        raise UnrecognizedTemplateHTML((
            'Expected template text search for "raglan" and "underarm" to BOTH == True '
            'or BOTH == False. \n'
            'Found raglan={} and sweater_uses_underarm={}').format(
                sweater_uses_raglan, sweater_uses_underarm),
            html_string=str(measurements_table_soup))

    strings = list(measurements_table_soup.stripped_strings)
    m_list = []
    try:
        m_list.append(Msmt('sweater', 'chest_flat', str2int(strings[2])))
        m_list.append(Msmt('sweater', 'length', str2int(strings[8])))

        # Both flags are equal past the check above, so one test suffices.
        if sweater_uses_raglan:
            m_list.append(Msmt('sweater', 'sleeve_from_armpit', str2int(strings[4])))
            m_list.append(Msmt('sweater', 'shoulders_raglan', 0))
        else:
            m_list.append(Msmt('sweater', 'sleeve', str2int(strings[4])))
            m_list.append(Msmt('sweater', 'shoulders', str2int(strings[6])))
    # Per the original note: the Measurement class raises TypeError when it
    # is handed a non-int measurement value.
    except TypeError as e:
        raise UnrecognizedMeasurement((
            'Instantiation of Measurement class ({}) by parsing html raised a TypeError, '
            'indicating the parser identified a region that it expected would be a '
            'measurement value.').format(e),
            html_string=str(measurements_table_soup))
    except IndexError:
        raise UnrecognizedTemplateHTML((
            'parse_strategy={} expected a differing number of measurements than '
            'what was provided by the template.').format(parse_strategy),
            html_string=str(measurements_table_soup))

    m = MeasurementsCollection(
        parse_strategy=parse_strategy,
        parse_html=str(measurements_table_soup),
        measurements_list=m_list)
    return m
def get_dress_shirt_measurements(measurements_table_soup, parse_strategy='default'):
    """Parse a dress shirt measurements table from seller balearic1.

    The template does not say whether a shirt is short- or long-sleeved, so
    the sleeve is classified by length alone: >= 18000 is recorded as
    'sleeve_long', <= 13000 as 'sleeve_short', and anything in between is
    rejected.

    Parameters
    ----------
    measurements_table_soup : BeautifulSoup instance
    parse_strategy='default' : str
        Only 'default' is supported.

    Returns
    -------
    m : MeasurementsCollection instance

    Raises
    ------
    ValueError
        If `measurements_table_soup` is not a BeautifulSoup object.
    UnrecognizedMeasurement
        If the sleeve length lies strictly between 13000 and 18000, or if
        building a measurement raises TypeError.
    UnrecognizedTemplateHTML
        If the table holds fewer strings than the strategy indexes into.
    UnsupportedParsingStrategy
        If `parse_strategy` is not 'default'.
    """
    logger.debug('Attempting to parse as a dress_shirt')

    if not isinstance(measurements_table_soup, BeautifulSoup):
        raise ValueError('Must be given a BeautifulSoup object')

    if parse_strategy != 'default':
        raise UnsupportedParsingStrategy(
            'Parsing strategy <{}> is not supported for this category'.format(parse_strategy))

    logger.debug('Using parse_strategy=%s', parse_strategy)

    strings = list(measurements_table_soup.stripped_strings)
    m_list = []
    try:
        m_list.append(Msmt('shirt', 'chest_flat', str2int(strings[2])))
        m_list.append(Msmt('shirt', 'shoulders', str2int(strings[6])))

        sleeve_length = str2int(strings[4])
        if sleeve_length >= 18000:
            m_list.append(Msmt('shirt', 'sleeve_long', sleeve_length))
        elif sleeve_length <= 13000:
            m_list.append(Msmt('shirt', 'sleeve_short', sleeve_length))
        else:
            raise UnrecognizedMeasurement((
                'Expected sleeve length to be <= 13000 or >= 18000. \n'
                'Received: <{}>').format(sleeve_length),
                html_string=str(measurements_table_soup))
    # Per the original note: the Measurement class raises TypeError when it
    # is handed a non-int measurement value.
    except TypeError as e:
        raise UnrecognizedMeasurement((
            'Instantiation of Measurement class ({}) by parsing html raised a TypeError, '
            'indicating the parser identified a region that it expected would be a '
            'measurement value.').format(e),
            html_string=str(measurements_table_soup))
    except IndexError:
        raise UnrecognizedTemplateHTML((
            'parse_strategy={} expected a differing number of measurements than '
            'what was provided by the template.').format(parse_strategy),
            html_string=str(measurements_table_soup))

    m = MeasurementsCollection(
        parse_strategy=parse_strategy,
        parse_html=str(measurements_table_soup),
        measurements_list=m_list)
    return m
def get_sweater_measurements(html_description, parse_strategy='default'):
    """Parse a sweater listing from seller balearic1.

    Locates the "Approximate Measurements" heading, walks up five parents to
    the enclosing table, and reads measurements from fixed positions in the
    table's stripped strings.

    parse_strategy='default' searches the table text for "raglan" and
    "underarm" (case-insensitive). When both are present the sleeve is
    recorded as 'sleeve_from_armpit' and a zero 'shoulders_raglan' is added;
    when neither is present, regular 'sleeve' and 'shoulders' are recorded.

    Parameters
    ----------
    html_description : str
        HTML of the listing description.
    parse_strategy : str
        Defaults to 'default', the only strategy this function supports.

    Returns
    -------
    m : MeasurementsCollection instance

    Raises
    ------
    UnrecognizedTemplateHTML
        If the heading string is absent, exactly one of "raglan"/"underarm"
        is found, or the table holds fewer strings than the strategy indexes
        into.
    UnrecognizedMeasurement
        If building a measurement raises TypeError.
    UnsupportedParsingStrategy
        If `parse_strategy` is not 'default'.
    """
    soup = BeautifulSoup(html_description, 'html.parser')

    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would turn a missing heading into an AttributeError
    # on None further down.
    heading = soup.find(string='Approximate Measurements')
    if heading is None:
        raise UnrecognizedTemplateHTML(
            'Unable to find "Approximate Measurements" string in HTML description',
            html_string=str(soup))

    data = (
        heading  # string itself
        .parent  # enclosing <h3>
        .parent  # enclosing <td>
        .parent  # enclosing <tr>
        .parent  # enclosing <tbody>
        .parent  # enclosing <table>
    )

    if parse_strategy != 'default':
        raise UnsupportedParsingStrategy(
            'Parsing strategy <{}> is not supported for this category'.format(
                parse_strategy))

    sweater_uses_raglan = data.find(
        string=re.compile('raglan', flags=re.IGNORECASE)) is not None
    sweater_uses_underarm = data.find(
        string=re.compile('underarm', flags=re.IGNORECASE)) is not None

    # The template is only understood when both markers agree.
    if sweater_uses_raglan != sweater_uses_underarm:
        raise UnrecognizedTemplateHTML((
            'Expected template text search for "raglan" and "underarm" to BOTH == True '
            'or BOTH == False. \n'
            'Found raglan={} and sweater_uses_underarm={}').format(
                sweater_uses_raglan, sweater_uses_underarm),
            html_string=str(data))

    strings = list(data.stripped_strings)
    m_list = []
    try:
        m_list.append(Msmt('sweater', 'chest_flat', str2int(strings[2])))
        m_list.append(Msmt('sweater', 'length', str2int(strings[8])))

        # Both flags are equal past the check above, so one test suffices.
        if sweater_uses_raglan:
            m_list.append(
                Msmt('sweater', 'sleeve_from_armpit', str2int(strings[4])))
            m_list.append(Msmt('sweater', 'shoulders_raglan', 0))
        else:
            m_list.append(Msmt('sweater', 'sleeve', str2int(strings[4])))
            m_list.append(Msmt('sweater', 'shoulders', str2int(strings[6])))
    # Per the original note: the Measurement class raises TypeError when it
    # is handed a non-int measurement value.
    except TypeError as e:
        raise UnrecognizedMeasurement((
            'Instantiation of Measurement class ({}) by parsing html raised a TypeError, '
            'indicating the parser identified a region that it expected would be a '
            'measurement value.').format(e),
            html_string=str(data))
    except IndexError:
        raise UnrecognizedTemplateHTML(
            'Default parsing strategy expected a differing number of measurements than '
            'what was provided by the template.',
            html_string=str(data))

    m = MeasurementsCollection(parse_strategy=parse_strategy,
                               parse_html=str(data),
                               measurements_list=m_list)
    return m
def get_coat_and_jacket_measurements(html_description, parse_strategy='default'):
    """Parse a coat/jacket listing from seller balearic1.

    Locates the "Approximate Measurements" heading and walks up five parents
    to the enclosing table, then extracts measurements with one of two
    strategies:

    * 'default' reads values from fixed positions in the table's stripped
      strings.
    * 'new_v1' searches the table for a label matching each regex pattern
      and reads the value from the next <td> sibling of the label's cell.
      The waist pattern is marked required=False, so a missing waist row is
      skipped; the other patterns rely on PP's default for `required`.

    Parameters
    ----------
    html_description : str
        HTML of the listing description.
    parse_strategy : str
        Defaults to 'default'. 'new_v1' is also supported.

    Returns
    -------
    m : MeasurementsCollection instance

    Raises
    ------
    UnrecognizedTemplateHTML
        If the heading string is absent; under 'default', if the table holds
        fewer strings than expected; under 'new_v1', if a required pattern
        is not found or a matched label has no paired value cell.
    UnrecognizedMeasurement
        Under 'default', if building a measurement raises TypeError.
    UnsupportedParsingStrategy
        If `parse_strategy` is neither 'default' nor 'new_v1'.
    """
    logger.debug('Beginning coats&jackets parser')
    soup = BeautifulSoup(html_description, 'html.parser')

    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would turn a missing heading into an AttributeError
    # on None further down.
    heading = soup.find(string='Approximate Measurements')
    if heading is None:
        raise UnrecognizedTemplateHTML(
            'Unable to find "Approximate Measurements" string in HTML description',
            html_string=str(soup))

    data = (
        heading  # string itself
        .parent  # enclosing <h3>
        .parent  # enclosing <td>
        .parent  # enclosing <tr>
        .parent  # enclosing <tbody>
        .parent  # enclosing <table>
    )

    m_list = []
    logger.debug('Using <%s> parsing strategy', parse_strategy)

    if parse_strategy == 'default':
        # (attribute, index into the table's stripped strings)
        layout = (
            ('chest_flat', 2),
            ('sleeve', 4),
            ('shoulders', 6),
            ('length', 8),
        )
        strings = list(data.stripped_strings)
        try:
            for attribute, index in layout:
                m_list.append(Msmt('jacket', attribute, str2int(strings[index])))
        # Per the original note: the Measurement class raises TypeError when
        # it is handed a non-int measurement value.
        except TypeError as e:
            raise UnrecognizedMeasurement((
                'Instantiation of Measurement class ({}) by parsing html raised a TypeError, '
                'indicating the parser identified a region that it expected would be a '
                'measurement value.').format(e),
                html_string=str(data))
        except IndexError:
            raise UnrecognizedTemplateHTML(
                'Default parsing strategy expected a differing number of measurements than '
                'what was provided by the template.',
                html_string=str(data))

    elif parse_strategy == 'new_v1':
        search_for = [
            PP(attribute='sleeve',
               pattern=re.compile('sleeve', re.IGNORECASE)),
            PP(attribute='chest_flat',
               pattern=re.compile('pit to pit', re.IGNORECASE)),
            PP(attribute='shoulders',
               pattern=re.compile('shoulder seams', re.IGNORECASE)),
            PP(attribute='length',
               pattern=re.compile('length', re.IGNORECASE)),
            PP(attribute='waist_flat',
               pattern=re.compile('waist', re.IGNORECASE),
               required=False),
        ]

        for parse_param in search_for:
            parse_param.category = 'jacket'
            logger.debug('Searching template for %s, using pattern <%r>',
                         parse_param.attribute, parse_param.pattern)

            # Find the label text in the table.
            navigable_str = data.find(string=parse_param.pattern)
            if navigable_str is None:
                if parse_param.required:
                    logger.warning(
                        'get_coat_and_jacket_measurements method did not find a match '
                        'for pattern: <%r> in the template HTML.',
                        parse_param.pattern)
                    raise UnrecognizedTemplateHTML(
                        'Parser did not find an expected regex pattern: <{}> in template HTML'
                        .format(repr(parse_param.pattern)),
                        html_string=str(data))
                logger.debug('Did not find match')
                continue

            # Find the value paired with the label: the next <td> after the
            # label's own cell. A missing cell on either side is reported as
            # a template problem rather than surfacing as AttributeError.
            label_cell = navigable_str.find_parent('td')
            value_cell = (label_cell.find_next_sibling('td')
                          if label_cell is not None else None)
            msmt = value_cell.string if value_cell is not None else None
            if msmt is None:
                logger.warning(
                    'get_coat_and_jacket_measurements method could not find a '
                    'sibling measurement for <%r>', navigable_str)
                raise UnrecognizedTemplateHTML(
                    'Parser could not find accompanying measurement value for <{!r}>'
                    .format(navigable_str),
                    html_string=str(data))

            # Convert the text decimal to an integer and record it.
            measurement = Msmt(parse_param.category, parse_param.attribute,
                               str2int(msmt))
            logger.debug('Parsed out and built Measurement: <%r>', measurement)
            m_list.append(measurement)

    else:
        raise UnsupportedParsingStrategy(
            'Parsing strategy <{}> is not supported for this category'.format(
                parse_strategy))

    m = MeasurementsCollection(parse_strategy=parse_strategy,
                               parse_html=str(data),
                               measurements_list=m_list)
    return m
def get_dress_shirt_measurement(html_description, parse_strategy='default'):
    """Parse a dress shirt listing from seller balearic1.

    Locates the "Approximate Measurements" heading, walks up five parents to
    the enclosing table, and reads measurements from fixed positions in the
    table's stripped strings.

    The template does not say whether a shirt is short- or long-sleeved, so
    the sleeve is classified by length alone: >= 18000 is recorded as
    'sleeve_long', <= 13000 as 'sleeve_short', and anything in between is
    rejected.

    Parameters
    ----------
    html_description : str
        HTML of the listing description.
    parse_strategy : str
        Defaults to 'default', the only strategy this function supports.

    Returns
    -------
    m : MeasurementsCollection instance

    Raises
    ------
    UnrecognizedTemplateHTML
        If the heading string is absent, or the table holds fewer strings
        than the strategy indexes into.
    UnrecognizedMeasurement
        If the sleeve length lies strictly between 13000 and 18000, or if
        building a measurement raises TypeError.
    UnsupportedParsingStrategy
        If `parse_strategy` is not 'default'.
    """
    soup = BeautifulSoup(html_description, 'html.parser')

    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would turn a missing heading into an AttributeError
    # on None further down.
    heading = soup.find(string='Approximate Measurements')
    if heading is None:
        raise UnrecognizedTemplateHTML(
            'Unable to find "Approximate Measurements" string in HTML description',
            html_string=str(soup))

    data = (
        heading  # string itself
        .parent  # enclosing <h3>
        .parent  # enclosing <td>
        .parent  # enclosing <tr>
        .parent  # enclosing <tbody>
        .parent  # enclosing <table>
    )

    if parse_strategy != 'default':
        raise UnsupportedParsingStrategy(
            'Parsing strategy <{}> is not supported for this category'.format(
                parse_strategy))

    strings = list(data.stripped_strings)
    m_list = []
    try:
        m_list.append(Msmt('shirt', 'chest_flat', str2int(strings[2])))
        m_list.append(Msmt('shirt', 'shoulders', str2int(strings[6])))

        sleeve_length = str2int(strings[4])
        if sleeve_length >= 18000:
            m_list.append(Msmt('shirt', 'sleeve_long', sleeve_length))
        elif sleeve_length <= 13000:
            m_list.append(Msmt('shirt', 'sleeve_short', sleeve_length))
        else:
            raise UnrecognizedMeasurement((
                'Expected sleeve length to be <= 13000 or >= 18000. \n'
                'Received: <{}>').format(sleeve_length),
                html_string=str(data))
    # Per the original note: the Measurement class raises TypeError when it
    # is handed a non-int measurement value.
    except TypeError as e:
        raise UnrecognizedMeasurement((
            'Instantiation of Measurement class ({}) by parsing html raised a TypeError, '
            'indicating the parser identified a region that it expected would be a '
            'measurement value.').format(e),
            html_string=str(data))
    except IndexError:
        raise UnrecognizedTemplateHTML(
            'Default parsing strategy expected a differing number of measurements than '
            'what was provided by the template.',
            html_string=str(data))

    m = MeasurementsCollection(parse_strategy=parse_strategy,
                               parse_html=str(data),
                               measurements_list=m_list)
    return m