def log_uncommitted(state, shift, left_side_shift, right_side_shift, search_offset):
    logging.debug(
        '{0}-{1}: shift: {2:0.5f} [{3:0.5f}, {4:0.5f}], search offset: {5:0.6f}'
        .format(format_time(state["start_time"]), format_time(state["end_time"]),
                shift, left_side_shift, right_side_shift, search_offset))

def fix_near_borders(events):
    """
    We assume that all lines with diff greater than 5 * (median diff across all events) are broken
    """
    def fix_border(event_list, median_diff):
        last_ten_diff = np.median([x.diff for x in event_list[:10]], overwrite_input=True)
        diff_limit = min(last_ten_diff, median_diff)
        broken = []
        for event in event_list:
            if not 0.2 < (event.diff / diff_limit) < 5:
                broken.append(event)
            else:
                for x in broken:
                    x.link_event(event)
                return len(broken)
        return 0

    median_diff = np.median([x.diff for x in events], overwrite_input=True)

    fixed_count = fix_border(events, median_diff)
    if fixed_count:
        logging.debug('Fixing {0} border events right after {1}'.format(
            fixed_count, format_time(events[0].start)))

    fixed_count = fix_border(list(reversed(events)), median_diff)
    if fixed_count:
        logging.debug('Fixing {0} border events right before {1}'.format(
            fixed_count, format_time(events[-1].end)))

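# A quick self-contained illustration (synthetic numbers, not from any of the
# modules here) of the band check above: an event counts as broken when its
# diff falls outside 0.2x..5x of the diff limit, which is the smaller of the
# median over the first ten diffs and the global median.
def _demo_border_band():
    diffs = [0.9, 0.85, 12.0, 0.88]  # 12.0 is an outlier
    diff_limit = min(np.median(diffs[:10]), np.median(diffs))
    # only 12.0 is flagged: 12.0 / ~0.89 > 5
    return [d for d in diffs if not 0.2 < d / diff_limit < 5]
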
def split_broken_groups(groups):
    correct_groups = []
    broken_found = False
    for g in groups:
        std = np.std([e.shift for e in g])
        if std > MAX_GROUP_STD:
            logging.warning(
                u'Shift is not consistent between {0} and {1}, most likely chapters are wrong (std: {2}). '
                u'Switching to automatic grouping.'.format(
                    format_time(g[0].start), format_time(g[-1].end), std))
            correct_groups.extend(detect_groups(g))
            broken_found = True
        else:
            correct_groups.append(g)

    if broken_found:
        groups_iter = iter(correct_groups)
        correct_groups = [list(next(groups_iter))]
        for group in groups_iter:
            if abs_diff(correct_groups[-1][-1].shift, group[0].shift) >= ALLOWED_ERROR \
                    or np.std([e.shift for e in group + correct_groups[-1]]) >= MAX_GROUP_STD:
                correct_groups.append([])
            correct_groups[-1].extend(group)

    return correct_groups

def split_broken_groups(groups, min_auto_group_size):
    correct_groups = []
    broken_found = False
    for g in groups:
        std = np.std([e.shift for e in g])
        if std > MAX_GROUP_STD:
            logging.warning(
                u'Shift is not consistent between {0} and {1}, most likely chapters are wrong (std: {2}). '
                u'Switching to automatic grouping.'.format(
                    format_time(g[0].start), format_time(g[-1].end), std))
            correct_groups.extend(detect_groups(g, min_auto_group_size))
            broken_found = True
        else:
            correct_groups.append(g)

    if broken_found:
        correct_groups.sort(key=lambda g: g[0].start)
        i = 0
        while i < len(correct_groups) - 1:
            if abs_diff(correct_groups[i][-1].shift, correct_groups[i + 1][0].shift) < ALLOWED_ERROR \
                    and np.std([e.shift for e in correct_groups[i] + correct_groups[i + 1]]) < MAX_GROUP_STD:
                correct_groups[i].extend(correct_groups[i + 1])
                del correct_groups[i + 1]
            else:
                i += 1

    return correct_groups

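# Both variants of split_broken_groups above rely on an abs_diff helper that
# is not shown in this section; a one-line sketch of the assumed behavior:
def abs_diff(a, b):
    return abs(a - b)
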
def snap_groups_to_keyframes(events, chapter_times, max_ts_duration, max_ts_distance, src_keytimes,
                             dst_keytimes, src_timecodes, dst_timecodes, max_kf_distance, kf_mode):
    if not max_kf_distance:
        return

    groups = merge_short_lines_into_groups(events, chapter_times, max_ts_duration, max_ts_distance)

    if kf_mode == 'all' or kf_mode == 'shift':
        # step 1: snap events without changing their duration. Useful for some slight audio imprecision correction
        shifts = []
        times = []
        for group in groups:
            shifts.extend(
                find_keyframe_shift(group, src_keytimes, dst_keytimes, src_timecodes, dst_timecodes, max_kf_distance))
            times.extend((group[0].shifted_start, group[-1].shifted_end))

        shifts = interpolate_nones(shifts, times)
        if shifts:
            mean_shift = np.mean(shifts)
            shifts = zip(*(iter(shifts), ) * 2)

            logging.debug('Group {0}-{1} corrected by {2}'.format(
                format_time(events[0].start), format_time(events[-1].end), mean_shift))

            for group, (start_shift, end_shift) in izip(groups, shifts):
                if abs(start_shift - end_shift) > 0.001 and len(group) > 1:
                    actual_shift = min(start_shift, end_shift, key=lambda x: abs(x - mean_shift))
                    logging.warning(
                        "Typesetting group at {0} had different shift at start/end points ({1} and {2}). Shifting by {3}."
                        .format(format_time(group[0].start), start_shift, end_shift, actual_shift))
                    for e in group:
                        e.adjust_shift(actual_shift)
                else:
                    for e in group:
                        e.adjust_additional_shifts(start_shift, end_shift)

    if kf_mode == 'all' or kf_mode == 'snap':
        # step 2: snap start/end times separately
        for group in groups:
            if len(group) > 1:
                continue  # we don't snap typesetting
            start_shift, end_shift = find_keyframes_distances(
                group[0], src_keytimes, dst_keytimes, src_timecodes, max_kf_distance)
            if abs(start_shift) > 0.01 or abs(end_shift) > 0.01:
                logging.debug('Snapping {0} to keyframes, start time by {1}, end: {2}'.format(
                    format_time(group[0].start), start_shift, end_shift))
                group[0].adjust_additional_shifts(start_shift, end_shift)

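# snap_groups_to_keyframes also depends on interpolate_nones, which is not
# shown here. A minimal sketch of the assumed contract: fill None shifts by
# linear interpolation over their paired times, returning [] when nothing is
# known (assumes the times are increasing).
def interpolate_nones(values, times):
    known = [(t, v) for t, v in zip(times, values) if v is not None]
    if not known:
        return []
    known_times, known_values = zip(*known)
    return np.interp(times, known_times, known_values).tolist()
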
def get_store_list(data):
    """
    Return the store list; each store entry carries its country information.
    :rtype : [{'name':'store name', 'url':'http://...', 'city':'NEW YORK', 'country':'AUSTRALIA'}, ...]
    :param data:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<ul>\s+?<h3 class="country-name">(.+?)</h3>', html, re.S):
        sub, start, end = cm.extract_closure(html[m.start():], ur'<ul>', ur'</ul>')
        if end == 0:
            continue
        # split into per-country sections
        splits = [[m1.start(), m1.group(1)] for m1 in
                  re.finditer(ur'<h3 class="country-name">(.+?)</h3>', sub)]
        splits.append([-1, ''])
        for i in xrange(len(splits) - 1):
            # search within one country
            sub1 = sub[splits[i][0]:splits[i + 1][0]]
            country = splits[i][1].upper()
            for m1 in re.findall(ur'<li>\s*?<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+?)">'
                                 ur'(.+?)</a>,(.+?)</li>', sub1):
                store_list.append({'name': m1[1].strip(), 'url': m1[0],
                                   'city': m1[2].strip().upper(), 'country': country})
    return store_list

def fetch_countries(data):
    """
    Get the country list
    :param data:
    :return:
    """
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    country_list = []
    for m in re.findall(ur'<option value="(\d+)" class="3">(.+?)</option>', html):
        d = data.copy()
        d['country_id'] = string.atoi(m[0])
        d['country_e'] = m[1].strip().upper()
        country_list.append(d)
    return country_list

def fetch_stores(data):
    url = data['url']
    try:
        body = cm.post_data(url, {'rsp': 'json', 'country': data['country_code']})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw = json.loads(body)
    store_list = []
    for s in raw['stores']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = cm.html2plain(s['name']).strip()
        addr_list = []
        for key in ['address1', 'address2']:
            if s[key].strip() != '':
                addr_list.append(cm.reformat_addr(s[key]))
        entry[cm.addr_e] = ' '.join(addr_list)

        # r = s['region'].strip().upper()
        # m = re.search(ur'\b([A-Z]{2})\b', r)
        # if data[cm.country_e] == 'UNITED STATES' and m is not None:
        #     # United States
        #     ret = gs.look_up(m.group(1), 2)
        #     if ret is not None:
        #         r = ret['name_e']
        # entry[cm.province_e] = r

        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        entry[cm.zip_code] = s['zip'].strip()
        entry[cm.country_e] = data[cm.country_e]
        entry[cm.lat] = string.atof(s['lat'])
        entry[cm.lng] = string.atof(s['lng'])
        entry[cm.tel] = s['phone'].strip()
        entry[cm.fax] = s['fax'].strip()
        entry[cm.email] = s['emailaddress'].strip()
        entry[cm.url] = s['website'].strip()

        days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        opening = []
        if 'openingHours' in s and s['openingHours'] is not None:
            for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', s['openingHours']):
                opening.append('%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip()))
        entry[cm.hours] = ', '.join(opening)

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
            entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list

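# The openingHours field parsed above looks like a PHP-serialized array; a
# stand-alone check of the same regex on a fabricated value of that shape:
def _demo_opening_hours():
    days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    raw = u'a:2:{i:0;s:11:"10:00-19:00";i:6;s:6:"closed";}'
    # -> 'Mon: 10:00-19:00, Sun: closed'
    return ', '.join('%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip())
                     for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', raw))
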
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    tmp = []
    for m in re.finditer(ur'<h2>\s*(.+?)\s*</h2>', body):
        tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()})

def fetch_cities(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, 'shanghaivive_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    values = [{'city': u'上海', 'code': '021'},
              {'city': u'北京', 'code': '010'},
              {'city': u'成都', 'code': '028'}]
    results = []
    for v in values:
        d = data.copy()
        d['city'] = v['city']
        d['code'] = v['code']
        d['body'] = body
        results.append(d)
    return results

def fetch_countries(data):
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # handle redirect
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="country">Choose a country</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        d = data.copy()
        country_e = cm.html2plain(m[1]).strip().upper()
        ret = gs.look_up(country_e, 1)
        if ret is not None:
            country_e = ret['name_e']
        d['country_e'] = country_e
        d['province_e'] = ''
        d['url'] = data['host'] + m[0]
        country_list.append(d)
    return country_list

def get_store_list(data):
    """
    Get the store list for a city
    :param data:
    :return:
    """
    url = data['url']
    try:
        html = cm.post_data(url, {'country': data['country_id'], 'city': data['city_id'], 'recordid': -1})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    ret = []
    for m in re.findall(ur'<a href=.+?store-(\d+).+?">', html, re.S):
        store_id = string.atoi(m.strip())
        entry = dict(data)
        entry['store_id'] = store_id
        ret.append(entry)
    return ret

def get_countries(data):
    """
    Return the country list
    :rtype : [{'country_id':**, 'country':**}, ...]
    :param data:
    :return:
    """
    url = data['url']
    try:
        html = cm.post_data(url, {'country': -1, 'city': -1, 'recordid': -1})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    ret = []
    for m in re.findall(ur'<li>\s*?<a href=.+?country-(\d+).+?">(.+?)<\\/a><\\/li>', html, re.S):
        country_id = string.atoi(m[0].strip())
        country = m[1].replace(r'\r', '').replace(r'\n', '').strip().upper()
        ret.append({'country_id': country_id, 'country': country, 'url': url})
    return ret

def fetch_continents(data):
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = html.find('<select class="select_continente">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>')
    if end == 0:
        return []

    continent_list = []
    for m in re.findall(ur'<option value="(\d+)"\s*>(.+?)</option>', sub):
        d = data.copy()
        d[cm.continent_e] = m[1].strip().upper()
        d['continent_id'] = string.atoi(m[0])
        continent_list.append(d)
    return continent_list

def fetch_uk_ireland(data):
    url = 'http://www.frenchconnection.com/content/stores/united+kingdom.htm'
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    start = body.find(ur'<h3>OCEAN TERMINAL</h3>')
    body1 = body[:start]
    start2 = body.find(ur'<h3>FRENCH CONNECTION OUTLET</h3>')
    body2 = body[start + len(ur'<h3>OCEAN TERMINAL</h3>'):start2]
    body3 = body[start2 + len(ur'<h3>FRENCH CONNECTION OUTLET</h3>'):]
    tmp = []
    for m in re.finditer(ur'<h3>\s*(.+?)\s*</h3>', body1):
        tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()})

def fetch_countries(data):
    """
    Get the country list
    :param data:
    :return:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    country_list = []
    for m in re.findall(ur'<li class="Level4">\s*?<a id="_.+?" href="(.+?)">(.+?)</a>\s*?</li>', html, re.S):
        d = data.copy()
        d['country_e'] = m[1].strip().upper()
        d['url'] = data['host'] + m[0]
        country_list.append(d)
    return country_list

def fetch_countries(data):
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = html.find('<select name="country" id="inp-country"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>')
    if end == 0:
        return []

    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub):
        d = data.copy()
        d['country_code'] = m[0]
        d[cm.country_c] = m[1].strip()
        for key in [cm.country_e, cm.continent_e, cm.continent_c]:
            d[key] = ''
        ret = gs.look_up(d['country_code'], 1)
        if ret is not None:
            d[cm.country_e] = ret['name_e']
            d[cm.country_c] = ret['name_c']
            d[cm.continent_c] = ret['continent']['name_c']
            d[cm.continent_e] = ret['continent']['name_e']
        country_list.append(d)
    return country_list

def fetch(level=1, data=None, user='******', passwd=''):
    # Walk from the root node, where level == 1.
    if data is None:
        data = {'url': 'http://www.mido.cn/zh/retailer_li/POS',
                'brand_id': 10260,
                'brandname_e': u'MIDO',
                'brandname_c': u'美度'}

    global db
    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', data['brand_id']))

    url = data['url']
    try:
        data['html'] = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = fetch_countries(data)
    db.disconnect_db()
    return store_list

def get_cities(data):
    try:
        d = {'country': data['country_code'], 'city': '', 'service': -1}
        html = common.post_data(url, d)
    except Exception:
        print 'Error occurred in getting the list of countries: %s' % url
        dump_data = {'level': 1, 'time': common.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        common.dump(dump_data)
        return []

    start = html.find(u'<select id="city" name="city">')
    if start == -1:
        return []
    end = html.find(u'</select>', start)
    html = html[start:end]

    city_list = []
    for m in re.findall(ur'<option value="(.+?)">', html):
        if data['country_code'] == 'GB' and '2 davies street' in m.lower():
            continue
        elif data['country_code'] == 'RO' and '13 september street' in m.lower():
            continue
        elif 'b1603daq' in m.lower():
            continue
        else:
            city_list.append({'city_e': m, 'country_e': data['country_e'],
                              'country_code': data['country_code']})
    return city_list

def fetch_cities(data):
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    pat = re.compile(ur"if \(currlan == 'ZHT'\)\s*\{.+?\}", re.S)
    body = re.sub(pat, '', body)
    m = re.search(ur'dsy.add\("0",\[(.+?)\]', body)
    if m is None:
        cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt')
        return []
    province_list = [m1 for m1 in re.findall(ur'"(.+?)"', m.group(1))]

    city_list = []
    for m in re.findall(ur'dsy.add\("0_(\d+)",\[(.+?)\]', body):
        for m1 in re.findall(ur'"(.+?)"', m[1]):
            c = data.copy()
            c['province'] = province_list[string.atoi(m[0])]
            c['city'] = m1
            city_list.append(c)
    return city_list

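# A self-contained check of the dsy.add parsing above, run against a
# fabricated body that mimics the structure the scraper expects:
def _demo_dsy_parsing():
    body = u'dsy.add("0",["Shanghai","Beijing"]);' \
           u'dsy.add("0_0",["Huangpu","Xuhui"]);dsy.add("0_1",["Chaoyang"]);'
    provinces = re.findall(ur'"(.+?)"', re.search(ur'dsy.add\("0",\[(.+?)\]', body).group(1))
    pairs = []
    for m in re.findall(ur'dsy.add\("0_(\d+)",\[(.+?)\]', body):
        for city in re.findall(ur'"(.+?)"', m[1]):
            pairs.append((provinces[string.atoi(m[0])], city))
    # -> [(u'Shanghai', u'Huangpu'), (u'Shanghai', u'Xuhui'), (u'Beijing', u'Chaoyang')]
    return pairs
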
def fetch_cities(data):
    url = data['sel_url']
    try:
        body = cm.post_data(url, {'continent': data['continent'], 'country': data['country'],
                                  'city': '', 'page': 0})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw = json.loads(body)
    city_list = []
    for c in raw['city']:
        d = data.copy()
        d['city'] = c
        city_list.append(d)
    return city_list

def fetch_cities(data):
    country_id = data['country_id']
    try:
        html = cm.post_data(url, {'country_id': country_id})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    start = html.find('<select name="city_id" id="city_id">')
    if start == -1:
        return []
    start += len('<select name="city_id" id="city_id">')
    end = html.find('</select>', start)
    html = html[start:end]

    city_list = []
    for m in re.findall(ur'<option\s.*?value="(\d+).*?">(.*?)</option>', html):
        entry = {'city': m[1].strip().upper(), 'city_id': string.atoi(m[0])}
        entry['country'] = data['country']
        entry['country_id'] = data['country_id']
        city_list.append(entry)
    return city_list

def fetch_store_details(url, data):
    """
    Get the store details (one url may cover several stores)
    :rtype : [{}]
    :param url:
    :param data:
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s / %s' % (str(data), url)
        dump_data = {'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # there may be several stores; split them up
    sub_html = []
    for m in re.finditer(ur'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>', html):
        start = m.start() + len(m.group())
        end = html.find('</li>', start)
        sub_html.append(html[start:end])

def get_continents(data):
    """
    Return the continent list
    :rtype : [{'name':u'欧洲', 'url':'http://....'}, ...]
    :param data:
    :return:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    return [{'name': m[1], 'url': m[0]} for m in re.findall(
        ur'<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+)">(.+?)</a>', html)]

def fetch_countries(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find(u'<option value="0" selected="selected">Select a country</option>')
    if start == -1:
        return []
    end = body.find(u'</select>', start)

    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})"[^>]*>(.+?)</option>', body[start:end]):
        d = data.copy()
        # ret = gs.look_up(m[0], 1)
        d['country'] = m[1].strip()
        d['country_code'] = m[0]
        country_list.append(d)
    return country_list

def fetch_cities(data):
    url = data['url']
    try:
        body = cm.post_data(url, {'searchtype': 'normal', 'reiter_selected': 'reiter1',
                                  'country_id': data['country_code'], 'city_id': 0})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    m = re.search(ur'<option value="0"[^>]*>city</option>', body)
    if m is None:
        return []
    end = body.find(u'</select>', m.end())

    city_list = []
    for c in re.findall(ur'<option value="(.+?)"[^>]*>.+?</option>', body[m.end():end]):
        d = data.copy()
        d['city'] = c
        city_list.append(d)
    return city_list

def fetch_stores(data):
    # <h2 property="dc:title"
    url = data[cm.url]
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<h2 property="dc:title"', html):
        end = html.find('</header>', m.start())
        if end == -1:
            continue
        sub = html[m.start():end]
        m1 = re.search(ur'<a href="(.+?)">(.+?)</a></h2>', sub)
        if m1 is None:
            print 'Error: no more details for %s' % url
            continue
        d = data.copy()
        d[cm.url] = data['host'] + m1.group(1)
        d[cm.name_e] = cm.html2plain(m1.group(2)).strip()
        store_list.append(d)
    return store_list

def fetch_continents(data):
    url = data['store_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = html.find(u'<select id="continent" name="continent"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>')

    continent_list = []
    for m in re.findall(ur'<option value="(.+?)">.+?</option>', sub):
        d = data.copy()
        d['continent'] = m
        continent_list.append(d)
    return continent_list

def fetch_stores(data):
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.findall(ur'var markerContent\s*?=\s*?"(.+?)".+?'
                        ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        lat, lng = map(string.atof, [m[1], m[2]])
        cm.update_entry(entry, {cm.lat: lat, cm.lng: lng})

        sub = m[0].strip()
        m1 = re.search(ur'<b>(.+?)</b>', sub)
        if m1 is None:
            continue
        entry[cm.name_c] = m1.group(1)
        sub = sub.replace(m1.group(0), '')

        m1 = re.search(ur'聯系電話(?::|:)(.+?)<', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1)
            sub = sub.replace(m1.group(0), '<')
        sub = re.sub(ur'<img\b.*?/>', '', sub)
        entry[cm.addr_c] = cm.reformat_addr(sub)

        print '(%s/%d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_c], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list

def fetch_cities(data):
    url = data['host'] + '/ajax/esiajaxProxy.asp'
    try:
        body = cm.get_data(url, {'c': 'FF_StoreLocator2', 'm': 'getCountiesAjax', 'ws': 'ch-ch',
                                 'pid': 178, 'cid': data['country_code'], 'CT': 0})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    results = []
    for m in re.findall(ur'<li><a href="" data-value="(.+?)">', body):
        d = data.copy()
        d['city'] = m
        results.append(d)
    return results

def fetch_store_list(data):
    url = data['url']
    try:
        body = cm.post_data(url, {'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    results = []
    for m in re.finditer(ur'<td class\s*=\s*"ftd"', body):
        end = body.find('</tr>', m.start())
        sub = body[m.start():end]
        m1 = re.search(ur'<td class="ltd"><a href="(.+?)">', sub)
        if m1 is None:
            print 'Cannot find details: %s / %s' % (data['country_code'], data['city'])
        else:
            d = data.copy()
            d['url'] = data['host'] + m1.group(1)
            results.append(d)
    return results

def groups_from_chapters(events, times):
    logging.info(u'Chapter start points: {0}'.format([format_time(t) for t in times]))
    groups = [[]]
    chapter_times = iter(times[1:] + [36000000000])  # very large event at the end
    current_chapter = next(chapter_times)

    for event in events:
        if event.end > current_chapter:
            groups.append([])
            while event.end > current_chapter:
                current_chapter = next(chapter_times)
        groups[-1].append(event)

    groups = filter(None, groups)  # non-empty groups

    # check if we have any groups where every event is linked
    # for example a chapter with only comments inside
    broken_groups = [group for group in groups if not any(e for e in group if not e.linked)]
    if broken_groups:
        for group in broken_groups:
            for event in group:
                parent = event.get_link_chain_end()
                parent_group = next(group for group in groups if parent in group)
                parent_group.append(event)
            del group[:]
        groups = filter(None, groups)
        # re-sort the groups again since we might break the order when inserting linked events
        # sorting everything again is far from optimal but python sorting is very fast for sorted arrays anyway
        for group in groups:
            group.sort(key=lambda event: event.start)

    return groups

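# A tiny usage sketch for groups_from_chapters with stand-in events
# (assumptions: events only need .start/.end in seconds plus the .linked and
# get_link_chain_end() members used by the fallback branch, and format_time
# comes from the same module as the functions above).
class _FakeEvent(object):
    def __init__(self, start, end):
        self.start, self.end = start, end
        self.linked = False

    def get_link_chain_end(self):
        return self


def _demo_groups_from_chapters():
    events = [_FakeEvent(1, 2), _FakeEvent(3, 4), _FakeEvent(50, 52), _FakeEvent(60, 61)]
    groups = groups_from_chapters(events, [0, 45])  # chapters start at 0s and 45s
    return [len(g) for g in groups]  # [2, 2]: two events land in each chapter
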
def fetch_cities(data):
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, 'canali_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find(u'<nav class="countrySelector">')
    if start == -1:
        cm.dump('Error occurred in fetching country list: %s' % url, 'canali_log.txt')
        return []
    body = cm.extract_closure(body[start:], ur'<nav\b', ur'</nav>')[0]

    results = []
    for m in re.finditer(ur'<li><a href=".+?">(.+?)</a>', body):
        country = m.group(1).strip().upper()
        sub = cm.extract_closure(body[m.end():], ur'<ul\b', ur'</ul>')[0]
        for m1 in re.findall(ur'<li><a class=".+?" href="(.+?)">(.+?)</a></li>', sub):
            d = data.copy()
            d['country'] = country
            d['url'] = data['host'] + m1[0]
            d['city'] = m1[1].strip().upper()
            results.append(d)
    return results

def get_frag_countries(url):
    """
    Get country names and codes
    :rtype : [{'id':**, 'country':**}, ...]
    :param url:
    :return:
    """
    try:
        html = common.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url_fragrance
        dump_data = {'level': 1, 'time': common.format_time(), 'data': {'url': url_fragrance}, 'brand_id': brand_id}
        common.dump(dump_data)
        return [], False

    start = html.find('<select name="country" id="id_country">')
    if start == -1:
        return [], False
    sub, s, e = common.extract_closure(html[start:], ur'<select\b', ur'</select>')
    if e == 0:
        return [], False
    return [{'id': string.atoi(m[0]), 'country': m[1].strip().upper()}
            for m in re.findall(ur'<option value="(\d+)".*?>(.+?)</option>', sub)], True

def fetch_cities(data):
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, 'unode50_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    m = re.search(ur'countries\s*=\s*\{', body)
    if m is None:
        cm.dump('Error in fetching cities: %s' % url, 'unode50_log.txt')
        return []
    body = cm.extract_closure(body[m.start():], ur'\{', ur'\}')[0]
    raw = json.loads(body)

    results = []
    for key in raw:
        d = data.copy()
        d['country'] = raw[key]['name'].strip().upper()
        d['country_id'] = key
        results.append(d)
    return results

def get_store_details(data):
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']

    start = html.find(ur'<div class="storelocator-breadcrumbs">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>')
    if end == 0:
        return []
    # the last <li>...</li>
    m = re.findall(ur'<li>(.+?)</li>', sub, re.S)
    if len(m) > 0:
        entry[cm.addr_e] = cm.reformat_addr(m[-1])

    # latitude/longitude
    m = re.findall(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html)
    if len(m) > 0:
        cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])})

    m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if m is not None:
        contact_sub = m.group(1)
        pat_tel = re.compile(ur'<p class="phone">(.+?)</p>')
        m1 = re.search(pat_tel, contact_sub)
        if m1:
            entry[cm.tel] = cm.extract_tel(m1.group(1))
            contact_sub = re.sub(pat_tel, '', contact_sub)
        hours_list = [tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')]
        if 'opening hours' in hours_list[0].lower():
            del hours_list[0]
        entry[cm.hours] = ', '.join(hours_list)

    # Geo
    country = data['country']
    city = data['city']
    cm.update_entry(entry, {cm.country_e: country, cm.city_e: city})
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry

def fetch_stores(data):
    """
    Get store information
    :param data:
    :return:
    """
    url = data['post_url']
    try:
        js = json.loads(cm.post_data(url, {'country_id': data['country_id'], 'retail_city': '',
                                           'retail_type': data['retail_type']}).decode('unicode_escape'))
    except Exception:
        print 'Error occurred in getting country list: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # country_id=108&retail_city=&retail_type=retail
    # country_id=99&retail_city=&retail_type=service
    store_list = []
    for s in js:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        tmp = s['retail_name'].strip()
        if cm.is_chinese(tmp):
            entry[cm.name_c] = tmp
        else:
            entry[cm.name_e] = tmp
        entry[cm.addr_e] = s['retail_gmap'].strip()
        entry[cm.zip_code] = s['retail_zipcode'].strip()
        entry[cm.city_e] = s['retail_city'].strip().upper()
        if s['retail_email'] is not None:
            entry[cm.email] = s['retail_email'].strip()
        if s['retail_website'] is not None:
            entry[cm.url] = s['retail_website'].strip()
        if data['retail_type'] == 'retail':
            entry[cm.store_class] = 'Retail'
        else:
            entry[cm.store_class] = 'Service Center'
        entry[cm.country_e] = s['country_name'].strip().upper()
        entry[cm.continent_e] = s['continent_name'].strip().upper()
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list

def fetch_uk_home(data):
    url = 'http://www.frenchconnection.com/content/stores/united+kingdom.htm'
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

def fetch_stores(data):
    """
    Get the store details
    :rtype : [entries]
    :param data:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entries = []
    start = html.find(u'<ul class="store-list">')
    if start == -1:
        return entries
    start += len(u'<ul class="store-list">')
    end = html.find(u'</ul>', start)
    html = html[start:end]

    for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S):
        store = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        store[cm.store_type] = m1[0]
        sub_html = m1[1]
        m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html)
        if len(m2) > 0:
            store[cm.name_e] = cm.reformat_addr(m2[0])
        m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S)
        if len(m2) > 0:
            store[cm.addr_e] = cm.reformat_addr(m2[0])
        cm.update_entry(store, {cm.continent_e: data[cm.continent_e].strip().upper(),
                                cm.country_e: data[cm.country_e].strip().upper(),
                                cm.city_e: data[cm.city_e].strip().upper()})
        entry = store
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e], store[cm.continent_e])
        db.insert_record(store, 'stores')
        entries.append(store)
    return entries

def fetch_stores(data):
    """
    Get store information
    :param data:
    """
    url = data['url']
    try:
        info = json.loads(cm.get_data(url, {'tskay': data['key_term']}))
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw_list = info['shops']
    store_list = []
    for s in raw_list:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.city_e] = s['city'].strip().upper()
        entry[cm.country_e] = data['country_e'].strip().upper()
        entry[cm.name_e] = s['name'].strip()
        addr = s['address']
        entry[cm.addr_e] = addr
        terms = addr.split(',')
        if len(terms) > 1 and entry[cm.city_e] in terms[-1].strip().upper():
            country = entry['country_e']
            tmp = gs.look_up(country, 1)
            if tmp is not None:
                country = tmp['name_e']
            if country == 'JAPAN':
                # Japanese postal code
                m = re.search(ur'\d{3,}[ -\.]+?\d{3,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
            else:
                m = re.search(ur'\d{4,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
        entry[cm.tel] = s['tel']
        entry[cm.fax] = s['fax']
        entry[cm.email] = s['email']

        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list

def get_stores(data):
    # data[StoreLocator][pays]=BO
    url = data['url']
    try:
        html = cm.post_data(url, {'data[StoreLocator][pays]': data['country_code'],
                                  'data[StoreLocator][ville]': '',
                                  'data[StoreLocator][etat]': 0})
    except Exception, e:
        print 'Error occurred: %s, %s' % (url, str(e))
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

def fetch_stores(data):
    url = data['post_shops']
    param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0,
             'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0}
    try:
        html = cm.post_data(url, param)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    try:
        for store in (pq(tmp) for tmp in pq(html)('ul')):
            try:
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip()
                entry[cm.country_e] = data[cm.country_e]
                entry[cm.city_e] = data[cm.city_e]

                addr_list = []
                for term in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('li[class!="first"]')):
                    if term != '':
                        addr_list.append(term)
                tel = cm.extract_tel(addr_list[-1])
                if tel != '':
                    entry[cm.tel] = tel
                    del addr_list[-1]
                entry[cm.addr_e] = ', '.join(addr_list)

                gs.field_sense(entry)
                ret = gs.addr_sense(entry[cm.addr_e])
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)

                print '(%s/%d) Found store: %s, %s (%s, %s)' % (
                    data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
                    entry[cm.country_e], entry[cm.continent_e])
                store_list.append(entry)
                db.insert_record(entry, 'stores')
            except (IndexError, TypeError) as e:
                cm.dump(u'Error in parsing %s, %s' % (url, param), log_name)
                print traceback.format_exc()
                continue
    except Exception, e:
        print traceback.format_exc()
    return store_list

def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    if data['name'] in ('UK', 'US', 'JAPAN', 'AUSTRALIA'):
        return fetch_uk(body, data)
    else:
        return fetch_world(body, data)

def get_coordinates(url):
    try:
        html = common.get_data(url)
    except Exception:
        print 'Error occurred in retrieving the coordinates: %s' % url
        dump_data = {'level': 2, 'time': common.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        common.dump(dump_data)
        return []

    m = re.findall(ur'new google.maps.LatLng\(\s*?(-?\d+\.\d+)\s*?,\s*?(-?\d+\.\d+)\s*?\)', html)
    if len(m) > 0:
        return [string.atof(m[0][0]), string.atof(m[0][1])]
    else:
        return ['', '']

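# The coordinate regex used by get_coordinates, exercised on a fabricated
# page fragment:
def _demo_latlng_regex():
    html = u'var marker = new google.maps.LatLng( 48.8566, 2.3522 );'
    m = re.findall(ur'new google.maps.LatLng\(\s*?(-?\d+\.\d+)\s*?,\s*?(-?\d+\.\d+)\s*?\)', html)
    return [string.atof(m[0][0]), string.atof(m[0][1])]  # [48.8566, 2.3522]
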
def fetch_indv(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    tmp = []
    for m in re.finditer(ur'<h2>\s*(.+?)\s*</h2>', body):
        tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()})

def fetch_store_list(url):
    """
    Get the store list
    :rtype : store list, in the format [{'name':**, 'lat':**, 'lng':**, 'type':**, 'url':**}]
    :param url:
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # Start parsing: the data sits between 'var items' and the next 'var ' statement.
    start = html.find('var items')
    if start == -1:
        return []
    start += len('var items')
    end = html.find('var ', start)
    html = html[start:end]

    pattern = ur'\[(.+?)\]'
    store_list = []
    for m in re.findall(pattern, html, re.S):
        store_entry = {}
        m_list = re.findall(ur"'(.*)'", m)
        try:
            store_entry['name'] = cm.html2plain(m_list[0].strip())
            store_entry['type'] = m_list[2].strip()
            store_entry['url'] = m_list[4].strip()
        except IndexError:
            print 'Index error: %s' % m
        # strip the quoted content before extracting the coordinates
        m_list = re.findall(ur'(-?\d+\.\d+)', re.subn(ur"'(.*)'", '', m)[0])
        try:
            lat = string.atof(m_list[0])
            lng = string.atof(m_list[1])
            store_entry['lat'] = lat
            store_entry['lng'] = lng
        except (IndexError, ValueError):
            print 'Index error in getting coordinates: %s' % m
        # test
        # if 'hong-kong' in store_entry['url'] or 'taichung' in store_entry['url']:
        if len(store_entry.keys()) > 0:
            store_list.append(store_entry)
    return store_list

def fetch_details(data):
    url = data[cm.url]
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = data[cm.name_e]

    start = html.find(ur'<div class="field-address">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>')
    if end == 0:
        return []

    m1 = re.search(ur'<div class="locality">(.+?)</div>', sub)
    if m1 is not None:
        entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
    m1 = re.search(ur'<div class="postal-code">(.+?)</div>', sub)
    if m1 is not None:
        entry[cm.zip_code] = m1.group(1).strip()
    entry[cm.country_e] = data[cm.country_e]
    pat = re.compile(ur'<[^<>]+?>', re.S)
    entry[cm.addr_e] = cm.reformat_addr(re.sub(pat, u'\r\n', sub))

    m1 = re.search(ur'<div class="field-telephone"><a href=".+?" class="tel">(.+?)</a></div>', html)
    if m1 is not None:
        entry[cm.tel] = m1.group(1).strip()
    m1 = re.search(ur'<div class="field-opening-hours">\s*<p>(.+?)</p>\s*</div>', html, re.S)
    if m1 is not None:
        entry[cm.hours] = cm.reformat_addr(m1.group(1))
    m1 = re.search(ur'"coordinates":\[(-?\d+\.\d{4,})\s*,\s*(-?\d+\.\d{4,})\]', html)
    if m1 is not None:
        lat = string.atof(m1.group(1))
        lng = string.atof(m1.group(2))
        cm.update_entry(entry, {cm.lat: lat, cm.lng: lng})

    entry[cm.continent_e] = data[cm.continent_e]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]

def fetch_stores(data):
    url = data['store_url']
    try:
        body = cm.get_data(url, {'country': data['country'], 'city': data['city']})
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw = json.loads(body)
    store_list = []
    for item in raw['items']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country'].strip().upper()
        tmp = cm.extract_city(data['city'])[0]
        if entry[cm.country_e] == 'USA':
            entry[cm.province_e] = tmp
        else:
            entry[cm.city_e] = tmp
        gs.field_sense(entry)

        addr = cm.reformat_addr(item['address'].replace(u'\\', ''))
        addr_list = [tmp.strip() for tmp in addr.split(',')]
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)

        entry[cm.store_type] = item['shop_type']
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list

def fetch_countries(data):
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    country_list = []
    for m in re.findall(ur'<option value="[\w ]+?">(.+?)</option>', html):
        d = data.copy()
        d['country_e'] = m
        country_list.append(d)
    return country_list

def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occurred: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    result = []
    for m in re.findall(ur'<li class="store">.+?<a href="(.+?)".+?</li>', body, re.S):
        d = data.copy()
        d['url'] = m.strip()
        result.append(d)
    return result

def fetch_cities(data):
    url = data['data_url']
    try:
        body = cm.get_data(url, {'country_code': data['country'], 'toget': 'citylist'})
    except Exception:
        cm.dump('Error in fetching cities: %s, %s' % (url, data['country']), 'benetton_log.txt', False)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    results = []
    for m in re.findall(ur'<option value=\\"(.+?)\\">', body):
        d = data.copy()
        d['city'] = m.strip().upper()
        results.append(d)
    return results