コード例 #1
0
ファイル: guess_by_tag.py プロジェクト: white812/auto_scraper
def best_match(simple_guess_dict, path, guesses, guesses_value_path_dict):
    """Record which key of ``simple_guess_dict`` best matches ``path``.

    Every candidate string stored under every key is compared against the
    last ``/``-separated segment of ``path`` using
    ``util.longestSubstringFinder``; the score is the length of whatever that
    helper returns (presumably the longest common substring -- confirm in
    ``util``).  The key owning the highest-scoring candidate wins; on a tie
    the key encountered first is kept.

    Args:
        simple_guess_dict: mapping of guess key -> iterable of candidate
            strings.
        path: '/'-separated path string; only its last segment is compared.
        guesses: mapping updated in place via ``guesses[path].add(key)``.
        guesses_value_path_dict: mapping updated in place via
            ``guesses_value_path_dict[key].add(path)``.

    Returns:
        None.  Both mappings are mutated only when some candidate scores
        above zero; otherwise they are left untouched.

    Note:
        ``guesses[path]`` and ``guesses_value_path_dict[key]`` are indexed
        directly, so they must already yield an object with ``.add`` (e.g. a
        ``defaultdict(set)`` -- assumption, verify against the callers).
    """
    from util import longestSubstringFinder

    # The compared segment is the same for every candidate, so derive it once
    # instead of re-splitting the path inside the inner loop.
    last_segment = path.rsplit('/', 1)[-1]

    max_match = 0
    best_guess = None
    for key, candidates in simple_guess_dict.items():
        for possible in candidates:
            match_len = len(longestSubstringFinder(last_segment, possible))
            # Strict '>' keeps the first key that reaches the maximum score.
            if match_len > max_match:
                best_guess = key
                max_match = match_len

    if max_match > 0:
        guesses[path].add(best_guess)
        guesses_value_path_dict[best_guess].add(path)
コード例 #2
0
def _codes_match(xpath_from_code, xpath_to_code, from_candidate, to_candidate):
    """Return True when each code xpath overlaps its paired candidate xpath."""
    return (len(longestSubstringFinder(xpath_from_code, from_candidate)) > 0
            and len(longestSubstringFinder(xpath_to_code, to_candidate)) > 0)


def _decorated_pair(xpath_from_code, xpath_to_code, first, second, prefer_swapped):
    """Return decorated (from, to) xpaths for the first matching orientation, else None."""
    straight = (first, second)
    swapped = (second, first)
    orientations = (swapped, straight) if prefer_swapped else (straight, swapped)
    for from_xpath, to_xpath in orientations:
        if _codes_match(xpath_from_code, xpath_to_code, from_xpath, to_xpath):
            return (inline_xpath_decoration(from_xpath),
                    inline_xpath_decoration(to_xpath))
    return None


def get_inline_xpath_for_datetime(xpath_from_code, xpath_to_code, dates, times, segment_path):
    """Pair date (and optionally time) xpaths with the "from" and "to" codes.

    Each entry of ``dates`` / ``times`` has ``segment_path`` replaced by
    ``'.'``.  The two resulting xpaths are then assigned to the "from" and
    "to" side by checking, with ``longestSubstringFinder``, which orientation
    overlaps ``xpath_from_code`` and ``xpath_to_code`` respectively.  Matched
    xpaths are passed through ``inline_xpath_decoration`` before being
    returned.

    Args:
        xpath_from_code: xpath string associated with the "from" side.
        xpath_to_code: xpath string associated with the "to" side.
        dates: sequence of exactly two date xpath strings.
        times: ``None``, or a sequence of exactly two time xpath strings.
        segment_path: substring replaced by ``'.'`` in every date/time xpath.

    Returns:
        A 2-tuple ``(from_xpath, to_xpath)``.  When ``times`` is given, each
        element is ``date + '+' + time``.  ``(None, None)`` is returned when
        ``dates`` or ``times`` does not hold exactly two entries, when no
        orientation matches, or (dates-and-times case) when any decorated
        part is the empty string.
    """
    if len(dates) != 2:
        return None, None

    xpath_dates = [date.replace(segment_path, '.') for date in dates]

    if times is None:
        # Date-only case: the straight orientation is tried first.
        pair = _decorated_pair(xpath_from_code, xpath_to_code,
                               xpath_dates[0], xpath_dates[1],
                               prefer_swapped=False)
        # Always hand back a 2-tuple so callers can unpack the result even
        # when neither orientation matched.
        if pair is None:
            return None, None
        return pair

    if len(times) != 2:
        return None, None

    xpath_times = [time_xpath.replace(segment_path, '.') for time_xpath in times]

    # In the date-and-time case the swapped orientation takes precedence when
    # both orientations overlap, unlike the date-only case above.
    date_pair = _decorated_pair(xpath_from_code, xpath_to_code,
                                xpath_dates[0], xpath_dates[1],
                                prefer_swapped=True)
    time_pair = _decorated_pair(xpath_from_code, xpath_to_code,
                                xpath_times[0], xpath_times[1],
                                prefer_swapped=True)
    if date_pair is None or time_pair is None:
        return None, None

    date_from_str, date_to_str = date_pair
    time_from_str, time_to_str = time_pair
    # An empty decorated part is treated the same as "no match".
    if any(part == '' for part in (date_from_str, date_to_str,
                                   time_from_str, time_to_str)):
        return None, None

    return date_from_str + '+' + time_from_str, date_to_str + '+' + time_to_str