Example #1
def survey(top_url, h = 3, max_state = 10000, similarity = 0.95):
    # control variables
    target_netloc = _get_netloc(top_url)
    elements = []
    result = {}
    history = {}
    cl = urlclass.fisherclassifier(urlclass.GetUrlFeatures)
    # initial element
    element = {'url' : top_url, 'step' : 0, 'referer' : '_SearchEngine'}
    elements.append(element)
    history.setdefault(top_url, 1)

    # crawl pages and extract their links
    j = 0
    while len(elements) > 0 and len(result) < max_state:
        e = elements.pop(0)
        if j % 10 == 0:
            print j, e['step'], len(elements), len(result)
        # record the URL, the step count, and the referer for each page
        if result.has_key(e['url']) == False:
            try:
                site = urlopen(e['url']).read()
                soup = BeautifulSoup(site)
                links = soup.findAll('a')
                result.setdefault(e['url'], {})
                result[e['url']].setdefault(e['step'], [])
                result[e['url']][e['step']].append(e['referer'])
                for link in links:
                    try:
                        url = link['href']
                        netloc = _get_netloc(url)
                        # if the netloc belongs to the target site, the step count is below the limit,
                        # and the URL has not been surveyed yet, consider it as a new candidate
                        if netloc.find(target_netloc) != -1 and e['step'] + 1 < h and result.has_key(url) == False:
                            exist, redirect_url = httpExists(url) # check whether the link actually resolves
                            # if the resolved URL exists and has not been seen before, queue it as a candidate
                            if exist == True and result.has_key(redirect_url) == False and history.has_key(redirect_url) == False:
                                elements.append({'url' : redirect_url, 'step' : e['step'] + 1, 'referer' : e['url']})
                                history.setdefault(redirect_url, 1)

                    except:
                        pass # skip links without an href or with an unparsable URL
            except:
                print 'Crawling ERROR', e['url']
        else:
            result.setdefault(e['url'], {})
            result[e['url']].setdefault(e['step'], [])
            result[e['url']][e['step']].append(e['referer'])
        j += 1
        

    result = find_similar(result, similarity = similarity)
    return result
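
A minimal usage sketch for the crawler above, assuming the module-level helpers it relies on (_get_netloc, httpExists, find_similar, urlclass) are importable from the same module; the seed URL is a placeholder:

# Crawl a hypothetical site up to 3 steps deep, capping the survey at 5000 states.
result = survey('http://example.com/', h = 3, max_state = 5000, similarity = 0.95)
# result maps each URL to {step: [referers]}, so we can print how each page was reached.
for url, steps in result.items():
    for step, referers in steps.items():
        print url, step, len(referers)
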
Example #2
def transition_model(data, uu_volume = 0.01, upper_limit_of_lower_dic = 30,
                     lower_criteria = 0.5, upper_criteria = 0.7, 
                     link_criteria = 0.4, img_criteria = 0.2, script_criteria = 0.2):
    # filter out junk URLs
    print "## start URL check ##"
    new_data = cleaning_url_in_data(data, 
                                cl = urlclass.fisherclassifier(urlclass.GetUrlFeatures))
    print '## end ##'
    # structure the URLs
    print "## start data structuring ##"
    structured = structured_data.get_data(new_data)
    print '## end ##'

    # decide, from the structured URLs, the granularity adopted for transition states
    print "## start state adopting ##"
    structured = structured_data.set_state(structured, uu_volume = uu_volume,
                                           upper_limit_of_lower_dic = upper_limit_of_lower_dic,
                                           lower_criteria = lower_criteria,
                                           upper_criteria = upper_criteria,
                                           link_criteria = link_criteria,
                                           img_criteria = img_criteria,
                                           script_criteria = script_criteria)
    print '## end ##'
    print '## show structured data ##'
    for h, s in structured.items():
        for state, v in s.items():
            rate = float(v['state_uu']) / float(v['netloc_uu']) if float(v['netloc_uu']) != 0 else 0
            if rate >= uu_volume:
                print state, ' : ', v['netloc_uu'], ',', v['state_uu'], ',', v['uu'], ',', v['is_state']
    print '## end ##'
    
    # relabel URLs with the adopted state labels
    print "## start relabeling URL ##"
    new_data = change_url(data = new_data, structured_data = structured) 
    print '## end ##'

    # build per-unique-user (UU) tracking data
    print "## start making tracking data ##"
    uu_base = make_uu_base_data(new_data)
    print '## end ##'

    # build the navigation transition model from the UU tracking data
    print "## start making transition model ##"
    model = make_model(data = uu_base)
    print '## end ##'

    return model
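
A sketch of how transition_model might be called, assuming the access log is a list of dicts with 'id', 'url', and 'unixtime' keys (the fields change_url reads in Example #3); the records below are invented for illustration:

# Hypothetical access-log records: 'id' is a visitor id, 'url' the requested page,
# 'unixtime' the request timestamp.
data = [
    {'id' : 'user-1', 'url' : 'http://example.com/', 'unixtime' : 1300000000},
    {'id' : 'user-1', 'url' : 'http://example.com/item/1', 'unixtime' : 1300000060},
    {'id' : 'user-2', 'url' : 'http://example.com/search?q=a', 'unixtime' : 1300000120},
]
model = transition_model(data, uu_volume = 0.01)
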
Example #3
def change_url(data, structured_data):
    from difflib import SequenceMatcher as SM
    new_data = []
    j = 0
    # collect the adopted states (and their query states) into a list
    states = []
    for h, s in structured_data.items():
        for state, v in s.items():
            if v['is_state']:
                states.append(state)
                if len(v['query_state']) > 0:
                    for qs in v['query_state']:
                        states.append(qs)
                
    # relabel each logged URL
    cl = urlclass.fisherclassifier(urlclass.GetUrlFeatures)
    m = SM()
    for log in data:
        if j % 10000 == 0:
            print 'read: ', j, ', ', len(data)
        j += 1
        # get the predicted label from the classifier
        label = cl.classify(log['url'])
        if label == None:
            m.set_seq1(log['url'])
            match_ratio = {'state' : None, 'match_ratio' : None}
            for state in states:
                m.set_seq2(state)
                if match_ratio['state'] == None or m.ratio() > match_ratio['match_ratio']: 
                    match_ratio['state'] = state
                    match_ratio['match_ratio'] = m.ratio()
            new_data.append({'id' : log['id'], 'url' : match_ratio['state'], 'unixtime' : log['unixtime']})
            cl.train(log['url'], match_ratio['state'])
            cl.setminimum(match_ratio['state'], min = 0.8)
        else:
            new_data.append({'id' : log['id'], 'url' : label, 'unixtime' : log['unixtime']})

    return new_data
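
The fallback branch above matches an unclassified URL to the most similar known state string via difflib's SequenceMatcher. A standalone sketch of just that matching step, with made-up state labels:

from difflib import SequenceMatcher as SM

# Made-up state labels; in change_url these come from structured_data.
states = ['http://example.com/item/', 'http://example.com/search', 'http://example.com/cart']
url = 'http://example.com/item/42'

m = SM()
m.set_seq1(url)                     # seq1 stays fixed while we scan the candidate states
best_state, best_ratio = None, None
for state in states:
    m.set_seq2(state)
    if best_state is None or m.ratio() > best_ratio:
        best_state, best_ratio = state, m.ratio()
print best_state, best_ratio        # expected to pick the '/item/' state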