Exemple #1
0
def get_vaname(query,verbose=False,debug=False):
    """Guess a name for *query* from the text of a Google result page.

    The page returned by ``get_google_content`` is lower-cased, run through
    ``htmltool.decode_entity`` / ``remove_tags`` / ``clean_tags`` and the
    module helpers ``separate``, ``replace`` and ``cjk.half2full``, then cut
    into ``"||"``-separated fragments.  Every distinct fragment gets the
    weight ``occurrences * len(fragment)``, multiplied by 10 when
    ``cjk.contain_cjk`` is true for it.

    Args:
        query: search string handed to ``get_google_content``.
        verbose: when true, print every ``weight:fragment`` pair via
            ``printu``, lightest first.
        debug: forwarded to ``get_google_content``; when true the
            tag-stripped page text is also printed via ``printu``.

    Returns:
        The longest candidate among the (at most five) heaviest fragments
        that contain the single heaviest fragment, each with its trailing
        run of word characters, spaces and dots removed.  An empty string
        when the page yields no non-empty fragment.
    """
    html=get_google_content(query,debug=debug)
    html=html.lower()
    html=htmltool.decode_entity(html)
    html=htmltool.remove_tags(html,repl="||")
    html=htmltool.clean_tags(html,repl="||")
    if debug:
        printu(html)

    html=separate(html,repl="||")
    html=replace(html,"")
    html=cjk.half2full(html)

    # Strip each piece once and keep only the non-empty ones.
    fragments=[piece for piece in (raw.strip() for raw in html.split("||")) if piece]
    if not fragments:
        # Nothing to rank; indexing the empty ranking below would raise IndexError.
        return ""

    counts=Counter(fragments)
    weighted=[(text,count*len(text)*(10 if cjk.contain_cjk(text) else 1)) for text,count in counts.items()]
    weighted.sort(key=lambda pair:pair[1],reverse=True)  # heaviest first

    if verbose:
        # Dump the (weight, fragment) list in ascending weight order.
        for text,weight in sorted(weighted,key=lambda pair:pair[1]):
            printu("%-6d:%s"%(weight,text))

    heaviest=weighted[0][0]
    # Only the top five are considered; the heaviest fragment always contains
    # itself, so the candidate list is never empty here.
    candidates=[re.sub(r"[\w \.]+$","",text) for text,weight in weighted[:5] if heaviest in text]
    return max(candidates,key=len)
Exemple #2
0
def get_vapic(keyword,path=os.path.abspath(os.path.curdir.decode()),num=3,height=700,width=500,verbose=False,debug=False):
    """Download up to *num* large-enough ``.jpg`` images found for *keyword*.

    The page returned by ``get_google_content_pic_search`` is scanned for
    ``imgurl=...jpg`` parameters.  Each URL is fetched with
    ``get_web_content_with_cache``, decoded with ``get_image_from_buff``,
    and written under *path* using the URL's basename as the file name
    (an existing file of the same name is overwritten).

    Args:
        keyword: search string handed to ``get_google_content_pic_search``.
        path: directory the images are written to.  The default is computed
            once, when the function is defined, from the current directory.
        num: number of images to save before stopping.
        height: minimum accepted value of ``image.size[0]``.
        width: minimum accepted value of ``image.size[1]``.
        verbose: when true, print each URL tried and any per-URL error.
        debug: forwarded to ``get_google_content_pic_search``; when true the
            tag-stripped page text is also printed via ``printu``.

    Returns:
        None.  One line is printed via ``printu`` for every image saved.

    NOTE(review): the thresholds were previously hard-coded as
    ``size[0] < 700 or size[1] < 500`` and the two parameters were ignored.
    They are wired in so that a call with default arguments behaves exactly
    as before.  If ``get_image_from_buff`` returns a PIL image, ``size`` is
    ``(width, height)`` and the two names are effectively transposed --
    confirm against that helper before relying on non-default values.
    """
    html=get_google_content_pic_search(keyword,debug=debug)
    # Case is left untouched: the extracted URLs are fetched verbatim below.
    html=htmltool.decode_entity(html)
    html=htmltool.remove_tags(html,repl="||")
    if debug:
        printu(html)

    # The dot before "jpg" is escaped so only a real ".jpg" suffix matches.
    imgurls=re.findall(r"imgurl=([^&]*?\.jpg)", html, flags=re.I)
    for url in imgurls:
        if num == 0:
            break

        if verbose:
            printu("try url: %s"%url)

        try:
            ## get image from internet
            content=get_web_content_with_cache(url)
            if not content:
                continue

            ## check image size
            image=get_image_from_buff(content)
            if image.size[0] < height or image.size[1] < width:
                continue ## skip small image

            ## save image to path
            filepath=os.path.join(path,os.path.basename(url))
            with open(filepath,"wb+") as f:
                f.write(content)
            # Count and report only after the write succeeded and the file is closed.
            num=num-1
            printu("(%4d,%4d)[%-32s] <= [%s]"%(image.size[0],image.size[1],os.path.relpath(filepath),url))
        except Exception as e:
            # Best effort: a failing URL must not abort the remaining downloads.
            if verbose:
                printu("Error:%s: %s: %s"%(type(e),str(e),url))