Exemple #1
0
def work2JPG(filename, isPng = False):
    """Re-encode an image in place and wrap it in an 'MNG' container file.

    The image is first rewritten at quality 90 by the external converter
    (``convertBin``).  It is then copied into a temporary file as the
    ``b'MNG'`` magic followed by whatever ``FileHelper.writeWithSize`` emits
    for the image (presumably a size prefix plus the bytes - confirm against
    ``FileHelper``).  On success the temporary file replaces the original.

    Args:
        filename: Path of the image to convert.
        isPng: Unused; kept so existing callers keep working.

    Returns:
        5 when the file was packed and replaced, 2 when packing failed.
    """
    filepath = FileHelper.realpath(filename)
    filedir = FileHelper.dirname(filepath)
    name = FileHelper.basename(filepath)

    # The converter is invoked with ``tpDir`` as working directory.
    os.chdir(tpDir)

    jpgCMD = """%s -quality 90 %s %s """ % (convertBin, filepath, filepath)
    os.system(jpgCMD)

    # The temporary file lives next to the source and is named after the md5
    # of the source's base name.
    tmpfilename = FileHelper.join(filedir, hashlib.md5(name.encode('utf-8')).hexdigest())

    isSuccess = True
    with open(tmpfilename, 'wb+') as tmpFile:
        try:
            tmpFile.write(b'MNG')
            FileHelper.writeWithSize(tmpFile, filepath)
        except Exception:
            Log.printDetailln ("error00 !!!", filename, "cannot convert.")
            isSuccess = False

    if not isSuccess:
        # Do not leave the half-written temporary file behind (same cleanup
        # as work_file_ETC performs on failure).
        FileHelper.remove(tmpfilename)
        return 2

    FileHelper.remove(filepath)
    FileHelper.rename(tmpfilename, filepath)
    return 5
Exemple #2
0
 def __init__(self, language='en'):
     """Bind the stop-word set for ``language`` to this instance.

     The word list is read from the ``text/stopwords-<language>.txt``
     resource the first time a language is requested; afterwards it is
     served from ``self._cached_stop_words``.
     """
     resource = os.path.join('text', 'stopwords-%s.txt' % language)
     cache = self._cached_stop_words
     if language not in cache:
         raw_text = FileHelper.loadResourceFile(resource)
         cache[language] = set(raw_text.splitlines())
     self.STOP_WORDS = cache[language]
     self.language = language
def extract_all_links(infile, outfile, base_url=None, mode='a'):
    '''extract links as train and test samples'''
    content = FileHelper.readUTF8File(infile)
    if content is None:
        return None

    docstring = content.lower()
    doctree = HtmlHelper.create_doc(docstring, 'utf-8')

    if base_url is None:
        try:
            base_url = content.splitlines()[0]
        except IndexError or AttributeError:
            return None

    if doctree is None: return None

    doctree = HtmlHelper.pre_process_domtree(doctree)
    if doctree is None:
        return None
    url_items = []
    for item in get_link_word_by_pair(docstring, base_url, None): url_items.append(item)

    with open(outfile, mode) as fw:
        for item in url_items:
            anchor = item['anchor_text']
            url = item['url']
            tmp = anchor.decode('utf-8')
            print url, anchor
            if len(tmp)>5 and isdigit(tmp):
                fw.write('%s\t%s\n' %(url, anchor))
Exemple #4
0
def create_train(src_dirs, destfile):
    with open(destfile, 'a') as fw:
        for f in os.listdir(src_dirs):
            print f
            path = src_dirs + f
            content = FileHelper.readUTF8File(path)
            doc = HtmlHelper.create_doc(content, 'utf-8')
            doc = HtmlHelper.pre_process_domtree(doc)
Exemple #5
0
def get_document(name):
    home = '/mnt/UbutunShare/graduate/DataSet/PageClassification/%s/' %name

    json_dir = home + 'json/'

    mkdirs(home, json_dir)

    history = home+'history.txt'
    visited_files = get_history(history)
    finput = open(history, 'w')


    yule = home+'yule/'
    sport = home +'sport/'
    finance = home +'finance/'
    junshi = home + 'junshi/'


    for dir in [yule, sport, finance, junshi]:
        print dir
        for f in os.listdir(dir):
            if f in visited_files:
                continue
            print '========================================'
            print 'parse file: %s ....' % f

            t = dir + f
            content=FileHelper.readUTF8File(t)
            if content is None:
                continue
            try:
                base_url=content.splitlines()[0]
            except IndexError or AttributeError:
                continue


            document=html2words(content, base_url, encoding='utf-8')
            # if document is None:
            #     os.remove(t)
            #     continue
            # div = document['para']
            #
            # if len(div)<50:
            #     os.remove(t)
            #     continue

            json_file=os.path.join(json_dir, f.split('.')[0] + '.json')
            save(document, json_file)
            finput.writelines(f + '\n')
    finput.close()
Exemple #6
0
def predict(name):
    home = '/mnt/UbutunShare/graduate/DataSet/PageClassification/%s/' %name

    temp = home+'temp/'
    mkdirs(temp)

    supervisior=MultiClassifier('test-3-topic')
    supervisior.load(path='/mnt/UbutunShare/TopicalCrawl/TopicalCrawl/classifier')

    cetd = home + 'plain-text/cetd/'
    wish_ce = home + 'plain-text/wish-ce/'
    remove_tag_ = home + 'plain-text/remove-tag/'

    history = temp+'history.txt'
    visited_files = get_history(history)
    finput = open(history, 'w')


    document_class = {}
    for dir in [cetd, wish_ce, remove_tag_]:
        class_ = '1'
        if 'cetd' in dir:class_ = '0'
        elif 'wish' in dir:class_ = '1'
        else: class_ = '2'
        print dir
        print '========================'
        document_class[class_] = {}

        tmpfile = open(temp+class_+'.txt', 'w')
        for f in os.listdir(dir):
            if f in visited_files:
                continue
            print '==============================='
            print f

            t = dir + f
            content=FileHelper.readUTF8File(t)
            if content is None:
                continue

            predicted_y, precision=supervisior.predict(content)
            tmpfile.write('%s\t%s\n' %(predicted_y, f))

            document_class[class_][f] = predicted_y
            finput.writelines(f + '\n')
        tmpfile.close()
    finput.close()
    json.dump(document_class, open(temp+'test_result.txt', 'w'))
def get_document(path):
    """Load a saved page and turn it into a document via ``html2words``.

    The first line of the file is taken as the page's base URL; the
    lower-cased text is what gets parsed and converted.

    Args:
        path: Path of the saved page.

    Returns:
        The value returned by ``html2words``, or ``None`` when the file
        cannot be read, has no first line, or cannot be parsed.
    """
    content = FileHelper.readUTF8File(path)
    if content is None:
        return None
    try:
        base_url = content.splitlines()[0]
    # ``except IndexError or AttributeError`` would only catch IndexError;
    # a tuple catches both.
    except (IndexError, AttributeError):
        return None

    docstring = content.lower()

    # The tree is built only to verify that the page is parseable.
    if HtmlHelper.create_doc(docstring, 'utf-8') is None:
        return None

    return html2words(docstring, base_url)
Exemple #8
0
def create_train_url(name):
    # src_dir = '/mnt/UbutunShare/graduate/DataSet/PageClassification/%s/' % name
    #
    # supervisior=MultiClassifier('test-6-topic')
    # supervisior.load(path='/mnt/UbutunShare/TopicalCrawl/TopicalCrawl/classifier')
    #
    # home = src_dir
    # yule = home+'yule/'
    # sport = home +'sport/'
    # finance = home +'finance/'
    # junshi = home + 'junshi/'
    #
    #
    # url_train_path=os.path.join(src_dir, 'url_train.txt')
    #
    # with open(url_train_path, 'w') as fw:
    #     for original in [yule, sport, finance, junshi]:
    src_dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/'
    supervisior=MultiClassifier('test-6-topic')
    supervisior.load(path='/mnt/UbutunShare/TopicalCrawl/TopicalCrawl/classifier')
    url_train_path=os.path.join(src_dir, 'url_train.txt')
    with open(url_train_path, 'w') as fw:
        for dir in ['qq/', 'sina/', 'ifeng/', 'sohu/']:
            original = src_dir + dir + 'original/'
            for f in os.listdir(original):
                print '========================================'
                print 'parse file: %s ....' % f
                t=os.path.join(original, f)

                content=FileHelper.readUTF8File(t)
                try:
                    if content is None:
                        continue

                    base_url=content.splitlines()[0]
                except IndexError:
                    continue

                for link in get_link_word_by_pair(content, base_url, supervisior, 'utf-8'):
                    try:
                        fw.write('%s\t%s\t%s\t%s\t%.3f\n' % (
                        link['parent_url'], link['url'], link['anchor_text'], str(link['label']),
                        link['interestness']))
                    except UnicodeDecodeError:
                        continue
Exemple #9
0
def get_page_para(name):
    home = '/mnt/UbutunShare/graduate/DataSet/PageClassification/%s/' %name

    cetd = home + 'plain-text/cetd/'
    wish_ce = home + 'plain-text/wish-ce/'
    remove_tag_ = home + 'plain-text/remove-tag/'

    mkdirs(cetd, wish_ce, remove_tag_)

    history = home+'history.txt'
    visited_files = get_history(history)
    finput = open(history, 'w')


    yule = home+'yule/'
    sport = home +'sport/'
    finance = home +'finance/'
    junshi = home + 'junshi/'


    for dir in [yule, sport, finance, junshi]:
        for f in os.listdir(dir):
            if f in visited_files:
                continue
            print '========================================'
            print 'parse file: %s ....' % f

            t = dir + f
            content=FileHelper.readUTF8File(t)
            if content is None:
                continue
            try:
                base_url=content.splitlines()[0]
            except IndexError or AttributeError:
                continue


            docstring=content.lower()
            doctree=HtmlHelper.create_doc(docstring, 'utf-8')

            if doctree is None: return None

            cetd_doc = copy.deepcopy(doctree)

            try:
                article_c = get_aricle_cetd(cetd_doc)


                doctree=HtmlHelper.pre_process_domtree(doctree)

                article_w, title = HtmlHelper.get_article(doctree)
                article_w = ' '.join([title, article_w])

                article_r = remove_tags(docstring)

                finput.writelines(f + '\n')
            except:
                continue
            f = f.split('.')[0] + '.txt'
            FileHelper.WriteInUTF8(cetd+f, article_c)
            FileHelper.WriteInUTF8(wish_ce+f, article_w)
            FileHelper.WriteInUTF8(remove_tag_+f, article_r)

    finput.close()
Exemple #10
0
def create_train_samples(name, model='test-zh-topic'):

    supervisior, original, history, tree, json_dir,\
    url, url_sample_file, url_sample_pickle, url_sample,\
    document_class, dcoument_class_file = init(name)

    visited_files = get_history(history)
    finput=open(history, 'a')

    list_file = os.listdir(original)
    num = lambda x: int(x.split('.')[0])
    list_file.sort(cmp=lambda a,b:num(a)-num(b))

    for f in list_file:
        if f in visited_files:
            continue
        print '========================================'
        print 'parse file: %s ....' % f
        t=os.path.join(original, f)

        content=FileHelper.readUTF8File(t)
        if content is None:
            continue
        try:
            base_url=content.splitlines()[0]
        except IndexError or AttributeError:
            continue

        document=html2words(content, base_url, encoding='utf-8', supervisior=supervisior)
        if document is None:
            os.remove(t)
            continue

        pure_text='\t'.join([document['title'], document['meta_descrip'], document['para']])

        predicted_y, precision=supervisior.predict(pure_text)

        tree_file=os.path.join(tree, f)
        json_file=os.path.join(json_dir, f.split('.')[0] + '.json')

        save(document, json_file, tree_file)

        key=f.split('.')[0]
        urlitems_file=os.path.join(url, key)

        cPickle.dump(document['url_items'], open(urlitems_file, 'wb'), -1)

        item=gen_url_item(base_url, document['title'], predicted_y, precision)
        url_sample[key]=item
        document_class[key]=predicted_y

        # with open(url_sample_file, 'a') as fu:
        #     fu.write('%s\t%s\t%s\t%s\t%.3f\n' % (
        #     key, encode(item['url']), encode(item['anchor_text']), item['label'], item['interestness']))


        finput.writelines(f + '\n')
    finput.close()

    # cPickle.dump(url_sample, open(url_sample_pickle, 'wb'), -1)
    json.dump(document_class, open(dcoument_class_file, 'w'))
Exemple #11
0
    # Fragment of an ad-hoc test routine (its enclosing ``def`` is not part of
    # this excerpt): load one saved page, pre-process it and print the
    # extracted article text.

    # Disabled experiment comparing two parsed documents with WISH:
    # import lxml.etree as etree
    # doc1 = etree.fromstring(html1)
    # doc2 = etree.fromstring(html2)
    #
    # root1 = ElementHelper.get_root(doc1)
    # root2 = ElementHelper.get_root(doc2)
    # w = WISH()
    # print w.similar_check(root1, root2)

    # Candidate sample files; each assignment overwrites the previous one, so
    # only the last path is actually used.
    dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/ifeng/original/other_neg_524.html'
    dir = '/mnt/UbutunShare/graduate/DataSet/PageClassification/Test1/yule/yule (55).html'
    dir = '/mnt/UbutunShare/graduate/DataSet/1.txt'
    dir = 'classifier/sample-data/1.html'
    dir = '/mnt/UbutunShare/Work/CETD_DATA/Test/original/0.htm'
    from api import HtmlHelper
    from util import FileHelper
    content = FileHelper.readUTF8File(dir)
    doc = HtmlHelper.create_doc(content, 'utf-8')
    doc = HtmlHelper.pre_process_domtree(doc)
    # The second positional argument is passed as True; its meaning is defined
    # by HtmlHelper.get_article (not visible here).
    article, title = HtmlHelper.get_article(doc, True)
    print article.encode('utf-8')









Exemple #12
0
def work_file_PVR(filename, isDTC4Module = False, isTC4 = False):
    """Compress a PNG texture with PVRTexTool and repack it as an 'MNG' blob.

    Only PNG images that are square with a power-of-two side are processed.
    The texture is encoded as ``PVRTC1_4`` or, when ``isDTC4Module`` is true,
    as ``PVRTC1_4_RGB`` plus a separately extracted alpha image.  Each part
    is then written, preceded by its size packed with ``pack("i", ...)``,
    after a ``b'MNG'`` magic into a ``.tmp`` file that finally takes the
    place of the ``.png``.

    Args:
        filename: Path of the PNG texture.
        isDTC4Module: When true, colour and alpha are stored as two parts.
        isTC4: Ignored - the value is overwritten from ``isDTC4Module``;
            kept so existing callers keep working.

    Returns:
        2 if the input is unsupported or any step fails, 3 if the packed
        file was gzipped (``isUseGzip``), 5 if it was stored without gzip.
    """
    filepath = FileHelper.realpath(filename)

    sys.stdout.flush()

    os.chdir(toolsPath)

    # NOTE(review): the ``isTC4`` argument has always been overwritten here,
    # so the mode depends on ``isDTC4Module`` alone.
    isTC4 = not isDTC4Module
    isAlphaJPG = False

    # No extra pre-processing flag is passed to PVRTexTool.
    preCMD = ""

    info = ImageInfo.size(filepath)

    # Only PNG textures are supported
    if info[0] != 'PNG':
        return 2

    width = info[1]
    height = info[2]

    # Only square power-of-two textures are supported
    if width & (width-1) != 0 or width != height:
        return 2

    pvrname = filepath.replace(".png", ".pvr")
    rgbname = filepath.replace(".png", ".pkm")
    alphaname = filepath.replace(".png", "_alpha.pkm")
    alphaJpgName = filepath.replace(".png", ".alpha.jpg")
    alphaPvrName = filepath.replace(".png", ".alpha.pvr")

    if isTC4:
        rgbCMD = """ %s -f PVRTC1_4 %s -q pvrtcbest -i %s -o %s """ % (pvrTexToolBin, preCMD, filepath, pvrname)
    else:
        rgbCMD = """ %s -f PVRTC1_4_RGB %s -q pvrtcbest -i %s -o %s """ % (pvrTexToolBin, preCMD, filepath, pvrname)
    alphaCMD = """%s %s -alpha extract %s """ % (convertBin, filepath, alphaJpgName)
    alphaJPGCMD = """ %s -f PVRTC1_4_RGB -q pvrtcbest -i %s -o %s """ % (pvrTexToolBin, alphaJpgName, alphaPvrName)

    try:
        # Remove leftovers from an earlier run.
        FileHelper.remove(rgbname)
        FileHelper.remove(alphaname)

        os.system(rgbCMD)

        if not isTC4:
            os.system(alphaCMD)
            if not isAlphaJPG:
                os.system(alphaJPGCMD)

        # os.system does not raise when the tool fails; check its output.
        if not FileHelper.exists(pvrname):
            return 2

        os.rename(pvrname, rgbname)

        if not isTC4:
            # NOTE(review): work_file_ETC picks the .alpha.jpg when
            # isAlphaJPG is true and the .alpha.pvr otherwise; the branches
            # here are the other way round.  Kept as is - confirm intent.
            if not isAlphaJPG:
                os.rename(alphaJpgName, alphaname)
            else:
                os.rename(alphaPvrName, alphaname)

            FileHelper.remove(alphaJpgName)
            FileHelper.remove(alphaPvrName)

    except Exception:
        Log.printError()
        return 2

    tmpfilename = filepath.replace(".png", ".tmp")
    FileHelper.remove(tmpfilename)

    isSuccess = True
    with open(tmpfilename, 'wb+') as tmpFile:
        try:
            tmpFile.write(b'MNG')

            # Colour part first, then (two-part mode only) the alpha part.
            if isTC4:
                partnames = [rgbname]
            else:
                partnames = [rgbname, alphaname]

            for partname in partnames:
                tmpFile.write(pack("i", os.stat(partname).st_size))
                # ``with`` closes the part file even if the copy fails.
                with open(partname, "rb") as partFile:
                    tmpFile.write(partFile.read())

            if not isSaveTransFile:
                FileHelper.remove(rgbname)
                FileHelper.remove(alphaname)

        except Exception:
            t, v = sys.exc_info()[:2]
            Log.printDetailln(t, v)
            isSuccess = False

    if not isSuccess:
        # Previously this path fell off the end and returned None; report
        # failure with 2 and drop the partial file, as work_file_ETC does.
        FileHelper.remove(tmpfilename)
        return 2

    pngname = tmpfilename.replace(".tmp", ".png")
    if isUseGzip:
        gzip_cmd = gzipBin + tmpfilename + " -n -f -9"
        os.system(gzip_cmd)
        FileHelper.remove(pngname)
        FileHelper.rename(tmpfilename + ".gz", pngname)
        return 3

    FileHelper.remove(pngname)
    FileHelper.rename(tmpfilename, pngname)
    return 5
Exemple #13
0
def work_png(filename):
    """Split a PNG into colour and alpha JPEGs and pack both as an 'MNG' blob.

    The external converter (``convertBin``) produces ``.rgb.jpg`` and
    ``.alpha.jpg`` next to the source.  Both are appended via
    ``FileHelper.writeWithSize`` after a ``b'MNG'`` magic into a ``.tmp``
    file, which then takes the place of the ``.png``.

    Returns:
        5 when the packed file replaced the PNG, 2 on failure.  (3 would be
        returned by the gzip path, which is switched off by the local flag.)
    """
    filepath = FileHelper.realpath(filename)
    filedir = FileHelper.dirname(filepath)
    os.chdir(tpDir)

    # Local switches: intermediate files are deleted, gzip is not applied.
    isSaveTransFile = False
    useGZIP = False

    rgb_part = filepath.replace(".png", ".rgb.jpg")
    alpha_part = filepath.replace(".png", ".alpha.jpg")
    rgb_cmd = """%s %s -background black %s """ % (convertBin, filepath, rgb_part)
    alpha_cmd = """%s %s -alpha extract %s """ % (convertBin, filepath, alpha_part)

    try:
        for command in (rgb_cmd, alpha_cmd):
            os.system(command)
    except Exception:
        Log.printDetailln ("error33 !!!", filename, "cannot convert.")
        return 2

    tmpfilename = filepath.replace(".png", ".tmp")
    FileHelper.remove(tmpfilename)

    packed_ok = True
    with open(tmpfilename, 'wb+') as tmpFile:
        try:
            tmpFile.write(b'MNG')
            FileHelper.writeWithSize(tmpFile, rgb_part)
            FileHelper.writeWithSize(tmpFile, alpha_part)

            if not isSaveTransFile:
                FileHelper.remove(rgb_part)
                FileHelper.remove(alpha_part)
        except Exception:
            Log.printError()
            packed_ok = False

    if not packed_ok:
        return 2

    target = tmpfilename.replace(".tmp", ".png")
    if useGZIP:
        os.system(gzipBin + tmpfilename + " -n -f -9")
        FileHelper.remove(target)
        FileHelper.rename(tmpfilename + ".gz", target)
        return 3

    FileHelper.remove(target)
    FileHelper.rename(tmpfilename, target)
    return 5
Exemple #14
0
def work_file_ETC(filename, isAlphaJPG = False, isFast = False):
    """Compress a PNG or JPG texture to ETC1 and put it in place of the source.

    PNG input: colour and alpha are encoded separately, then both parts are
    appended via ``FileHelper.writeWithSize`` after a ``b'MNG'`` magic into a
    ``.tmp`` file that takes the place of the ``.png``.
    JPG input: the image is encoded once and the resulting ``.pkm`` takes the
    place of the ``.jpg``.

    Args:
        filename: Path of the texture; must contain ``.png`` or ``.jpg``.
        isAlphaJPG: For PNG input, keep the extracted alpha JPEG as the alpha
            part instead of its ETC1-encoded version.
        isFast: Use the ``etcfast`` quality preset instead of ``etcslow``.

    Returns:
        2 on unsupported input or failure, 3 when the result was gzipped
        (``isUseGzip``), 5 for an un-gzipped PNG result, 4 for an un-gzipped
        JPG result.
    """
    filepath = FileHelper.realpath(filename)
    filedir = FileHelper.dirname(filepath)

    sys.stdout.flush()

    # Pre-multiplied alpha is switched off, so no " -p " flag is passed for
    # the PNG colour pass.
    preAlpha = False
    preCMD = " -p " if preAlpha else ""

    os.chdir(toolsPath)

    if ".png" in filename:
        isPng = True
    elif ".jpg" in filename:
        isPng = False
    else:
        return 2

    quality = 'etcfast' if isFast else 'etcslow'

    if isPng:
        pvr_out = filepath.replace(".png", ".pvr")
        alpha_jpg = filepath.replace(".png", ".alpha.jpg")
        alpha_pvr = filepath.replace(".png", ".alpha.pvr")
        rgbname = filepath.replace(".png", ".pkm")
        alphaname = filepath.replace(".png", "_alpha.pkm")

        rgbCMD = """ %s -f ETC1 %s -q %s -i %s -o %s """ % (pvrTexToolBin, preCMD, quality, filepath, pvr_out)
        alphaCMD = """%s %s -alpha extract %s """ % (convertBin, filepath, alpha_jpg)
        alphaJPGCMD = """ %s -f ETC1 -q %s -i %s -o %s """ % (pvrTexToolBin, quality, alpha_jpg, alpha_pvr)

        try:
            # Clear leftovers, then encode colour and alpha.
            FileHelper.remove(rgbname)
            FileHelper.remove(alphaname)

            os.system(rgbCMD)
            os.system(alphaCMD)
            if not isAlphaJPG:
                os.system(alphaJPGCMD)

            FileHelper.rename(pvr_out, rgbname)
            if isAlphaJPG:
                FileHelper.rename(alpha_jpg, alphaname)
            else:
                FileHelper.rename(alpha_pvr, alphaname)

            FileHelper.remove(alpha_jpg)
            FileHelper.remove(alpha_pvr)
        except Exception:
            # Logged only; a missing part makes the packing step below fail.
            t, v, tb = sys.exc_info()
            Log.printDetailln(t, v)

        tmpfilename = filepath.replace(".png", ".tmp")
        FileHelper.remove(tmpfilename)

        packed_ok = True
        with open(tmpfilename, 'wb+') as tmpFile:
            try:
                tmpFile.write(b'MNG')
                FileHelper.writeWithSize(tmpFile, rgbname)
                FileHelper.writeWithSize(tmpFile, alphaname)

                if not isSaveTransFile:
                    FileHelper.remove(rgbname)
                    FileHelper.remove(alphaname)
            except Exception:
                Log.printError()
                packed_ok = False

        if not packed_ok:
            FileHelper.remove(tmpfilename)
            return 2

        target = tmpfilename.replace(".tmp", ".png")
        if isUseGzip:
            os.system(gzipBin + tmpfilename + " -n -f -9")
            FileHelper.remove(target)
            FileHelper.rename(tmpfilename + ".gz", target)
            return 3

        FileHelper.remove(target)
        FileHelper.rename(tmpfilename, target)
        return 5

    # ---- JPG input: single encode, no alpha part, no 'MNG' wrapper ----
    tmpfilename = filepath.replace(".jpg", ".pkm")
    try:
        FileHelper.remove(tmpfilename)
        rgbCMD = """ %s -f ETC1 -p -q %s -i %s -o %s """ % (pvrTexToolBin, quality, filepath, filepath.replace(".jpg", ".pvr"))
        os.system(rgbCMD)
        FileHelper.rename(filepath.replace(".jpg", ".pvr"), tmpfilename)
    except Exception:
        t, v, tb = sys.exc_info()
        Log.printDetailln(t, v)

    if not FileHelper.exists(tmpfilename):
        Log.printDetailln ("error !!!", filepath, "cannot convert.")
        return 2

    target = tmpfilename.replace(".pkm", ".jpg")
    if isUseGzip:
        os.system(gzipBin + tmpfilename + " -n -f -9")
        FileHelper.remove(target)
        FileHelper.rename(tmpfilename + ".gz", target)
        return 3

    FileHelper.remove(target)
    FileHelper.rename(tmpfilename, target)
    return 4
Exemple #15
0

def CleanTreeByMark(element):
    """Prune a marked tree in place according to each element's mark.

    The mark is read from the ``kg_mark`` attribute of ``element``:

    * ``0`` - the element is removed via ``ElementHelper.remove_element``;
    * ``1`` - the element is kept as is, its subtree is not visited;
    * anything else - every child is processed recursively.
    """
    mark = long(element.get(kg_mark))
    if 0 == mark:
        ElementHelper.remove_element(element)
    elif 1 == mark:
        return
    else:
        # Iterate over a snapshot: the recursive call may remove ``child``
        # from ``element``, and mutating the child sequence while iterating
        # it live makes the loop skip siblings.
        for child in list(element):
            CleanTreeByMark(child)


def get_aricle_cetd(doctree):
    """Return the text content of the page body after CETD pruning.

    ``cetd_parse`` is run on the tree first; the body is then pruned with
    ``CleanTreeByMark``, passed through ``RemoveAttribute`` and its remaining
    text content is returned.  ``doctree`` is modified in place.
    """
    cetd_parse(doctree)
    page_body = ElementHelper.get_body(doctree)
    CleanTreeByMark(page_body)
    RemoveAttribute(page_body)
    extracted_text = ElementHelper.element_text_content(page_body)
    return extracted_text


if __name__ == '__main__':
    # dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/other_neg/original/42.html'
    # dir = '/mnt/UbutunShare/graduate/DataSet/original/0.htm'
    dir = '/mnt/UbutunShare/graduate/DataSet/scrapy_dataset/ifeng/image/24.html'
    from api import HtmlHelper
    from util import FileHelper
    content = FileHelper.readUTF8File(dir)
    doc = HtmlHelper.create_doc(content, 'utf-8')
    print get_aricle_cetd(doc)