def getnewsitem2(url):
    """Fetch the news page at *url* and extract its title and body text.

    Title is cut from the <title> tag, body text from the site-specific
    div markers below.  Returns a NewsItem with only title and text
    populated.
    """
    f = urllib2.urlopen(url)
    rawhtml = f.read()
    # Read the Content-Type header before closing the response.
    content_type = f.headers['content-type']
    f.close()

    # Prefer the charset declared in the header.  Without this guard,
    # split('charset=')[-1] on a header lacking "charset=" returns the
    # whole content-type string (e.g. "text/html"), which is not a valid
    # encoding name.
    if 'charset=' in content_type:
        encoding = content_type.split('charset=')[-1]
    else:
        encoding = 'utf-8'  # NOTE(review): fallback guess — confirm against IO.encodingToutf8

    title = extractitem('<title>', '</title>', rawhtml)
    title = IO.encodingToutf8(title, encoding)
    # Keep only the headline part, dropping the outlet name after "/".
    title = title.split("/")[0]
    title = IO.replaceSpecialChars(title)

    markerText1 = '<div id="metin2" class="fck_li">'
    markerText2 = '<div class="IndexKeywordsHeader"'    # or 'id="hiddenTitle"'
    text = extractitem(markerText1, markerText2, rawhtml)
    text = nltk.clean_html(text)
    text = IO.encodingToutf8(text, encoding)
    text = IO.replaceSpecialChars(text)

    # NOTE(review): this 4-argument call disagrees with the 7-argument
    # NewsItem(...) call in getnewsitem() — verify NewsItem's signature.
    return NewsItem(title, "", text, "")
def getnewsitem(resource, url, newsid):

    '''
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
    '''
    req = urllib2.Request(url)
    try:
        f = urllib2.urlopen(req)
    except URLError as e:
        message = ""
        if hasattr(e, 'reason'):
            message += 'Cannot reach a server.'
            message += '\nReason: ' + str(e.reason)
        elif hasattr(e, 'code'):
            message += 'The server couldn\'t fulfill the request.'
            message += '\nError code: ' + str(e.code)
        IO.log_connection_error(resource.name, url, message)
        return None
    else:
        rawhtml = f.read()
        #rawhtml = rawhtml.encode('iso-8859-9')
        f.close()
    
    
    encoding = resource.encoding
    if encoding == "":
        encoding = f.headers['content-type'].split('charset=')[-1]
        resource.setEncoding(encoding)
    
    markerTitle1 = resource.markerTitle1
    markerTitle2 = resource.markerTitle2
    title = extractitem(markerTitle1, markerTitle2, rawhtml)
    title = IO.encodingToutf8(title, encoding)    
    title = re.split(r"[/-]", title)[0]
    title = IO.replaceSpecialChars(title)  
    
    markerDate1 = resource.markerDate1 
    markerDate2 = resource.markerDate2
    date = extractitem(markerDate1, markerDate2, rawhtml)
    date = IO.encodingToutf8(date, encoding)
    #date = nltk.clean_html(date)
    
    #print "markers: ",markerDate1," ",markerDate2
    print "date: ",date
    
    
    markerAuthor1 = resource.markerAuthor1
    markerAuthor2 = resource.markerAuthor2
    author = extractitem(markerAuthor1, markerAuthor2, rawhtml)
    
    markerText1 = resource.markerText1
    markerText2 = resource.markerText2
    text = extractitem(markerText1, markerText2, rawhtml)
    
    #print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    text = IO.encodingToutf8(text, encoding)
    #print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    # text = nltk.clean_html(text)
    
    
    # added due to the mixes in solhaber
    try:
        text = nltk.clean_html(text)
    except HTMLParser.HTMLParseError as e:
        tag = e.__str__().split(",")[0].split(":")[-1][2:-2]
        text = text.replace(tag, "")
        text = nltk.clean_html(text)
    
    '''
    print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    text = text.decode('utf-8', 'ignore')
    print isinstance(text, str)," ",isinstance(text, unicode)," ",type(text)
    text = nltk.clean_html(text)
    
    text = IO.encodingToutf8(text, encoding)
    '''
    text = IO.replaceSpecialChars(text)
    
    if resource.name == "solhaber":
        date = date.split(",")[-1]
        newsid = newsid.split("-")[-1]
    
    return NewsItem(newsid, title, date, text, resource.name, author, url)