class Crawler:
    """Breadth-first web crawler seeded at the HKUST CSE homepage.

    Fetches pages layer by layer (BFS), normalizes and filters the links on
    each page through a Processor pipeline, stems the visible paragraph text
    with a Porter stemmer, and hands (url, words, children) to an Indexer.

    NOTE(review): written for Python 2 — the byte/character handling in
    getOnePage assumes py2 str semantics (see inline note).
    """

    def __init__(self, numOfLayer):
        # numOfLayer: how many BFS layers to crawl before stopping.
        self.num = numOfLayer
        self.parent = []    # URL queue for the layer currently being crawled
        self.children = []  # URLs discovered while crawling the current layer
        self.handled = []   # URLs already fetched (dedup across layers)
        self.Indexer = Indexer()
        self.Processor = Processor()
        self.Porter = PorterStemmer()
        self.db = []        # URLs already stored in the backing database
        link = "http://www.cse.ust.hk/"
        self.parent.append(link)

    def handleLink(self, links):
        """Run the Processor's filter/normalize pipeline over raw hrefs.

        Returns the deduplicated list of crawl-worthy URLs.
        """
        processedLinks = self.Processor.waiveUnrelatedDomain(links)
        processedLinks = self.Processor.clearSubfix(processedLinks)
        processedLinks = self.Processor.clearUnwantedFiles(processedLinks)
        processedLinks = self.Processor.changeUrl(processedLinks)
        return self.Processor.clearDuplicate(processedLinks)

    def getOnePage(self):
        """Fetch the next queued URL, collect its links, and index its text.

        Pops one URL from self.parent; skips it if already handled.
        Connection errors and timeouts are reported and swallowed so a bad
        page never aborts the crawl.
        """
        if not self.parent:
            # Robustness fix: calling with an empty queue used to raise
            # IndexError from pop(0).
            return
        parent = self.parent.pop(0)
        if parent in self.handled:
            # Guard clause replaces the original whole-body `if not in` nest.
            return
        print("")
        print("Searching {}".format(parent))
        try:
            request = requests.get(parent, timeout=20)
            # Only parse pages that answered with HTTP 200.
            if request.status_code == requests.codes.ok:
                soup = BeautifulSoup(request.text, 'html.parser')
                # Resolve every <a href> against the page URL, then filter.
                children = []
                for link in soup.findAll('a', href=True):
                    children.append(urljoin(parent, link.get('href')))
                children = self.handleLink(children)
                for child in children:
                    try:
                        print(child.encode('ascii'))
                    except UnicodeEncodeError:
                        print("there are non-ascii characters in there")
                    self.children.append(child)
                # Skip text extraction when the URL is already in the
                # database snapshot loaded by scrape().
                if parent not in self.db:
                    # Gather visible paragraph text and split into tokens.
                    rawtags = soup.find_all('p')
                    temp = []
                    for tag in rawtags:
                        temp = temp + tag.getText().split()
                    # Drop non-ASCII bytes and turn punctuation into spaces.
                    words = []
                    for word in temp:
                        rawtext = word.encode('utf-8').strip()
                        # NOTE(review): assumes py2, where iterating a str
                        # yields 1-char strings; on py3 bytes this ord()
                        # would receive an int — confirm target interpreter.
                        rawtext = "".join(i for i in rawtext if ord(i) < 128)
                        for c in string.punctuation:
                            rawtext = rawtext.replace(c, " ")
                        words += rawtext.split()
                    # Stem every token before indexing.
                    processedWords = []
                    for word in words:
                        processedWords.append(self.Porter.stem(word))
                    if len(processedWords) != 0:
                        self.Indexer.process(parent, processedWords, children)
                    else:
                        print("The document contains no word")
        except requests.exceptions.ConnectionError:
            print("Error in connecting the site.")
        except requests.exceptions.Timeout:
            print("Timeout in connecting the site.")
        self.handled.append(parent)

    # search by BFS
    def scrape(self):
        """Crawl self.num layers breadth-first starting from the seed URL."""
        # Preload URLs already stored externally so their text is not
        # re-indexed.  NOTE(review): `db` here is a module-level handle,
        # distinct from self.db — confirm it is in scope at call time.
        records = db.getAll()  # renamed from `all`, which shadowed the builtin
        for instance in records:
            self.db.append(instance['url'])
        print(len(self.handled))
        for layer in range(self.num):
            self.parent = self.handleLink(self.parent)
            print("")
            print("Searching layer {}".format(layer))
            if len(self.parent) == 0:
                break
            # Original reused `i` here, shadowing the layer counter; use a
            # throwaway index since only the iteration count matters.
            for _ in range(len(self.parent)):
                self.getOnePage()
            # Links found in this layer become the next layer's queue.
            self.parent = self.children
            self.children = []