Example #1
def enqueueurls(urls, mylevel):
    global __urlqueue__
    global __queuedurl__
    global jobcondition
    with jobcondition:
        try:
            urlquelock.acquire()
            quedurllock.acquire()
            '''critical section'''
            for url in urls:
                url = url.strip("/")
                '''
                Resolve the redirected URL if the URL redirects.
                To keep the original URL instead, comment out the
                statement url = getredirectedurl(url) below.
                '''

                if litmustesturl(url):
                    #print("original ", url)
                    url = getredirectedurl(url)
                    #print("redirected ", url)

                if litmustesturl(url):
                    __urlqueue__.append(str(mylevel) + "#" + url)
                    __queuedurl__[url] = 1
            '''end critical section'''
        except Exception as e:
            logger.writelog(str(e))
            #print(e)
        finally:
            urlquelock.release()
            quedurllock.release()
            jobcondition.notify_all()
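
The locks, condition, and shared containers used above are module-level globals that the snippet does not show. A minimal sketch of how they might be declared (the names come from the code; the initial values and types are assumptions):

import threading

__urlqueue__ = []      # pending URLs, stored as "<level>#<url>" strings
__queuedurl__ = {}     # URLs already queued, used as a set
jobcondition = threading.Condition()  # wakes workers waiting for new URLs
urlquelock = threading.Lock()         # guards __urlqueue__
quedurllock = threading.Lock()        # guards __queuedurl__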
Example #2
def deletefrombusythreadlist(threadid):

    global ___busythreadlist__
    global threadavailablecondition
    with threadavailablecondition:
        #___busythreadlist__[threadid].join(0)
        del ___busythreadlist__[threadid]
        logger.writelog("Thread " + str(threadid) + " deleted")
        threadavailablecondition.notify_all()
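
A dispatcher could block on threadavailablecondition until this function frees a slot. A rough sketch under that assumption; maxthreads and waitforfreethread are hypothetical names, only ___busythreadlist__ and threadavailablecondition appear in the original code:

import threading

___busythreadlist__ = {}
threadavailablecondition = threading.Condition()
maxthreads = 8  # hypothetical limit, not part of the original code

def waitforfreethread():
    # block until deletefrombusythreadlist() notifies and a slot is free
    with threadavailablecondition:
        while len(___busythreadlist__) >= maxthreads:
            threadavailablecondition.wait()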
Example #3
def readtextfromurl(sourceurl):

    try:
        if ".pdf" in sourceurl.split('/')[-1]:
            return webpagereader.readpdftext(sourceurl)
        elif".ppt" in sourceurl.split('/')[-1]:
            raise Exception("Not supported in version", "Page can not be read in this version of crawler")
        else:
            return webpagereader.readhtmlpage(sourceurl) #default value of removetags=False
    except Exception as e:
        #print("Exception in urlcontentreader:", e)
        logger.writelog("Exception in urlcontentreader: "+ sourceurl + " " + str( e))
        return ""
Example #4
def updateposting(text, url):
    postinglock.acquire()
    '''critical section'''
    try:
        global __successcount__
        global __posting__

        logger.writelog("Pushing content of " + url + " in index")
        invertedindex.pushonposting(text.lower(), url, __posting__, __doclensqr__)
        __successcount__ += 1
        #print("hello")
        '''end critical section'''
    except Exception as e:
        logger.writelog("Exception in updateposting: " + str(e))
    finally:
        postinglock.release()
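
The shared index state mutated here is again assumed to live at module level; a minimal sketch, with the container types inferred from how they are passed to invertedindex.pushonposting:

import threading

postinglock = threading.Lock()  # serializes writes to the inverted index
__posting__ = {}                # assumed: term -> posting list
__doclensqr__ = {}              # assumed: url -> squared document length
__successcount__ = 0            # pages successfully indexed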
Example #5
def getredirectedurl(myurl):

    url = myurl
    global __queuedurl__
    global __failedurl__
    #if myurl in __queuedurl__:
        #return myurl
    #if myurl in __failedurl__:
        #return myurl
    try:
        response = urllib.request.urlopen(myurl)
        url = response.url.split("#")[0]#remove page anchor character
        url = url.split("?")[0]#remove get parameters
    except Exception as e:
        url = myurl
        __failedurl__[url] = 1
        #print(str(e) + ": " + url)
        logger.writelog(str(e) + ": " + url)
    return url
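
Because the page anchor and the GET parameters are stripped from the response URL, URLs that differ only in those parts collapse to the same queued URL. For example (assuming example.com does not redirect and the second host is unreachable):

getredirectedurl("http://example.com/page?id=7#section2")
# -> "http://example.com/page"
getredirectedurl("http://unreachable.invalid/page")
# -> returned unchanged and recorded in __failedurl__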
Example #6
def crawlthrough(urlslevels, crawldepth, threadid=0, savedir=None):

    load = len(urlslevels)
    #print("Load ", load, urlslevels,"\n\n", globals.geturlqueue())
    for i in range(0, load):
        url, level = urlslevels[i]
        '''crawl through the url'''
        try:
            logger.writelog("Thread " + str(threadid) + ", Total load=" +
                            str(load) + " Crawled=" + str(i) + ", Crawling " +
                            url)

            #print("crawling ", url)
            rawtext = urlcontentreader.readtextfromurl(url)
            #print("attempted crawling ", url)
            #print(url, "\n", rawtext,"\n")

            #newurls, text = htmlparser.fetch.fetchurlandtext(url,True)
            newurls, cleantext = htmlparser.getcleantextandurls(rawtext, url)
            #print("new len:", len(newurls),newurls)

            text = rawtext
            if not globals.issaverawtext():  #if set to clean html tags
                text = cleantext
            '''
            print(threadid, "here")
            print(text)
            print(threadid,"there")
            '''
            level += 1
            if len(newurls) > 0 and level <= crawldepth:

                globals.enqueueurls(newurls, level)
            # save the page content only if it contains at least the minimum number of words
            if cleantext.count(" ") >= globals.getminpagesize():
                #print("here is it")
                if savedir is not None and savedir != "":
                    crawledpagewriter.writetofile(
                        text, savedir,
                        str(globals.generateuniquenumber()) + ".txt", url)
                if globals.pushtoposting():
                    globals.updateposting(text, url)
                else:
                    globals.updatesuccesscount()
            else:
                logger.writelog(
                    "Thread " + str(threadid) +
                    " Error in urlcrawler: too few words in page or page cannot be read"
                )
        except Exception as e:
            logger.writelog("Thread " + str(threadid) +
                            " exception occurred in urlcrawler: " + str(e))
        time.sleep(globals.getcrawldelay())
    globals.deletefrombusythreadlist(threadid)
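
crawlthrough expects urlslevels as a list of (url, level) pairs; seed URLs start at level 0 and newly discovered links are enqueued at level + 1 until crawldepth is reached. A hedged usage sketch, assuming the globals module and the rest of the crawler are set up:

seeds = [("https://example.com/", 0), ("https://example.org/start.html", 0)]
crawlthrough(seeds, crawldepth=2, threadid=1, savedir="./pages")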
Example #7
def pushtobusythreadlist(threadid, thread):
    global ___busythreadlist__
    global __totalthreaddispatch__
    ___busythreadlist__[threadid] = thread
    logger.writelog("Thread " + str(threadid) + " pushed to busy thread list")
    __totalthreaddispatch__ += 1
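
Together with Example #2, this suggests the dispatch pattern the crawler likely uses: register a worker thread in the busy list, start it, and let crawlthrough remove itself when it finishes. A sketch under that assumption; dispatch is a hypothetical helper name:

import threading

def dispatch(urlslevels, crawldepth, threadid, savedir):
    worker = threading.Thread(target=crawlthrough,
                              args=(urlslevels, crawldepth, threadid, savedir))
    pushtobusythreadlist(threadid, worker)
    worker.start()  # crawlthrough calls deletefrombusythreadlist(threadid) when done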