Code Example #1
    def __run(self):
        # Requires module-level: import threading, time; from random import randint
        page = 0
        googlequery = ' '.join(self.__keywords)
        google = GoogleSearch(googlequery)

        # Start the worker threads that will consume queued URLs
        for _ in range(self.__threadcount):
            with self.__general_lock:
                t = threading.Thread(target=self.__process_url)
                t.start()
                self.__threadList.append(t)

        if self.__trace:
            print("Querying Google with \"%s\"" % googlequery)

        while page < self.__maxpages:
            links = google.fetch_results(page)
            if len(links) == 0:
                with self.__general_lock:
                    print("Failed to get links from Google. Maybe banned")
                break
            for link in links:
                fileExt = False
                if self.__trace:
                    with self.__general_lock:
                        print("Checking %s" % link)
                for ext in FileDownloader.DATA_EXT:
                    if link["url"].endswith(ext):
                        if self.__trace:
                            with self.__general_lock:
                                print("Found \"%s\" file by extension \"%s\"" % (link["url"], ext))
                        self.__urls.append(link["url"])
                        fileExt = True
                        break

                # Links that are not direct file hits go to the crawler queue
                if not fileExt and self.__crawler:
                    with self.__urlList_lock:
                        self.__urlList.append(link["url"])
            page += 1
            time.sleep(randint(1, 2))  # throttle requests to avoid a ban

        with self.__urlList_lock:
            self.__linksDone = True  # tell workers no more links are coming

        # Join without holding the general lock: the workers acquire it to
        # print, so joining inside the lock could deadlock
        for t in self.__threadList:
            t.join()
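
The method above assumes a surrounding class that provides the keyword list, locks, flags, and thread bookkeeping, none of which are shown. A minimal sketch of that scaffolding, assuming __run lives in the same FileDownloader class whose DATA_EXT it reads (every attribute default and the extension list below are assumptions):

import threading

class FileDownloader(object):
    # Hypothetical extension list; the real values are not shown above
    DATA_EXT = ('.pdf', '.doc', '.xls', '.csv')

    def __init__(self, keywords, threadcount=4, maxpages=10,
                 crawler=False, trace=False):
        self.__keywords = keywords        # search terms joined into the query
        self.__threadcount = threadcount  # number of __process_url workers
        self.__maxpages = maxpages        # result pages to fetch before stopping
        self.__crawler = crawler          # queue non-file links for crawling
        self.__trace = trace              # verbose progress output
        self.__urls = []                  # direct file hits found by extension
        self.__urlList = []               # pages handed to the workers
        self.__threadList = []            # worker threads started by __run
        self.__linksDone = False          # set once all result pages are read
        self.__general_lock = threading.Lock()
        self.__urlList_lock = threading.Lock()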
Code Example #2
    def __init__(self, string):
        '''
        Receives a string such as "Telcel no da el servicio de 3g en la facultad de ingenieria".
        Searches Wikipedia and Google for documents and stores them in self.documents.
        '''
        self.documents = []
        # Tests
        self.documents.extend(GoogleSearch(string).documents)
        for word in string.split():
            self.wikipediaSearch(word)
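
Assuming this __init__ belongs to a document-collector class (the name DocumentCollector below is hypothetical; GoogleSearch and wikipediaSearch are taken from the example as-is), usage might look like:

collector = DocumentCollector("Telcel no da el servicio de 3g en la facultad de ingenieria")
print len(collector.documents)  # documents gathered from Google and Wikipedia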
Code Example #3
File: test.py Project: abael/Utils
    def test_google_search(self):
        goo = GoogleSearch()
        for item in goo.get("something"):
            print item
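
For this fragment to run it presumably sits inside a unittest.TestCase; a self-contained wrapper might look like the following (the class name is an assumption, and GoogleSearch must be importable from the project under test):

import unittest

class GoogleSearchTestCase(unittest.TestCase):
    def test_google_search(self):
        goo = GoogleSearch()
        for item in goo.get("something"):
            print item  # inspect each result returned for the query

if __name__ == '__main__':
    unittest.main()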
Code Example #4
File: main.py Project: abisiaux/gravatars
# Assumed module-level imports; project-local helpers (UnicodeReader,
# UnicodeWriter, RandomReader, Downloader, FaceDetector, GoogleImage,
# GoogleSearch, Wikipedia, picUtils, isDefaultGravatarPic and the _* flags)
# come from elsewhere in the project.
import os
import time
from Queue import Queue  # Python 2

def main():

    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"  # File where features will be stored
    picPath = "../resources/SOpictures/"  # Directory where pictures will be downloaded

    fr = open(data, 'rb')
    fw = open(results, 'ab')

    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)

    writer = UnicodeWriter(fw)

    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()

    threads = []
    SOhashes = {}  # Dictionary of user's hashes

    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()

    idx = 0
    size = 4500  # Number of subjects

    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if so_hash not in SOhashes:
                SOhashes[so_hash] = so_uid
                if not isDefaultGravatarPic(so_hash):
                    data = [so_uid]
                    if _VISUAL_FEATURES:

                        # Download picture
                        filepath = os.path.join(picPath, '%d.jpg' % int(so_uid))
                        if not os.path.isfile(filepath):
                            queue.put(
                                ('http://www.gravatar.com/avatar/%s' % so_hash,
                                 filepath))
                            time.sleep(2)

                        # Load picture
                        pic = picUtils.loadPicture(filepath)

                        if _FACE:
                            if faceDetector.isFrontFace(
                                    pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))

                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))

                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))

                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                data.append('?')

                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))

                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))

                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' %
                                         so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" %
                                                  bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10  # keep at most 10 categories
                                    cats = wiki.sortGraphByDegree()
                                    for cat in cats[:nbCats]:
                                        data.append(str(cat))

                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except Exception:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()

    # If here, download finished. Stop threads
    for i in xrange(10):
        queue.put((None, None))
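
The Downloader workers consuming the queue are not shown. A minimal sketch of such a worker, assuming each queue item is a (url, filepath) pair and (None, None) is the stop sentinel that main() enqueues at the end (the urllib.urlretrieve call is an assumption about how the download is performed):

import threading
import urllib

class Downloader(threading.Thread):
    """Worker thread: downloads (url, filepath) pairs taken from a queue."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            url, filepath = self.queue.get()
            if url is None:  # (None, None) sentinel: no more work
                break
            try:
                urllib.urlretrieve(url, filepath)  # save the picture to disk
            except IOError:
                pass  # skip pictures that fail to download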