def __run(self):
    page = 0
    googlequery = ' '.join(self.__keywords)
    google = GoogleSearch(googlequery)

    # Spawn the worker threads that will consume queued URLs.
    for x in range(0, self.__threadcount):
        with self.__general_lock:
            t = threading.Thread(target=self.__process_url)
            t.start()
            self.__threadList.append(t)

    if self.__trace:
        print('Query google with "%s"' % googlequery)

    while page < self.__maxpages:
        links = google.fetch_results(page)
        if len(links) == 0:
            with self.__general_lock:
                print("Failed to get links from google. Maybe banned")
            break
        for link in links:
            fileExt = False
            if self.__trace:
                with self.__general_lock:
                    print("Checking %s" % link)
            for ext in FileDownloader.DATA_EXT:
                if link["url"].endswith(ext):
                    if self.__trace:
                        with self.__general_lock:
                            print('Found "%s" file by extension "%s"' % (link["url"], ext))
                    self.__urls.append(link["url"])
                    fileExt = True
                    break
            # URLs that are not data files go to the crawler queue.
            if not fileExt and self.__crawler:
                with self.__urlList_lock:
                    self.__urlList.append(link["url"])
        page += 1
        time.sleep(randint(1, 2))

    # Signal the workers that no more links will be produced, then wait
    # for them. Join outside __general_lock: the workers also acquire
    # that lock, so holding it here could deadlock.
    with self.__urlList_lock:
        self.__linksDone = True
    for tid in self.__threadList:
        tid.join()
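# A minimal sketch of the worker loop that __run above pairs with. The
# real __process_url is not shown here, so this is an assumption: it
# only illustrates the handshake with __urlList_lock and __linksDone.
def __process_url(self):
    while True:
        url = None
        with self.__urlList_lock:
            if self.__urlList:
                url = self.__urlList.pop(0)
            elif self.__linksDone:
                # Producer has finished and the list is drained:
                # return so the join() in __run can complete.
                return
        if url is None:
            time.sleep(0.1)  # list empty but producer still running
            continue
        # ... fetch `url` here and append any data files to self.__urls ...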
def __init__(self, string):
    '''
    Receives a string such as "Telcel no da el servicio de 3g en la
    facultad de ingenieria". Searches for documents on Wikipedia and
    Google and stores them in self.documents.
    '''
    self.documents = []
    # Tests
    self.documents += GoogleSearch(string).documents
    for word in string.split():
        self.wikipediaSearch(word)
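# Hedged usage sketch. The enclosing class is not shown above, so the
# name `DocumentRetriever` is a placeholder; this only illustrates how
# the constructor is meant to be called:
retriever = DocumentRetriever(
    "Telcel no da el servicio de 3g en la facultad de ingenieria")
for doc in retriever.documents:  # documents gathered from Google
    print(doc)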
def test_google_search(self):
    goo = GoogleSearch()
    for item in goo.get("something"):
        print(item)
def main():
    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"   # File where features will be stored
    picPath = "../resources/SOpictures/"     # Directory where pictures will be downloaded

    fr = open(data, 'rb')
    fw = open(results, 'ab')
    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)
    writer = UnicodeWriter(fw)

    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()
    threads = []
    SOhashes = {}  # Dictionary of user hashes already seen

    # Use multiple threads to download pictures and get information.
    for i in range(10):
        threads.append(Downloader(queue))
        threads[-1].start()

    idx = 0
    size = 4500  # Number of subjects
    for row in reader:
        if idx >= size:
            break
        so_uid = row[0]
        so_hash = row[2]
        if so_hash not in SOhashes:
            SOhashes[so_hash] = so_uid
            if not isDefaultGravatarPic(so_hash):
                features = [so_uid]
                if _VISUAL_FEATURES:
                    # Download the picture if it is not cached locally;
                    # the sleep gives the downloader thread time to finish.
                    filepath = '%s%d.jpg' % (picPath, int(so_uid))
                    if not os.path.isfile(filepath):
                        queue.put(('http://www.gravatar.com/avatar/%s' % so_hash,
                                   filepath))
                        time.sleep(2)
                    # Load the picture and extract the visual features.
                    pic = picUtils.loadPicture(filepath)
                    if _FACE:
                        isFace = (faceDetector.isFrontFace(pic)
                                  or faceDetector.isProfileFace(pic))
                        features.append(str(isFace))
                    if _MOST_COMMON_COLORS:
                        _, f1, _, f2 = picUtils.mostCommonColor(pic)
                        features.append(str(f1 + f2))
                    if _NBCOLORS:
                        features.append(str(picUtils.getNbOfColors(pic)))
                    if _FARTHEST_NEIGHBOR:
                        F1 = picUtils.farthestNeighborMetric(pic, 10)
                        F2 = picUtils.farthestNeighborMetric(pic, 200)
                        features.append(str(F1))
                        features.append(str(F2))
                        if F1 != 0:
                            features.append(str(F2 / F1))
                        else:
                            features.append('?')
                    if _AVERAGE_SATURATION:
                        features.append(str(picUtils.avgSaturation(pic)))
                    if _THRESHOLD_BRIGHTNESS:
                        features.append(str(picUtils.threBrightness(pic, 0.2)))
                if _GOOGLE:
                    gi = GoogleImage('http://www.gravatar.com/avatar/%s' % so_hash)
                    bestGuess = gi.getBestGuess()
                    if bestGuess:
                        bestGuess = bestGuess.encode('utf8')
                        features.append(bestGuess)
                        if _WIKIPEDIA:
                            gs = GoogleSearch("%s site:en.wikipedia.org" % bestGuess)
                            wikiTitlePage = gs.getWikipediaTitlePage()
                            if wikiTitlePage:
                                wiki = Wikipedia(wikiTitlePage)
                                wiki.categoryGraph(4)
                                nbCats = 10
                                i = 0
                                cats = wiki.sortGraphByDegree()
                                while i < nbCats and i < len(cats):
                                    features.append(str(cats[i]))
                                    i += 1
                # Write all information collected in the csv file.
                try:
                    print(features)
                    writer.writerow(features)
                    idx += 1
                except Exception:
                    print("Error with data")

    fr.close()
    fw.close()

    # If here, download finished. Stop threads with one sentinel each.
    for i in range(10):
        queue.put((None, None))
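# The (None, None) pairs queued at the end of main() act as poison
# pills, one per worker thread. The Downloader class itself is not
# shown, so this is a hedged sketch of a worker consistent with that
# protocol (the urlretrieve call is an assumption about how the
# downloads happen):
import threading
from Queue import Queue  # `queue` on Python 3
from urllib import urlretrieve  # `urllib.request.urlretrieve` on Python 3

class Downloader(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            url, filepath = self.queue.get()
            if url is None:  # poison pill: this worker is done
                break
            urlretrieve(url, filepath)  # save the avatar to disk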