def updateStatus(status, user, services=None, settings=None):
    """Validate a status update request and assemble a status object.

    :param status: status text to post; may contain a URL whose metadata
        is fetched via LinkFetcher
    :param user: posting user (not used in the visible portion of this
        snippet — presumably consumed further on; TODO confirm)
    :param services: iterable of target service names; only 'Facebook',
        'Buzz' and 'Twitter' are recognized. Defaults to all three.
    :param settings: per-request settings dict, stored on the status object.
    :return: an error dict on validation failure, otherwise None in the
        visible portion of this snippet.
    """
    # Use None sentinels instead of mutable default arguments, which are
    # shared across calls and can leak state between invocations.
    if services is None:
        services = ['Facebook', 'Buzz', 'Twitter']
    if settings is None:
        settings = {}
    if status in (None, ""):
        # BUG FIX: the original built this dict as a bare expression and
        # discarded it, then returned None; return it to the caller.
        return {"result": "error", "message": "Empty Status"}
    # Raw string for the regex (identical pattern value; \/ was already a
    # literal backslash-slash in the non-raw form).
    urlMatchObj = re.search(
        r"((https?|ftp):\/\/)?([-a-z0-9+&@#\/%?=~_|!:,;]{2,}\.)+[-a-z0-9+&@#\/%=~_|]+",
        status, re.IGNORECASE)
    urlMatch = status[urlMatchObj.start():urlMatchObj.end()] if urlMatchObj else None
    # Restrict the requested services to the supported set.
    supported = ['Facebook', 'Buzz', 'Twitter']
    services = list(set(services) & set(supported))
    if not services:
        return {"result": "error", "message": "Service specification error!"}
    res = {}
    statusObj = {"status": status, "settings": settings}
    if urlMatch:
        from LinkFetcher import LinkFetcher
        try:
            # Enrich the status with link metadata; this is best-effort,
            # so any fetch/parse failure is deliberately ignored.
            lf = LinkFetcher(urlMatch)
            statusObj['link'] = {
                'url': lf.url(),
                'description': lf.description(),
                'title': lf.title(),
            }
        except Exception:
            pass
    # NOTE(review): the snippet ends here; `res`, `user` and `services`
    # are unused so far — presumably the posting loop follows elsewhere.
class MyTestCase(unittest.TestCase):
    """Tests for LinkFetcher's link counting and extraction."""

    def setUp(self):
        # A fresh fetcher for every test method.
        self.fetcher = LinkFetcher()

    def test_linkCount(self):
        # The 'testLink' fixture is expected to yield exactly 14 links.
        self.assertEqual(14, self.fetcher.linkCount('testLink'))

    def test_getLinks(self):
        expected_links = {
            "/wiki/Ambasada",
            "/wiki/Esplanadi",
            "/wiki/Finlandia",
            "/wiki/Helsinki",
            "/wiki/Katedra",
            "/wiki/Komisja_Standaryzacji_Nazw_Geograficznych_poza_Granicami_Rzeczypospolitej_Polskiej",
            "/wiki/Morze_Ba%C5%82tyckie",
            "/wiki/Pa%C5%82ac_Prezydencki_w_Helsinkach",
            "/wiki/Plac_Targowy_w_Helsinkach",
            "/wiki/Suomenlinna",
            "/wiki/Szwecja",
            "/wiki/Urz%C4%85d_miejski",
            "/wiki/Wikimedia_Commons",
        }
        actual_links = self.fetcher.getLinks('testLink', 1)
        self.assertEqual(expected_links, actual_links)
def getURLs(query, key, urlToQueryMap):
    '''
    This function returns Google search result URLs against a query.

    :param query: A list containing query keywords
    :param key: API key passed through to LinkFetcher's Google client
    :param urlToQueryMap: A dictionary that maps URLs to their queries;
        updated in place so callers can aggregate across queries
    :return: A list of URLs (empty, or partial, if an error occurred)
    '''
    links = []
    results = LinkFetcher().getUrlsGoogle(query, key)
    try:
        if "items" in results:
            for item in results["items"]:
                link = item["link"]
                links.append(link)
                # setdefault replaces the manual membership test for
                # grouping queries under each URL.
                urlToQueryMap.setdefault(link, []).append(query)
    except Exception:
        # logging.exception already appends the traceback, so the
        # exception text need not be concatenated in; use lazy %-args
        # so formatting only happens when the record is emitted.
        logging.exception(
            "Exception occurred while trying to fetch links for query %s, "
            "returned results: %s", query, results)
    return links
def __init__(self, db_path, max_urls, seed_url = 'http://python.org'):
    """Initialize the crawler: store settings, open the DB, and seed it.

    :param db_path: path to the crawler's link database
    :param max_urls: crawl budget — presumably an upper bound checked by
        a stop condition elsewhere; TODO confirm against the full class
    :param seed_url: starting URL inserted when the DB has no work
    """
    self.db_path = db_path
    self.max_urls = max_urls
    self.seed_url = seed_url
    # Hard-coded cap on simultaneous fetches handed to the link fetcher.
    self.max_parallel_connections = 5
    self.link_fetcher = LinkFetcher()
    # Order matters: connect to the DB, then ensure the schema exists.
    self.connect_db()
    self.init_db()
    # Seed only when there is no pending work, so restarts resume
    # an in-progress crawl instead of re-seeding.
    if self.uncrawled_links_count() == 0:
        self.seed_db()
class Crawler:
    """Greedy breadth-style web crawler backed by a link database.

    Repeatedly pulls batches of uncrawled URLs from the DB, fetches and
    parses them via LinkFetcher, and inserts the discovered links back,
    until a stop condition is met.
    """
    def __init__(self, db_path, max_urls, seed_url = 'http://python.org'):
        # :param db_path: path to the crawler's link database
        # :param max_urls: crawl budget — presumably checked by
        #   stop_condition(); TODO confirm (defined outside this view)
        # :param seed_url: starting URL inserted when the DB is empty
        self.db_path = db_path
        self.max_urls = max_urls
        self.seed_url = seed_url
        # Hard-coded cap on how many URLs are fetched per batch.
        self.max_parallel_connections = 5
        self.link_fetcher = LinkFetcher()
        # Connect first, then ensure the schema exists.
        self.connect_db()
        self.init_db()
        # Seed only when no pending work remains, so a restart resumes
        # an in-progress crawl instead of re-seeding.
        if self.uncrawled_links_count() == 0:
            self.seed_db()

    def start_crawling(self):
        # Main crawl loop: fetch a batch, parse out links, persist them.
        while not self.stop_condition():
            urls = self.get_uncrawled_urls(self.max_parallel_connections)
            if not urls:
                break # there are no unvisited urls in the db. time to stop!
            rdict = self.link_fetcher.fetch(urls)
            parsed_links = self.link_fetcher.parse(rdict)
            self.insert_to_db(parsed_links)

    def insert_to_db(self, links):
        # Persist each unique link, counting outcomes.
        # :return: (total, success, failed) insertion counts
        # remove duplicates, if any!
        s = set(links)
        unique_links = list(s)
        total = len(unique_links)
        success = 0
        failed = 0
        for item in unique_links:
            # Re-check the budget mid-batch so we never overshoot.
            if self.stop_condition():
                break
            try:
                print "current strength ", self.total_links_count()
                # Url(...) presumably creates/persists an ORM record;
                # defined outside this view — TODO confirm. A duplicate
                # or constraint violation lands in the except below.
                Url(url = item)
            except Exception, fault:
                print "insert failed. Error ", str(fault)
                # printing item here is not unicode safe!
                failed += 1
            else:
                success += 1
        return total, success, failed
def updateStatus(status, user, services=None, settings=None):
    """Validate a status update and build the object to be posted.

    :param status: status text; an embedded URL triggers link enrichment
    :param user: posting user (unused in the visible portion — presumably
        consumed later; TODO confirm)
    :param services: target service names; intersected with the supported
        set {'Facebook', 'Buzz', 'Twitter'}. Defaults to all three.
    :param settings: per-request settings dict stored on the status object
    :return: an error dict on validation failure, else None here
    """
    # None sentinels replace mutable defaults, which are shared between
    # calls and can carry state from one invocation to the next.
    if services is None:
        services = ['Facebook', 'Buzz', 'Twitter']
    if settings is None:
        settings = {}
    if status in (None, ""):
        # BUG FIX: the error dict was previously an unused expression
        # (the function returned None); now it is actually returned.
        return {"result": "error", "message": "Empty Status"}
    # Raw regex string; the pattern value is unchanged.
    urlMatchObj = re.search(
        r"((https?|ftp):\/\/)?([-a-z0-9+&@#\/%?=~_|!:,;]{2,}\.)+[-a-z0-9+&@#\/%=~_|]+",
        status, re.IGNORECASE)
    urlMatch = status[urlMatchObj.start():urlMatchObj.end()] if urlMatchObj else None
    supported = ['Facebook', 'Buzz', 'Twitter']
    services = list(set(services) & set(supported))
    if not services:
        return {"result": "error", "message": "Service specification error!"}
    res = {}
    statusObj = {"status": status, "settings": settings}
    if urlMatch:
        from LinkFetcher import LinkFetcher
        try:
            # Best-effort link enrichment; failures are intentionally
            # ignored so a dead link never blocks the post.
            lf = LinkFetcher(urlMatch)
            statusObj['link'] = {
                'url': lf.url(),
                'description': lf.description(),
                'title': lf.title(),
            }
        except Exception:
            pass
    # NOTE(review): snippet ends here; res/user/services are not yet
    # used — the posting logic presumably follows elsewhere.
def setUp(self):
    """Create a fresh LinkFetcher before each test (unittest fixture hook,
    presumably on a unittest.TestCase — enclosing class not visible here)."""
    self.fetcher = LinkFetcher()