Esempio n. 1
0
    def updateStatus(status, user, services=None, settings=None):
        """Validate a status update and assemble its payload.

        :param status: Text of the update; ``None`` or ``""`` is rejected.
        :param user: Not used by the code visible in this block.
        :param services: Iterable of target service names. Only 'Facebook',
            'Buzz' and 'Twitter' are recognised; omitted or ``None`` selects
            all three.
        :param settings: Dict stored under ``"settings"`` in the payload;
            omitted or ``None`` means a fresh empty dict.
        :return: An error dict ``{"result": "error", "message": ...}`` when
            the status is empty or no recognised service was requested.
            NOTE(review): the success path builds ``statusObj`` but nothing
            visible here returns or dispatches it, so the call ends with
            ``None`` -- confirm whether the posting step is missing.
        """
        if status in (None, ""):
            # Hand the error back to the caller instead of building the dict
            # and discarding it.
            return {"result": "error", "message": "Empty Status"}

        s = ['Facebook', 'Buzz', 'Twitter']
        # None sentinels stand in for the former mutable defaults so that no
        # list/dict object is shared between calls.
        if services is None:
            services = s
        if settings is None:
            settings = {}

        # Raw string: the pattern text is unchanged, but the regex escapes are
        # no longer interpreted as (invalid) string escapes.
        urlMatchObj = re.search(
            r"((https?|ftp):\/\/)?([-a-z0-9+&@#\/%?=~_|!:,;]{2,}\.)+[-a-z0-9+&@#\/%=~_|]+",
            status, re.IGNORECASE)
        urlMatch = urlMatchObj.group(0) if urlMatchObj else None

        # Keep only the recognised services (set intersection, so the
        # resulting order is unspecified).
        services = list(set(services) & set(s))
        if not services:
            return {
                "result": "error",
                "message": "Service specification error!"
            }

        res = {}  # not used by the code visible in this block
        statusObj = {"status": status, "settings": settings}
        if urlMatch:
            import logging
            from LinkFetcher import LinkFetcher
            try:
                lf = LinkFetcher(urlMatch)
                statusObj['link'] = {}
                statusObj['link']['url'] = lf.url()
                statusObj['link']['description'] = lf.description()
                statusObj['link']['title'] = lf.title()
            except Exception as e:
                # Link details are best-effort: whatever was collected before
                # the failure stays in statusObj, but the error is recorded
                # instead of being silently dropped.
                logging.getLogger(__name__).warning(
                    "Could not fetch link details for %r: %s", urlMatch, e)
Esempio n. 2
0
class MyTestCase(unittest.TestCase):
    """Exercise LinkFetcher.linkCount and LinkFetcher.getLinks on 'testLink'.

    NOTE(review): test_linkCount expects 14 while the expected set in
    test_getLinks holds 13 distinct paths -- presumably linkCount counts
    something other than distinct paths; confirm against LinkFetcher.
    """

    def setUp(self):
        # unittest runs this before every test method, so each test works
        # with its own LinkFetcher instance.
        self.fetcher = LinkFetcher()

    def test_linkCount(self):
        """linkCount('testLink') must report 14."""
        count = self.fetcher.linkCount('testLink')
        self.assertEqual(14, count)

    def test_getLinks(self):
        """getLinks('testLink', 1) must equal the expected set of paths."""
        expected = {
            "/wiki/Ambasada",
            "/wiki/Esplanadi",
            "/wiki/Finlandia",
            "/wiki/Helsinki",
            "/wiki/Katedra",
            "/wiki/Komisja_Standaryzacji_Nazw_Geograficznych_poza_Granicami_Rzeczypospolitej_Polskiej",
            "/wiki/Morze_Ba%C5%82tyckie",
            "/wiki/Pa%C5%82ac_Prezydencki_w_Helsinkach",
            "/wiki/Plac_Targowy_w_Helsinkach",
            "/wiki/Suomenlinna",
            "/wiki/Szwecja",
            "/wiki/Urz%C4%85d_miejski",
            "/wiki/Wikimedia_Commons",
        }
        fetched = self.fetcher.getLinks('testLink', 1)
        self.assertEqual(expected, fetched)
Esempio n. 3
0
def getURLs(query, key, urlToQueryMap):
    '''
    Collect the result URLs that a Google search returns for a query.

    Every URL found is also recorded in ``urlToQueryMap`` so that the map
    lists, per URL, all queries that produced it. If reading the response
    fails, the error is logged and the URLs gathered so far are returned.

    :param query: A list containing query keywords
    :param key: Passed through to ``LinkFetcher.getUrlsGoogle`` (presumably
        the search API key -- confirm against LinkFetcher)
    :param urlToQueryMap: A dictionary that maps URLs to their queries;
        updated in place
    :return: A list of URLs
    '''
    response = LinkFetcher().getUrlsGoogle(query, key)
    found = []
    try:
        # A response without an "items" entry simply contributes no URLs.
        items = response["items"] if "items" in response else []
        for entry in items:
            url = entry["link"]
            found.append(url)
            urlToQueryMap.setdefault(url, []).append(query)
    except Exception as e:
        logging.exception(
            "Exception occurred while trying to fetch links for query " +
            str(query) + ", returned results: " + str(response) +
            ", Exception: " + str(e))

    return found
Esempio n. 4
0
 def __init__(self, db_path, max_urls, seed_url='http://python.org'):
     """Store the crawl settings, prepare the database and seed it if needed.

     db_path, max_urls and seed_url are kept on the instance unchanged.
     After the database is connected and initialised, seed_db() is called
     only when uncrawled_links_count() reports 0.
     """
     self.db_path = db_path
     self.max_urls = max_urls
     self.seed_url = seed_url
     self.max_parallel_connections = 5
     self.link_fetcher = LinkFetcher()

     # Connect first, then initialise; the pending-link count is read after both.
     self.connect_db()
     self.init_db()

     nothing_queued = self.uncrawled_links_count() == 0
     if nothing_queued:
         self.seed_db()
Esempio n. 5
0
class Crawler:
    """
    Web crawling for superhumans! this is superfast, super greedy!

    Pulls batches of uncrawled URLs from the database, fetches and parses
    them through a LinkFetcher, and stores the links found, until
    stop_condition() holds or no uncrawled URLs remain.
    """
    def __init__(self, db_path, max_urls, seed_url = 'http://python.org'):
        """
        Store the crawl settings, prepare the database and seed it if needed.

        db_path, max_urls and seed_url are kept on the instance unchanged.
        seed_db() is called only when uncrawled_links_count() reports 0.
        """
        self.db_path        = db_path
        self.max_urls       = max_urls
        self.seed_url       = seed_url
        # Upper bound on the number of URLs requested per batch in
        # start_crawling().
        self.max_parallel_connections = 5
        self.link_fetcher   = LinkFetcher()
        self.connect_db()
        self.init_db()
        if self.uncrawled_links_count() == 0:
            self.seed_db()

    def start_crawling(self):
        """
        Fetch, parse and store links batch by batch.

        Stops when stop_condition() is true or when the database has no
        uncrawled URLs left.
        """
        while not self.stop_condition():
            urls            = self.get_uncrawled_urls(self.max_parallel_connections)
            if not urls:
                break # there are no unvisited urls in the db. time to stop!
            rdict           = self.link_fetcher.fetch(urls)
            parsed_links    = self.link_fetcher.parse(rdict)
            self.insert_to_db(parsed_links)

    def insert_to_db(self, links):
        """
        Insert each distinct link, one Url(...) construction per link.

        Insertion stops early once stop_condition() is true. A failure on
        one link is printed and counted; it does not abort the rest.

        Returns a tuple (total, success, failed): total is the number of
        distinct links received (including any skipped by an early stop),
        success and failed count the attempted insertions.
        """
        # remove duplicates, if any!
        unique_links = list(set(links))
        total = len(unique_links)
        success = 0
        failed = 0
        for item in unique_links:
            if self.stop_condition():
                break
            try:
                # One pre-formatted argument prints the same text under both
                # the Python 2 print statement and the print() function.
                print("current strength  %s" % (self.total_links_count(),))
                # presumably an ORM model whose construction inserts the row
                # -- confirm against the Url definition
                Url(url = item)
            except Exception as fault:
                # printing item here is not unicode safe!
                print("insert failed. Error  %s" % (str(fault),))
                failed += 1
            else:
                success += 1
        return total, success, failed
Esempio n. 6
0
    def updateStatus(status, user, services=None, settings=None):
        """Validate a status update and assemble its payload.

        :param status: Text of the update; ``None`` or ``""`` is rejected.
        :param user: Not used by the code visible in this block.
        :param services: Iterable of target service names. Only 'Facebook',
            'Buzz' and 'Twitter' are recognised; omitted or ``None`` selects
            all three.
        :param settings: Dict stored under ``"settings"`` in the payload;
            omitted or ``None`` means a fresh empty dict.
        :return: An error dict ``{"result": "error", "message": ...}`` when
            the status is empty or no recognised service was requested.
            NOTE(review): the success path builds ``statusObj`` but nothing
            visible here returns or dispatches it, so the call ends with
            ``None`` -- confirm whether the posting step is missing.
        """
        if status in (None, ""):
            # Hand the error back to the caller instead of building the dict
            # and discarding it.
            return {"result": "error", "message": "Empty Status"}

        s = ['Facebook', 'Buzz', 'Twitter']
        # None sentinels stand in for the former mutable defaults so that no
        # list/dict object is shared between calls.
        if services is None:
            services = s
        if settings is None:
            settings = {}

        # Raw string: the pattern text is unchanged, but the regex escapes are
        # no longer interpreted as (invalid) string escapes.
        urlMatchObj = re.search(
            r"((https?|ftp):\/\/)?([-a-z0-9+&@#\/%?=~_|!:,;]{2,}\.)+[-a-z0-9+&@#\/%=~_|]+",
            status, re.IGNORECASE)
        urlMatch = urlMatchObj.group(0) if urlMatchObj else None

        # Keep only the recognised services (set intersection, so the
        # resulting order is unspecified).
        services = list(set(services) & set(s))
        if not services:
            return {"result": "error", "message": "Service specification error!"}

        res = {}  # not used by the code visible in this block
        statusObj = {"status": status, "settings": settings}
        if urlMatch:
            import logging
            from LinkFetcher import LinkFetcher
            try:
                lf = LinkFetcher(urlMatch)
                statusObj['link'] = {}
                statusObj['link']['url'] = lf.url()
                statusObj['link']['description'] = lf.description()
                statusObj['link']['title'] = lf.title()
            except Exception as e:
                # Link details are best-effort: whatever was collected before
                # the failure stays in statusObj, but the error is recorded
                # instead of being silently dropped.
                logging.getLogger(__name__).warning(
                    "Could not fetch link details for %r: %s", urlMatch, e)
Esempio n. 7
0
 def setUp(self):
     """Attach a newly constructed LinkFetcher as ``self.fetcher``."""
     fetcher = LinkFetcher()
     self.fetcher = fetcher