Example #1
 def request(self, entry_url):
     try:
         #Some sanity checks
         if self.timeout < 0:
             raise ValueError("Timeout param can only take a positive value")
         if not isinstance(self.strict, bool):
             raise ValueError("Strict param can only take a boolean value")
         entry_url = clean_url(entry_url)
         #Wait for some time before raising Timeout exception
         page = requests.get(entry_url, timeout=self.timeout)
         mime_type = page.headers['content-type']
         page.raise_for_status()
         #Stop if the no of pages visited is exceeded
         if self.count_exceeded():
             return
         if (page !=
                 None) and remove_protocol(entry_url) not in self.visits:
             #Add the page to the visits list
             self.visits.add(clean_url(remove_protocol(entry_url)))
             soup = BeautifulSoup(page.text, 'lxml')
             if 'text/html' in mime_type:
                 #If the page is HTML delegate it to HTMLPageRetriever
                 pr = HTMLPageRetriever(self.strict)
                 pr.add_links(self.crawler_queue, self.directory, entry_url,
                              soup)
             elif 'text/xml' in mime_type:
                 #If the page is XML delegate it to SiteMapRetriever
                 sr = SiteMapRetriever(self.strict)
                 sr.add_links(self.crawler_queue, self.directory, entry_url,
                              soup)
             else:
                 return
             print "--> " + entry_url
             return
     except requests.exceptions.ConnectionError as e:
         self.visits.add(clean_url(remove_protocol(entry_url)))
         print "Ignoring " + entry_url + ", URL might be incorrect"
         return None
     except requests.exceptions.Timeout as e:
         self.visits.add(clean_url(remove_protocol(entry_url)))
         print "Ignoring " + entry_url + ", timeout error"
         return None
     except requests.exceptions.RequestException as e:
         self.visits.add(clean_url(remove_protocol(entry_url)))
         print "Ignoring: " + entry_url + ", " + e.message
         return None
     except RuntimeError as e:
         print e
         return None
Example #2
    def search(self, query):
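        # Note: `root`, `product`, `link`, `img`, `name` and `price` are assumed
        # to be module-level constants; the (start, end) marker strings are used
        # to slice each result's fields out of the raw HTML.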
        ret = []
        if query == None or len(query) == 0:
            return ret

        url = root + "/search-all?query=" + query
        u = urllib2.urlopen(url)
        page = u.read()
        result = page.find(product[0])

        while result != -1:
            link_start = page.find(link[0], result + 1) + len(link[0])
            link_end = page.find(link[1], link_start + 1)
            img_start = page.find(img[0], link_end + 1) + len(img[0])
            img_end = page.find(img[1], img_start + 1)
            name_start = page.find(name[0], img_end + 1) + len(name[0])
            name_end = page.find(name[1], name_start + 1)
            price_start = page.find(price[0], name_end + 1) + len(price[0])
            price_end = page.find(price[1], price_start + 1)
            result = page.find(product[0], result + 1)

            url = root + page[link_start:link_end]
            cleaned_url = clean_url(url)

            d = {
                "link": cleaned_url,
                "img": page[img_start:img_end],
                "name": page[name_start:name_end],
                "price": int(page[price_start:price_end]),
            }
            ret.append(d)

        u.close()
        return ret
Example #3
    def get_uri(self, query, params=None, **kwargs):
        """Get the the request url"""
        if isinstance(query, basestring):
            query = YQLQuery(query)
        query_params = self.get_query_params(query, params, **kwargs)

        token = kwargs.get("token")

        if hasattr(token, "yahoo_guid"):
            query_params["oauth_yahoo_guid"] = getattr(token, "yahoo_guid")

        if not token:
            raise ValueError("Without a token three-legged-auth cannot be"
                                                            " carried out")

        yql_logger.debug("query_params: %s", query_params)
        http_method = query.get_http_method()
        url = self.endpoint
        oauth_request = oauth.Request.from_consumer_and_token(
                                    self.consumer, http_url=url,
                                    token=token, parameters=query_params,
                                    http_method=http_method)
        yql_logger.debug("oauth_request: %s", oauth_request)
        # Sign request
        sig = self.get_signature(url)
        oauth_request.sign_request(sig, self.consumer, token)
        yql_logger.debug("oauth_signed_request: %s", oauth_request)
        url = oauth_request.to_url()
        url = clean_url(url)
        return url.replace('+', '%20').replace('%7E', '~')
Example #4
 def _insert_missing_har_urls(self, urls):
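     # Look up which of the given URLs already exist in har_urls, insert the
     # missing ones with is_advertising=0, then re-select so that every URL
     # maps to its row id.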
     result = dict()
     if len(urls) > 0:
         for i in range(len(urls)):
             urls[i] = utils.clean_url(urls[i], False)
         custom_condition = ' WHERE url=?'
         for i in range(1, len(urls)):
             custom_condition += ' or url=? '
         tmp = self.custom_select_from_table('har_urls', ['url', 'id'],
                                             custom_condition, tuple(urls))
         for row in tmp:
             result[row[0]] = row[1]
             urls.remove(row[0])
         if len(urls) > 0:
             urls_to_insert = [(url, 0) for url in urls]
             self.insert_data('har_urls', ['url', 'is_advertising'],
                              urls_to_insert)
             custom_condition = ' WHERE url=?'
             for i in range(1, len(urls)):
                 custom_condition += ' or url=? '
             tmp = self.custom_select_from_table('har_urls', ['url', 'id'],
                                                 custom_condition,
                                                 tuple(urls))
             for row in tmp:
                 result[row[0]] = row[1]
     return result
Example #5
 def compose_tweets(self, message):
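     # Split the message into tweet-sized chunks: words are accumulated until
     # adding the next word would push the running length past 130 characters,
     # and multi-part messages get an "(i/n)" suffix.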
     words = message.split(" ")
     tweets = []
     current = ""
     running_total = 0
     for index in range(len(words)):
         word = words[index].strip()
         next_word = None
         current += clean_url(word)
         running_total += self.get_length_for_word(word)
         # print '[{}] "{}"'.format(running_total, current.encode('utf-8'))
         if index + 1 < len(words):
             next_word = words[index + 1].strip()
         current += " "
         running_total += 1
         if next_word:
             if running_total + self.get_length_for_word(next_word) > 130:
                 tweets.append(current)
                 current = ""
                 running_total = 0
         else:
             tweets.append(current)
             break
     if len(tweets) > 1:
         for index in range(len(tweets)):
             tweets[index] += "({}/{})".format(index + 1, len(tweets))
     return tweets
Example #6
    def generate_chain(self, message):
        """Generates a Markov chain from a message"""
        words = message.split()
        words.append(self.STOPWORD)
        words.insert(0, self.STOPWORD)

        # find URLs, neaten them up
        for i in range(0, len(words)):
            words[i] = clean_url(words[i])
        if '<{}>'.format(self.users[self.BOT_ID]) in words[1]:
            del words[1]

        if len(words) < 2:
            return ''

        # remove stuff we don't know
        wordpair = ''
        index = 0
        seedcandidates = []
        while index < len(words) - 1:
            wordpair = words[index] + ' ' + words[index + 1]
            if self.dictionary.has_key(wordpair):
                seedcandidates.append(wordpair)
            index = index + 1
        if len(seedcandidates) == 0:
            return ''

        chain = ''

        seed = random.choice(seedcandidates)

        # forwards
        wordpair = seed
        if self.dictionary.has_key(wordpair):
            chain = wordpair
        #print wordpair
        while (wordpair.split()[1] != self.STOPWORD) and (self.dictionary.has_key(wordpair)):
            wordpair = wordpair.split()[1] + ' ' + \
                        choose_word_from_list(self.dictionary.get(wordpair)[1])
            #print wordpair
            chain = chain + ' ' + wordpair.split()[1]

        # backwards
        wordpair = seed
        if self.dictionary.has_key(wordpair) and wordpair.split()[0] != self.STOPWORD:
            wordpair = choose_word_from_list(
                self.dictionary.get(wordpair)[0]) + \
                ' ' + wordpair.split()[0]
        # so we don't have the seed twice


        while (wordpair.split()[0] != self.STOPWORD) and (self.dictionary.has_key(wordpair)):
            #print wordpair
            chain = wordpair.split()[0] + ' ' + chain
            wordpair = choose_word_from_list(
                self.dictionary.get(wordpair)[0]) + \
                ' ' + wordpair.split()[0]

        return chain.replace(self.STOPWORD, '')
Example #7
 def get_uri(self, query, params=None, **kwargs):
     """Get the the request url"""
     if isinstance(query, basestring):
         query = YQLQuery(query)
     params = self.get_query_params(query, params, **kwargs)
     query_string = urlencode(params)
     uri = '%s?%s' % (self.endpoint, query_string)
     uri = clean_url(uri)
     return uri
Example #8
def get_username_board(url):
    url = clean_url(url)
    m = re.search('pinterest.[a-zA-Z.]+?/([^/]+)/([^#\\?]+)', url)
    username, board = m.groups()
    board = urllib.parse.unquote(board).strip()
    while board.endswith('/'):
        board = board[:-1].strip()

    return (username, board)
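
A minimal, illustrative usage sketch, assuming clean_url merely normalizes the URL without altering its path:

    # Illustrative only: the expected values follow from the regex above.
    username, board = get_username_board('https://www.pinterest.com/alice/recipes/')
    # username == 'alice', board == 'recipes'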
Example #9
 def get_uri(self, query, params=None, **kwargs):
     """Get the the request url"""
     if isinstance(query, basestring):
         query = YQLQuery(query)
     query_params = self.get_query_params(query, params, **kwargs)
     http_method = query.get_http_method()
     request = self.__two_legged_request(parameters=query_params,
                                         method=http_method)
     url = request.to_url()
     return clean_url(url)
Example #10
 def same_domain_cleanup(self, entry_url, url):
     #Do not parse the URL query string
     url = (url.split('?')[0]) if (url is not None) else url
     #Do not use tokens following #, since it redirects to the same page
     url = (url.split('#')[0]) if (url is not None) else url
     if url is None or url.startswith('javascript'):
         url = None
     elif url == '/' or url == '':
         url = None
     elif url.startswith('//'):
         url = 'http:' + url
     elif url.startswith("mailto:"):
         #If its email links, ignore
         url = None
     #Same domain or relative URL cleanup and convertion into proper URL string
     elif url.startswith('/'):
         url = clean_url(get_base_url(entry_url)) + url
     elif url.startswith('./'):
         url = clean_url(entry_url) + clean_url(url.replace(".", ""))
     return url
Example #11
def make_request(url):
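    # Note: `out` (a shared, dict-like status sink) and RequestLinkArray are
    # assumed to be defined elsewhere in the module; the status strings are
    # Portuguese ("Please wait... loading" / "Request error").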
    out['text'] = 'Aguarde...Carregando...'
    response = None
    try:
        response = requests.get(clean_url(url))
    except requests.exceptions.RequestException as e:
        # catastrophic error. bail.
        out['text'] = 'Erro na Requisicao'

    requestLinkArray = RequestLinkArray(url, response)
    out['text'] = requestLinkArray
Example #12
 def _retrieve_outbound_links(self):
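     # Collect every href whose value does not contain the page's principal
     # domain, normalize it and deduplicate via the dict keys.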
     result = dict()
     principal_domain = utils.get_principal_domain(self._url)
     regex = "//*[@href and not(@href [contains(., '%s')])]" % principal_domain
     elements_with_urls = self._tree_explorer.xpath(self.body_node, regex)
     for element in elements_with_urls:
         href = element.attrib['href']
         if utils.is_valid_url(href):
             href = utils.clean_url(href)
             if href not in result:
                 result[href] = ''
     return list(result.keys())
Example #13
 def _prepare_tuple_failed_work(self, work_data_container):
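     # Flatten the failed-work container into a fixed-order tuple, presumably
     # matching the column layout expected by the database insert.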
     url = utils.clean_url(work_data_container.url, False)
     scheme, url = utils.split_url_and_scheme(url)
     scraped_flag = work_data_container.scraped
     attempts_count = work_data_container.attempts_count
     mime_type = work_data_container.mime_type
     response_code = work_data_container.http_response_code
     url_to_refer = work_data_container.url_to_refer
     error_text = work_data_container.error_text
     return scraped_flag, attempts_count, mime_type, response_code, None, url_to_refer, \
      None, False, None, None, None, \
      None, None, None, None, None, error_text, url, 0
Example #14
 def _get_canonical_url(self):
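     # Prefer <link rel="canonical">, fall back to the og:url / twitter:url
     # meta tags, then normalize scheme-relative and root-relative results.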
     result = None
     try:
         tmp_res = self.driver.find_element_by_xpath(
             '//link[@rel="canonical" and @href]')
         if tmp_res:
             href = tmp_res.get_attribute("href")
             if href:
                 # domain = utils.get_principal_domain(self.current_url)
                 result = href
     except NoSuchElementException:
         pass
     except TimeoutException:
         pass
     except Exception:
         pass
     if result is None:
         try:
             tmp_res = self.driver.find_element_by_xpath(
                 '//meta[@property="og:url"]|//meta[@name="twitter:url"]')
             result = tmp_res.get_attribute('content')
         except NoSuchElementException:
             pass
         except TimeoutException:
             pass
         except Exception:
             pass
     if result:
         result = utils.clean_url(result, False)
         tmp = utils.clean_url(self.current_url, False)
         scheme, u = utils.split_url_and_scheme(tmp)
         if result.startswith(r'//'):
             result = '{}:{}'.format(scheme, result)
         elif result.startswith(r'/'):
             domain = '{}://{}'.format(scheme,
                                       utils.get_principal_domain_www(tmp))
             result = '{}{}'.format(domain, result)
         if not utils.is_valid_url_to_navigate(result):
             result = None
     return result
Example #15
 def _prepare_tuple_without_article(self, work_data_container):
     har = None
     url = utils.clean_url(work_data_container.url, False)
     scheme, url = utils.split_url_and_scheme(url)
     scraped_flag = work_data_container.scraped
     attempts_count = work_data_container.attempts_count
     mime_type = work_data_container.mime_type
     response_code = work_data_container.http_response_code
     url_to_refer = work_data_container.url_to_refer
     pagecontent = work_data_container.page_content_container
     return scraped_flag, attempts_count, mime_type, \
      response_code, pagecontent.language, url_to_refer, pagecontent.text,\
      False, None, None, None, None, None, None, None, har, None, url, 0
Example #16
    def _extract_links_from_a_tags_in_text(self, text):
        """
        Extract supplement links from the html text that contains <a> tags
        with href attribute.

        @param text: HTML text.
        @type text: str

        @return: Dictionary with supplement links grouped by extension.
        @rtype: {
            '<extension1>': [
                ('<link1>', '<title1>'),
                ('<link2>', '<title2>')
            ],
            'extension2': [
                ('<link3>', '<title3>'),
                ('<link4>', '<title4>')
            ]
        }
        """
        soup = BeautifulSoup(text)
        links = [item['href'].strip()
                 for item in soup.find_all('a') if 'href' in item.attrs]
        links = sorted(list(set(links)))
        supplement_links = {}

        for link in links:
            filename, extension = os.path.splitext(clean_url(link))
            # Some courses put links to sites in supplement section, e.g.:
            # http://pandas.pydata.org/
            if extension == '':
                continue

            # Make lowercase and cut the leading/trailing dot
            extension = clean_filename(
                extension.lower().strip('.').strip(),
                self._unrestricted_filenames)
            basename = clean_filename(
                os.path.basename(filename),
                self._unrestricted_filenames)
            if extension not in supplement_links:
                supplement_links[extension] = []
            # Putting basename into the second slot of the tuple is important
            # because that allows downloading many supplements within a
            # single lecture, e.g.:
            # 01_slides-presented-in-this-module.pdf
            # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
            # 01_slides-presented-in-this-module_LM-3dtexton.pdf
            supplement_links[extension].append((link, basename))

        return supplement_links
Example #17
    def _extract_links_from_a_tags_in_text(self, text):
        """
        Extract supplement links from the html text that contains <a> tags
        with href attribute.

        @param text: HTML text.
        @type text: str

        @return: Dictionary with supplement links grouped by extension.
        @rtype: {
            '<extension1>': [
                ('<link1>', '<title1>'),
                ('<link2>', '<title2>')
            ],
            'extension2': [
                ('<link3>', '<title3>'),
                ('<link4>', '<title4>')
            ]
        }
        """
        soup = BeautifulSoup(text)
        links = [
            item['href'].strip() for item in soup.find_all('a')
            if 'href' in item.attrs
        ]
        links = sorted(list(set(links)))
        supplement_links = {}

        for link in links:
            filename, extension = os.path.splitext(clean_url(link))
            # Some courses put links to sites in supplement section, e.g.:
            # http://pandas.pydata.org/
            if extension == '':
                continue

            # Make lowercase and cut the leading/trailing dot
            extension = clean_filename(extension.lower().strip('.').strip(),
                                       self._unrestricted_filenames)
            basename = clean_filename(os.path.basename(filename),
                                      self._unrestricted_filenames)
            if extension not in supplement_links:
                supplement_links[extension] = []
            # Putting basename into the second slot of the tuple is important
            # because that allows downloading many supplements within a
            # single lecture, e.g.:
            # 01_slides-presented-in-this-module.pdf
            # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
            # 01_slides-presented-in-this-module_LM-3dtexton.pdf
            supplement_links[extension].append((link, basename))

        return supplement_links
Example #18
        def _add_asset(name, url, destination):
            filename, extension = os.path.splitext(clean_url(name))
            if extension == '':
                return

            extension = clean_filename(extension.lower().strip('.').strip(),
                                       self._unrestricted_filenames)
            basename = clean_filename(os.path.basename(filename),
                                      self._unrestricted_filenames)
            url = url.strip()

            if extension not in destination:
                destination[extension] = []
            destination[extension].append((url, basename))
Example #19
        def _add_asset(name, url, destination):
            filename, extension = os.path.splitext(clean_url(name))
            if extension == '':
                return

            extension = clean_filename(
                extension.lower().strip('.').strip(),
                self._unrestricted_filenames)
            basename = clean_filename(
                os.path.basename(filename),
                self._unrestricted_filenames)
            url = url.strip()

            if extension not in destination:
                destination[extension] = []
            destination[extension].append((url, basename))
Example #20
    def add_links(self, process_queue, directory, entry_url, soup):
        """
        Extract the valid links from the provided URL

        Parameters
        ----------

        process_queue: Queue

        directory: dictionary
            Dictionary to store the parent and child url mappings

        entry_url: string
            URL to extract the hyperlinks

        soup : BeautifulSoup extract            
        
        """
        link_set = soup.find_all("url")
        #If no url tag is found in the XML file, it is not a sitemap or it is malformed
        if not link_set:
            raise RuntimeWarning("malformed sitemap")

        directory[entry_url] = set()
        domain = remove_protocol(get_base_url(entry_url))

        for link in link_set:
            #Find the url tags from XML and clean them for relative paths
            link = link.findNext("loc")
            if link != None:
                url = self.same_domain_cleanup(entry_url, link.text)
                if (url != None):
                    #If strict flag is set, ignore the url from other domains
                    if (self.strict_domain == True):
                        if (remove_protocol(get_base_url(url)) != domain):
                            continue
                    if (url != entry_url):
                        #Load all the child URLs for further processing
                        process_queue.put(clean_url(url))
                        #Register a page, along with its child URLs, to be shown / saved as file
                        directory[entry_url].add(url)
        if link_set == None:
            raise RuntimeError("Ignoring " + entry_url +
                               ", malformed xml/sitemap")
        #Converting set into list for serialising
        directory[entry_url] = list(directory[entry_url])
Example #21
 def _parse_rss_entry(self, entry, language, feed_sections):
     title = get_attr_dinamically(entry, 'title')
     link = get_attr_dinamically(entry, 'link')
     link = utils.clean_url(link, remove_arguments=False)
     article_date = self._get_parsed_dates_from_object(
         entry, 'published_parsed')
     article_container = ArticleContainer(url=link,
                                          title=title,
                                          publish_date=article_date,
                                          top_img=None,
                                          sections=[feed_sections])
     extracted = PageContentContainer(None,
                                      url=link,
                                      article_c=article_container,
                                      language=language)
     self.data_collector.add_extracted_data(
         link, 0, 0, 'text/html', 0, page_content_container=extracted)
Example #22
    def process(self):
        try:
            if self.url_count == None:
                self.set_maximum()

            #Process until interrupted / count is matched
            while True:
                #Limit the number of threads to the user-specified value
                #enumerate always returns the child threads along with the main thread; ignore the latter
                if (len(threading.enumerate()) - 1) < self.multi:
                    #If count is exceeding, break the process
                    if self.count_exceeded():
                        break
                    #Wait for up to the timeout to retrieve from the queue
                    page = clean_url(
                        self.crawler_queue.get(block=True,
                                               timeout=self.timeout))
                    # If node hasn't been visited yet, proceed
                    if remove_protocol(page) not in self.visits:
                        try:
                            #Collect URL's from the page
                            threading.Thread(target=self.request,
                                             args=[page]).start()
                        except Exception as e:
                            print e.message
                            break
                else:
                    #Give some time for the threads to finish, since we should restrict the threads being spawned to the user-specified value
                    time.sleep(2)

        except KeyboardInterrupt as e:
            print '\n\n---------------------------'
            print "\nFinishing the running jobs..."
            print '\n-----------------------------'
        except Queue.Empty as e:
            print "\nDone.."
        except ValueError as e:
            print e.message
        finally:
            #Join all existing threads to main thread.
            for thread in threading.enumerate():
                if thread is not threading.currentThread():
                    thread.join(self.timeout)
        return self.directory
Example #23
 def retrieve_domain_links(self):
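     # Collect anchors whose href mentions the target domain, normalize them
     # and keep only navigable same-domain links, deduplicated via dict keys.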
     result = dict()
     # principal_domain = utils.get_principal_domain(self._url)
     # regex = "//*[regexp:test(@href, '^(https?://)?(www\.)?.*%s', 'i')]" % principal_domain
     # elements_with_urls = self.root.xpath(regex)
     expression = "//a[contains(@href, '%s')]" % self.domain
     elements_with_urls = self._tree_explorer.xpath(self.body_node,
                                                    expression)
     for element in elements_with_urls:
         href = element.attrib['href']
         href = utils.clean_url(href,
                                remove_arguments=False,
                                domain=self.domain,
                                scheme=self.scheme)
         if utils.is_valid_url_to_navigate(href):
             if utils.is_domain_link(href, self.domain):
                 if href not in result:
                     result[href] = ''
     return list(result.keys())
Example #24
 def _prepare_tuple_with_article(self, work_data_container):
     har = None
     url = utils.clean_url(work_data_container.url, False)
     scheme, url = utils.split_url_and_scheme(url)
     scraped_flag = work_data_container.scraped
     attempts_count = work_data_container.attempts_count
     mime_type = work_data_container.mime_type
     response_code = work_data_container.http_response_code
     url_to_refer = work_data_container.url_to_refer
     pagecontent = work_data_container.page_content_container
     art_container = pagecontent.article_c
     videos = ','.join(art_container.videos)
     authors = ','.join(art_container.authors)
     sections = ','.join(art_container.sections)
     publish_date = art_container.publish_date
     if publish_date and isinstance(publish_date, datetime.datetime):
         publish_date = utils.convert_datetime_to_format_str(publish_date)
     return scraped_flag, attempts_count, mime_type, response_code, pagecontent.language, url_to_refer, \
      pagecontent.text, True, art_container.title,\
      art_container.text, publish_date, \
      art_container.top_img, videos, authors, sections, har, None, url, 0
Example #25
def analyze_entry(site, e):
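    # Pick the entry timestamp, locate the menea.me short link in the entry
    # body, store the entry and post a notification; returns False whenever
    # the entry cannot be processed.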
    if hasattr(e, 'published_parsed') and e.published_parsed:
        timestamp = time.mktime(e.published_parsed)
    elif hasattr(e, 'updated_parsed') and e.updated_parsed:
        timestamp = time.mktime(e.updated_parsed)
    else:
        return False

    if timestamp > time.time(): timestamp = time.time()

    if hasattr(e, "content"): content = e.summary
    else: content = e.description

    try:
        g = re.search(r'Noticia en Men&eacute;ame: (http://menea.me/(\w+)) ',
                      content)
    except:
        return False

    if g:
        id = int(g.group(2), 36)
        original_url = g.group(1)
        entry = dict()
        entry["site"] = site.encode('ascii', 'xmlcharrefreplace')
        entry["title"] = e.title.encode('ascii', 'xmlcharrefreplace')
        entry['url'] = clean_url(e.link)
        entry['ts'] = int(timestamp)
        entry['id'] = id
        res = store(site, entry)
        if res:
            post = "%s: &laquo;%s&raquo; %s (%s)" % (
                entry["site"], entry["title"], entry['url'], original_url)
            print "Posting", post
            if not post_note(post):
                print "Error posting"
            return res

    return False
Example #26
    def add_links(self, process_queue, directory, entry_url, soup):
        """
        Extract the valid links from the provided URL

        Parameters
        ----------

        process_queue: Queue

        directory: dictionary
            Dictionary to store the parent and child url mappings

        entry_url: string
            URL to extract the hyperlinks

        soup : BeautifulSoup extract            
        
        """ 
        
        link_set = soup.find_all('a', href=True)
        directory[entry_url] = set()
        domain = remove_protocol(get_base_url(entry_url))
        for link in link_set:
            #Find the urls from the soup extract and clean them for relative paths
            url = self.same_domain_cleanup(entry_url, link.get('href'))
            if (url != None):
                #If strict flag is set, ignore the url from other domains
                if (self.strict_domain == True): 
                    if (remove_protocol(get_base_url(url)) != domain):
                        continue
                if (url != entry_url):
                    #Load all the child URLs for further processing
                    process_queue.put(clean_url(url))
                    #Register a page, along with its child URLs, to be shown / saved as file
                    directory[entry_url].add(url)
        #Converting set into list for serialising    
        directory[entry_url] = list(directory[entry_url])
Example #27
def analyze_entry(site, e):
		if hasattr(e, 'published_parsed') and e.published_parsed:
			timestamp = time.mktime(e.published_parsed)
		elif hasattr(e, 'updated_parsed') and e.updated_parsed:
			timestamp = time.mktime(e.updated_parsed)
		else:
			return False

		if timestamp > time.time(): timestamp = time.time()

		if hasattr(e, "content"): content = e.summary
		else: content = e.description

		try:
			g = re.search(r'Noticia en Men&eacute;ame: (http://menea.me/(\w+)) ', content)
		except:
			return False

		if g:
			id = int(g.group(2), 36)
			original_url = g.group(1)
			entry = dict()
			entry["site"] = site.encode('ascii', 'xmlcharrefreplace')
			entry["title"] = e.title.encode('ascii', 'xmlcharrefreplace')
			entry['url'] =  clean_url(e.link)
			entry['ts'] = int(timestamp)
			entry['id'] = id
			res = store(site, entry)
			if res:
				post = "%s: &laquo;%s&raquo; %s (%s)" % (entry["site"], entry["title"], entry['url'], original_url)
				print "Posting", post
				if not post_note(post):
					print "Error posting"
				return res

		return False
Example #28
        input = open('Markov_Dict.pkl', 'r')
        self.dictionary = pickle.load(input)
        input.close()
        self.dictLock.release()

    def toggle_learn(self):
        """Toggles the learning state"""
        self.isLearning = not self.isLearning

    def clean_urls_in_dictionary(self):
        self.dictLock.acquire()
        newdict = copy.deepcopy(self.DEFAULT_DICTIONARY)
        for key in self.dictionary:
            firsts = self.dictionary.get(key)[0]
            for i in range(0, len(firsts)):
                firsts[i] = (clean_url(firsts[i][0]), firsts[i][1])
            seconds = self.dictionary.get(key)[1]
            for i in range(0, len(seconds)):
                seconds[i] = (clean_url(seconds[i][0]), seconds[i][1])
            newkey = clean_url(key.split()[0])
            if len(key.split()) > 1:
                newkey = newkey + ' ' + clean_url(key.split()[1])
            newdict[newkey] = (firsts, seconds)
        self.dictionary = newdict
        self.dictLock.release()

def word_index_in_list(findword, word_list):
    """Get the index of a word in a list"""
    for index in range(len(word_list)):
        if word_list[index][0] == findword:
            return index
Example #29
    def page_ad_matching(self, url):
        """
        Function that matches URL to existing advertisers by relevance scores.
        :param url: Input URL
        :return: Advertisers, sorted by relevance scores.
        """
        self.logger.debug('Keyword_2_company: ' + str(self.keyword_to_company))
        self.logger.debug('ADVECS: ' + str(self.ad_keywords))

        if url in self.articles:
            cleaned = self.articles[url]
        else:
            cleaned = clean_url(url)

        text = cleaned['title'] + '. ' + cleaned['text']

        # if self.params['use_sentiment']:
        #     candidates = self.all_companies.copy()
        # else:
        #     candidates = self.all_companies_scalar.copy()

        # triggers = {}

        doc_vec = {}
        bid_companies = set()
        blacklisted_companies = set()

        scores = self.all_companies_scalar.copy()

        # Fetching target companies.
        targets = self.get_target_company(cleaned['title'])
        self.logger.debug('-----------' + str(targets))
        self.logger.debug('-----------' + cleaned['title'])

        # Reverting back to regular analysis if no targets are found.
        if len(targets) == 0:
            self.params['targeted_sent'] = False

        first_chunk = True
        sentiment = np.array([0, 0])

        for chunk in self.tokenizer.get_chunk(text):
            # Sentiment Extraction from Chunk
            if self.params['use_sentiment']:
                sentiment, tokens = self.analyzer_obj.get_sentiment_from_text(
                    chunk)
                if first_chunk:
                    # Fixed attention weight for the first chunk.
                    sentiment = 2 * sentiment
                self.logger.debug('Sentiment extracted: ' + str(sentiment))
                self.logger.debug(chunk)
            else:
                tokens = self.analyzer_obj.preprocess(chunk,
                                                      join_negatives=False)

            for idx, token in enumerate(tokens):
                # Bidding Companies
                try:
                    companies = self.keyword_to_company[token]
                    self.logger.debug('Found trigger: ' + token)
                    if self.params['use_sentiment']:
                        if not np.any(sentiment):
                            # Neutral Sentiment
                            sentiment += np.array([0.5, 0])
                        APNEA.add_to_dict(doc_vec, token, sentiment)
                    else:
                        APNEA.add_to_dict(doc_vec, token, 1)

                    bid_companies |= companies
                except KeyError:
                    pass

                try:
                    # Blacklist companies.
                    companies = self.blacklist_to_company[token]
                    blacklisted_companies |= companies
                except KeyError:
                    try:
                        # Handling 2 worded blacklist phrases.
                        if idx + 1 < len(tokens):
                            companies = self.blacklist_to_company[token + ' ' +
                                                                  tokens[idx +
                                                                         1]]
                            blacklisted_companies |= companies
                    except KeyError:
                        pass

            if first_chunk:
                first_chunk = False

        # Calculating the Relevance Scores, based on the system configuration.
        for c in bid_companies:
            ad_vec = self.ad_keywords[c]
            doc_vec_copy = doc_vec.copy()
            if self.params['use_sentiment']:
                for k in doc_vec_copy:
                    if self.params['targeted_sent']:
                        if c in targets:
                            if self.params[
                                    'neg_sent'] and k in ad_vec and self.ad_negatives[
                                        c][k]:
                                # Targeted but Sentiment Insensitive.
                                doc_vec_copy[k] = abs(
                                    self.score_sentiment(
                                        doc_vec_copy[k],
                                        method=self.params['scorer']))
                            else:
                                # Targeted and Sentiment Sensitive
                                doc_vec_copy[k] = self.score_sentiment(
                                    doc_vec_copy[k],
                                    method=self.params['scorer'])
                        else:
                            # Non-targets are taken as absolute values
                            doc_vec_copy[k] = abs(
                                self.score_sentiment(
                                    doc_vec_copy[k],
                                    method=self.params['scorer']))
                    else:
                        if self.params[
                                'neg_sent'] and k in ad_vec and self.ad_negatives[
                                    c][k]:
                            # Sentiment Insensitive
                            doc_vec_copy[k] = abs(
                                self.score_sentiment(
                                    doc_vec_copy[k],
                                    method=self.params['scorer']))
                        else:
                            # Sentiment Sensitive
                            doc_vec_copy[k] = self.score_sentiment(
                                doc_vec_copy[k], method=self.params['scorer'])

            self.logger.debug('Company========>: ' + str(c))
            self.logger.debug('ad_vec: ' + str(ad_vec))
            self.logger.debug('doc_vec: ' + str(doc_vec))
            self.logger.debug('doc_vec_copy: ' + str(doc_vec_copy))

            scores[c] = self.cosine(ad_vec, doc_vec_copy)

        # Handling blacklist companies.
        if self.params['blacklist']:
            for blacklisted_company in blacklisted_companies:
                scores[blacklisted_company] = -(abs(
                    scores[blacklisted_company]))

        self.logger.debug('scores: ' + str(scores))

        # Scoring and Sorting.
        scored = sorted(scores.iteritems(), key=lambda x: x[1], reverse=True)
        return scored
Example #30
 def start_requests(self):
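     # Scrapy entry point: clean each seed URL before requesting it and carry
     # the original, uncleaned URL along in meta['key'].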
     for url in self.urls:
         yield scrapy.Request(url=utils.clean_url(url),
                              callback=self.parse_url,
                              meta={'key': url})
Example #31
 def get(self, url, https=False):
     url = clean_url(url, self.base_url, https=https)
     self.driver.get(url)
Example #32
        try:
            source = newspaper.build(url, config=config)
        except Exception as e:
            if PY_ENV == 'development':
                print('(SOURCE ERROR) Source Skipped\n')
            insert_log(source_id, 'sourceCrawl', 'error',
                       float(time.clock() - src_start_time), {
                           'errorMessage': 'SOURCE ERROR',
                           'crawlerName': 'credible crawler'
                       })
            continue

        error_articles = []
        prev_uuid = ''
        for article in source.articles:
            url_uuid = get_uuid(clean_url(article.url))
            article.id = url_uuid

            if prev_uuid == url_uuid:
                continue

            if get_one(url_uuid, 'errorArticles') or get_one(
                    url_uuid, 'articles'):
                print('Skipped: ' + article.url)
                error_articles.append(article.id)

            prev_uuid = url_uuid

        source.articles = [
            a for a in source.articles if a.id not in error_articles
        ]
Example #33
 def engage(self):
     print "\nCrawler engaged, to interrupt press Ctrl+C or Command + dot/period"
     print "-" * 20
     #Initiate the crawler, by placing the first URL in the processing queue
     self.crawler_queue.put(clean_url(self.root))
     return self.process()
Example #34
    def do_commands(self, target, sender, message, sentByAdmin):
        if sentByAdmin and ('!saveDict' in message):
            try:
                self.save_dictionary()
                self.send_message(target, 'DICTIONARY SAVED SUCCESSFULLY')
            except IOError:
                self.send_message(target, 'DICTIONARY COULD NOT BE SAVED')
            return True
        elif sentByAdmin and ('!loadDict' in message):
            try:
                self.load_dictionary()
                self.send_message(target, 'DICTIONARY LOADED SUCCESSFULLY')
            except IOError:
                self.send_message(target, 'DICTIONARY COULD NOT BE LOADED')
            return True
        elif sentByAdmin and ('!eraseDict' in message):
            self.dictionary = {
                self.STOPWORD : ([self.STOPWORD], [self.STOPWORD])
            }
            self.send_message(target, 'DICTIONARY ERASED (NOT SAVED YET)')
            return True
        elif sentByAdmin and ('!learn' in message):
            self.toggle_learn()
            print_message = 'I AM {} LEARNING'
            self.send_message(target,
                              print_message.format('NOW' if self.isLearning else 'NO LONGER'))
            return True
        elif sentByAdmin and ('!cleanURL' in message):
            self.clean_urls_in_dictionary()
            self.send_message(target, 'LINKS IN DICTIONARY HAVE BEEN CLEANED')
            return True
        elif '!search' in message:
            try:
                message = message.lower()
                searchterms = message.split()[1:]
                for i in range(0, len(searchterms)):
                    searchterms[i] = clean_url(searchterms[i])
                if len(searchterms) == 1:
                    phrases = []
                    for key in self.dictionary:
                        if searchterms[0] == key.split()[0] or \
                                             (len(key.split()) > 1 and \
                                             searchterms[0] == key.split()[1]):
                            phrases.append(key)
                    self.send_message(target, '"%s" in pairs: %s' % (searchterms[0], str(phrases)))
                else:
                    key = searchterms[0] + ' ' + searchterms[1]
                    if self.dictionary.has_key(key):
                        self.send_message(target, '"%s": %s' % (key, str(self.dictionary.get(key))))
                    else:
                        self.send_message(target, '"%s" not found in dictionary' % key)
            except IndexError:
                self.send_message(target, 'MALFORMED COMMAND')
            return True
        elif '!talkback' in message:
            try:
                self.talkBackFreq = float(message.split()[1])
                self.send_message(target, ('RESPONDING PROBABILITY SET TO %3f' % self.talkBackFreq))
            except (IndexError, TypeError):
                self.send_message(target, 'MALFORMED COMMAND')
            return True
        elif sentByAdmin and ('!quit' in message):
            self.quit()
            return True
        elif '!avatar' in message:
            self.send_message(target, 'SOURCE OF MY CURRENT AVATAR: %s' % self.AVATARSOURCE)
            return True

        elif ('!nowplaying' in message):
            songname, songartist = self.generate_song()
            self.send_message(target, 'Now Playing: "%s", by %s' % (string.capwords(songname), string.capwords(songartist)))
            return True

        return False # did not find a command
Example #35
 def init(self):
     self.url = clean_url(self.url)
Example #36
    def init(self):
        self.url = clean_url(self.url)
        url = self.url

        # Determine the type
        if 'bookmark.php?type=user' in url or url.startswith(headers['following']):
            type = 'following'
        elif 'bookmark.php' in url or url.startswith(headers['bookmark']) or '/bookmarks/' in url:
            type = 'bookmark'
        elif 'illust_id=' in url or url.startswith(headers['illust']) or '/artworks/' in url:
            type = 'illust'
        elif 'search.php' in url or url.startswith(headers['search']):
            type = 'search'
            order = query_url(url).get('order', ['date_d'])[0] # date_d, date, popular_d, popular_male_d, popular_female_d
            scd = query_url(url).get('scd', [None])[0] # 2019-09-27
            ecd = query_url(url).get('ecd', [None])[0] # 2019-09-28
            blt = query_url(url).get('blt', [None])[0] # 5000
            bgt = query_url(url).get('bgt', [None])[0] # 9999
            type_ = query_url(url).get('type', [None])[0] # None (all), illust, manga, ugoira
            self.info = {'order': order, 
               'scd': scd, 
               'ecd': ecd, 
               'blt': blt, 
               'bgt': bgt, 
               'type': type_}
        elif '/tags/' in url:
            type = 'search'
            order = query_url(url).get('order', ['date_d'])[0]
            scd = query_url(url).get('scd', [None])[0]
            ecd = query_url(url).get('ecd', [None])[0]
            blt = query_url(url).get('blt', [None])[0]
            bgt = query_url(url).get('bgt', [None])[0]
            type_ = query_url(url).get('type', [None])[0] # None (all), illust, manga, ugoira
            if type_ is None:
                try:
                    type_ = url.split('/tags/')[1].split('/')[1]
                except IndexError:
                    type_ = None
                type_ = {'illustrations': 'illust'}.get(type_, type_)
            self.info = {'order': order, 
               'scd': scd, 
               'ecd': ecd, 
               'blt': blt, 
               'bgt': bgt, 
               'type': type_}
        elif 'id=' in url and 'mode=' not in url or url.startswith(headers['user']) or 'pixiv.me' in url or '/users/' in url:
            type = 'user'
        else:
            self.Invalid((u'[pixiv] Can not determine type: {}').format(url))
            return 'stop'
        header = headers[type]
        if 'pixiv.net' in url or 'pixiv.me' in url:
            if not url.startswith('http://') and not url.startswith('https://'):
                url = u'https://' + url
            self.url = url
        else:
            url = url.replace('bmk_', '').replace('illust_', '').replace('pixiv_', '').replace('search_', '')
            if type == 'user':
                url = 'https://www.pixiv.net/member_illust.php?id={}'.format(url)
            elif type == 'bookmark':
                url = 'https://www.pixiv.net/bookmark.php?id={}'.format(url)
            elif type == 'illust':
                url = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id={}'.format(url)
            elif type == 'search':
                url = 'https://www.pixiv.net/search.php?s_mode=s_tag&word={}'.format(url)
                url = clean_url(url)
            else:
                self.Invalid('{}{}: ???'.format(header, url))
                return 'stop'
            self.url = url
        self.print_('PIXIV_TYPE: {}'.format(type))
        self.pixiv_type = type
        try:
            self.api = pixiv_auth.get_api()
            if 'error' in self.api.user_detail(11):
                self.api = pixiv_auth.get_api(force=True)
        except Exception as e:
            self.print_(print_error(e)[0])
            self.Invalid(tr_('로그인 실패: {}{}\n[옵션 - 설정 - 픽시브 설정 - 로그인] 에서 설정해주세요.').format(header, url))
            return 'stop'
Example #37
        try:
            meta = requests.get(
                'http://localhost:5000/api/exposed/submit/meta?url=http://' +
                url).json()
        except:
            meta = {
                'aboutUsUrl': '',
                'contactUsUrl': '',
            }

        info = {
            'isReliable':
            False,
            'id':
            get_uuid(clean_url(domain)),
            'brand':
            s['name'],
            'url':
            clean_url(domain),
            'socialScore':
            get_popularity('http://' + url)['totalScore'],
            'worldRank':
            world_rank,
            'countryRank':
            country_rank,
            'aboutUsUrl':
            '' if meta['aboutUsUrl'] in ['http://#', 'https://#'
                                         ] else meta['aboutUsUrl'],
            'contactUsUrl':
            '' if meta['contactUsUrl'] in ['http://#', 'https://#'] else
Example #38
 def fix_url(cls, url):
     url = clean_url(url)
     return url.split('?')[0]
Example #39
 def get(self, url, https=False):
     url = clean_url(url, self.base_url, https=https)
     self.driver.get(url)