Example #1
 def test_cacheing(self):
     # First call resolves the URLs and writes the results to __cache.json.
     self.assertEqual(
         self.resolved_urls,
         urlexpander.expand(self.urls, cache_file='__cache.json'))
     # Second call should return the same results, now served from the cache.
     self.assertEqual(
         self.resolved_urls,
         urlexpander.expand(self.urls, cache_file='__cache.json'))
     os.remove('__cache.json')
Example #2
import re

import pandas as pd
import urlexpander as uex


def link_report(df):
    tweet_df = df['Tweet']
    all_tweet_links = ""
    for tweet in tweet_df:
        all_tweet_links += tweet + " "

    all_urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        all_tweet_links)

    print("*" * 50)
    print("Total number of links : " + str(len(all_urls)))
    print("*" * 50)

    links = {'Links': all_urls}

    link_df = pd.DataFrame(links, columns=['Links'])

    domain_count = {}
    for link in link_df['Links']:
        url = uex.expand(link)
        domain = uex.get_domain(url)
        # print(domain)
        if domain in domain_count:
            domain_count[domain] += 1
        else:
            domain_count[domain] = 1

    # print(domain_count)

    domain_df = pd.DataFrame.from_dict(domain_count,
                                       orient='index',
                                       columns=['Count'])

    print(domain_df.sort_values(by=['Count'], ascending=False).to_string())
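A quick way to exercise link_report, assuming the imports shown above; the DataFrame rows and the shortened link are made up for illustration:

# Hypothetical input: a DataFrame with a 'Tweet' column of raw tweet text.
sample = pd.DataFrame({
    'Tweet': [
        'interesting read https://bit.ly/xyz',
        'no links in this one',
    ]
})
link_report(sample)   # prints the link count and a per-domain table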
Example #3
def create_reddit_actions(es, lines_json, tmp_filename, calc_embeddings=False, expand_urls=False):
    urls_dict = {}
    all_urls = []
    actions = []

    for post in lines_json:
        try:
            urls = extractor.find_urls(str(post['body']))
        except AttributeError:
            post['smapp_urls'] = []
            continue
        post['smapp_urls'] = urls
        all_urls.extend(urls)
    all_urls = [url for url in all_urls if 'reddit.com' not in url]
    if expand_urls:
        expanded_urls = urlexpander.expand(all_urls,
                                           chunksize=1280,
                                           n_workers=64,
                                           cache_file=tmp_filename)
        urls_dict = dict(zip(all_urls, expanded_urls))
    for post_num, post in enumerate(lines_json):
        post = preprocess_reddit_post(post, calc_embeddings, urls_dict)
        period = str(pd.to_datetime(
            post['created_utc'], unit='s').to_period('M'))
        index_name = f'smapp_reddit_{period}'
        action = {
            "_index": index_name,
            "_type": '_doc',
            "_id": str(post['id']),
            "_source": post,
        }
        actions.append(action)

    return actions
Example #4
 def _parse_signup_url(self, url: str) -> Tuple[ParseResult, dict]:
     # Note: dict.get evaluates its default eagerly, so urlexpander.expand()
     # runs here even when the URL is already in the cache.
     expanded_url = self._url_cache.get(url, urlexpander.expand(url))
     if "list-manage.com/subscribe" not in expanded_url:
         raise ValueError(
             "It doesn't look like you gave us a MailChimp URL form")
     ps = urlparse(expanded_url)
     ps = ps._replace(path=f"{ps.path}/post")
     qs = query_string.parse(ps.query)
     return ps, qs
Example #5
def yesorno(request):
    thetesturl = request.GET.get("url")
    if not validators.url(thetesturl):
        response = {"status": False, "data": {"error": "not a url"}}
        return JsonResponse(response,
                            json_dumps_params={'indent': 2},
                            status=200)
    if "latLmes" in thetesturl:
        response = {
            "status": True,
            "data": {
                "url": "latLmes.com",
                "rickroll": True
            }
        }
        return JsonResponse(response, json_dumps_params={'indent': 2})

    testurl = urlexpander.expand(thetesturl).replace("www.", "")
    query = urlparse(testurl)
    path = query.path
    if query.hostname == "rickroll-links-database.ch1ck3n.repl.co":
        response = {
            "status": True,
            "data": {
                "url": "repl.co",
                "rickroll": False
            }
        }
        return JsonResponse(response, json_dumps_params={'indent': 2})
    if query.hostname == "theraleighregister.com":
        response = {
            "status": True,
            "data": {
                "url": "theraleighregister.com",
                "rickroll": "true"
            }
        }
        return JsonResponse(response, json_dumps_params={'indent': 2})
    if path == "/":
        path == ""
    if query.query:
        print(query.query)
        path = path + "?" + query.query
    hostname = str(query.hostname)
    if hostname == "None":
        hostname = ""
    response = {
        "status": True,
        "data": {
            "url": str(hostname) + str(path),
            "rickroll": str(str(hostname) + str(path) in links)
        }
    }
    return JsonResponse(response, json_dumps_params={'indent': 2})
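A rough way to drive this Django view outside a running server, assuming the surrounding project (its settings and the module-level links collection); the URL below is a placeholder:

# Sketch only: needs Django settings configured and the project's `links` set.
from django.test import RequestFactory

factory = RequestFactory()
fake_request = factory.get('/yesorno', {'url': 'https://example.com/some-page'})
print(yesorno(fake_request).content)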
Example #6
def unpack_google_url():
    try:
        short_url = request.args.get('short_url')
        response_url = urlexpander.expand(short_url)
        pattern = r'@(-?\d+\.\d+),(-?\d+\.\d+)'
        res = re.search(pattern, response_url)
        if res is not None:
            result = {'lat': res.group(1), 'lng': res.group(2), 'status': 'OK'}
        else:
            result = {'status': 'unable to parse', 'url': response_url}
        return json.dumps(result)
    except Exception as e:
        result = {'status': 'error', 'description': str(e)}
        return json.dumps(result)
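The regular expression above pulls the latitude/longitude pair out of the expanded Google Maps link; here is the same pattern applied to a made-up expanded URL:

# Illustration only: in the handler the URL comes from urlexpander.expand(short_url).
import re

expanded = 'https://www.google.com/maps/place/Somewhere/@40.712776,-74.005974,17z'
match = re.search(r'@(-?\d+\.\d+),(-?\d+\.\d+)', expanded)
if match:
    print({'lat': match.group(1), 'lng': match.group(2)})   # {'lat': '40.712776', 'lng': '-74.005974'}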
Example #7
def unshorturls(proxy: Cut, batch=False):
    """Resolves shortened urls
    
    Arguments:
        proxy {Cut} -- Twitter status object (dict) under scalpl access

    
    Keyword Arguments:
        batch {bool} -- [description] (default: {False})
    
    Returns:
        [list] -- List of tuples with (shortened url, unshortened url, domain)
    """

    KEY_URLS = ['urls', 'media',
                'quoted_status.media', 'quoted_status.urls',
                'retweeted_status.media', 'retweeted_status.urls']
    shortened_urls = []

    for k in KEY_URLS:
        if k in proxy:
            for i in range(0,len(proxy[k])):
                cur_k_urls_expanded =  k + "[%d].expanded_url" % i
                if cur_k_urls_expanded in proxy:
                    c_url_expanded = proxy[cur_k_urls_expanded]
                    not_resolved = False
                    #if urlexpander.is_short(c_url_expanded) or "lajunta.es" in c_url_expanded:
                    if batch == False:
                        try:
                            proxy[cur_k_urls_expanded] = urlexpander.expand(c_url_expanded, filter_function=__custom_filter)
                        except Exception:
                            not_resolved = True
                            logger.warning("I can't expand: %s" % proxy[cur_k_urls_expanded])
                    
                    if batch == False and not_resolved == False:
                        # Obtain a new key with only domain 
                        parsed_uri = urlparse(proxy[cur_k_urls_expanded])
                        domain = '{uri.netloc}'.format(uri=parsed_uri)
                        cur_k_urls_expanded_domain = cur_k_urls_expanded.replace('expanded_url','expanded_domain')
                        proxy[cur_k_urls_expanded_domain] = domain
                        shortened_urls += [(c_url_expanded,  proxy[cur_k_urls_expanded], domain)]
                    else:
                        shortened_urls += [(c_url_expanded,  None, None)]
    
    return shortened_urls
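A minimal sketch of calling unshorturls, assuming the scalpl package; with batch=True nothing is actually resolved, so the status dict and URL below are placeholders:

# Sketch only: batch=True collects the URLs without expanding them.
from scalpl import Cut

status = {'urls': [{'expanded_url': 'https://bit.ly/example'}]}
print(unshorturls(Cut(status), batch=True))
# -> [('https://bit.ly/example', None, None)]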
Example #8
 def test_expand_many(self):
     self.assertEqual(self.resolved_urls, urlexpander.expand(self.urls))
Example #9
 def test_expand_one(self):
     self.assertEqual(self.resolved_urls[0],
                      urlexpander.expand(self.urls[0]))
Example #10
            break
        for tweet in new_tweets:
            tweet_list.append(tweet.full_text)
            #print(tweet.full_text)
            print(tweet.url)
        tweetCount += len(new_tweets)
        print("Found {0} tweets".format(tweetCount))
        max_id = new_tweets[-1].id
    except tweepy.TweepError as e:
        # Just exit if any error
        print("some error : " + str(e))
        break
    
print ("Downloaded {0} tweets.".format(tweetCount))

for tweet in tweet_list:
    songUrls = re.findall(urlmarker.URL_REGEX, tweet)
    #songUrls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', tweet)
    for songUrl in songUrls:
        print(songUrl)
        expanded = urlexpander.expand(songUrl)
        print(expanded)
        if "spotify" in songUrl:
            songUri = getUri(songUrl)
            if songUri is not None:
                print("added a song!")
                requests.post("https://api.spotify.com/v1/playlists/34PelCJCvwUvOQIbhoN0he/tracks?uris="+songUri+" Accept: application/json"+" Content-Type: application/json"+" Authorization: Bearer "+spotifyAuth)
                print("added a song!3223123123")
        else:
            print("not a track, can't add!")
Example #11
def createNotionTask(token, collectionURL, content, url):
    def convertImagePath(imagePath, mdFilePath):
        parsed_url = urllib.parse.urlparse(url)
        domain = parsed_url.scheme + '://' + parsed_url.netloc

        relative_url = os.path.abspath(
            str(pathlib.Path().absolute()) + '/images/' + imagePath)
        new_url = urllib.parse.urljoin(domain, imagePath)
        r = http.request('GET', new_url)
        img = r.data

        os.makedirs(os.path.dirname(relative_url), exist_ok=True)
        with open(relative_url, 'wb') as f:
            f.write(img)

        return Path(
            os.path.abspath(str(pathlib.Path().absolute()) + imagePath))

    if (content):
        client = NotionClient(token)
        cv = client.get_collection_view(collectionURL)
        print(cv.collection.parent.views)
        row = cv.collection.add_row()

        if ('task:' in content):
            content = content.replace('task:', '')

        if ('Task:' in content):
            content = content.replace('Task:', '')

        row.title = content

        if (url and "http://ifttt.com/missing_link" not in url):
            expanded_url = urlexpander.expand(url)
            if ('imgur' in expanded_url):
                if 'gallery/' in expanded_url:
                    gallery = expanded_url.split('gallery/')[1]

                    client = ImgurClient(client_id, client_secret)
                    items = client.get_album_images(gallery)

                    imgur_object = ""
                    for item in items:
                        img = "<img src='" + item.link + "' /><br>"
                        imgur_object += img

                    text = prettierfier.prettify_html(imgur_object)
                    doc = Document(text)
                    text = doc.summary()

                    output = pypandoc.convert_text(text,
                                                   'gfm-raw_html',
                                                   format='html')

                    if (output == ""):
                        page = row.children.add_new(BookmarkBlock)
                        page.link = url
                        page.title = content
                    else:
                        rendered = convert(output)
                        for blockDescriptor in rendered:
                            uploadBlock(blockDescriptor,
                                        row,
                                        content,
                                        imagePathFunc=convertImagePath)
            else:
                # try:
                #     row.url = url
                #
                #     http = urllib3.PoolManager()
                #     r = http.request('GET', url)
                #
                #     text = prettierfier.prettify_html(str(r.data))
                #     soup = BeautifulSoup(str(r.data))
                #     metas = soup.find_all('meta')
                #     doc = Document(text)
                #     text = doc.summary()
                #     print(metas)
                #     output = pypandoc.convert_text(text, 'gfm-raw_html', format='html')
                #     output = output.replace('\\\\n', '')
                #     output = output.replace('\\\\t', '')
                #     output = output.replace("\\\\'", "\'")
                #     print(output)
                #
                #
                #     if (output == ""):
                #         print("wtf1")
                #         raise ValueError('No website data')
                #
                #     rendered = convert(output)
                #
                #     # Upload all the blocks
                #     for blockDescriptor in rendered:
                #         uploadBlock(blockDescriptor, row, doc.title(),imagePathFunc=convertImagePath)
                # except:
                page = row.children.add_new(BookmarkBlock)
                page.link = url
                page.title = content
        else:
            row.children.add_new(TextBlock, title=content)

        # shutil.rmtree(Path(str(pathlib.Path().absolute()) + '/images/'), ignore_error=True)
        return content
Example #12
def expand_urls(config):

    short_link_services = [
        'bit.ly',
        'dlvr.it',
        'liicr.nl',
        'tinyurl.com',
        'goo.gl',
        'ift.tt',
        'ow.ly',
        'fxn.ws',
        'buff.ly',
        'back.ly',
        'amzn.to',
        'nyti.ms',
        'nyp.st',
        'dailysign.al',
        'j.mp',
        'wapo.st',
        'reut.rs',
        'drudge.tw',
        'shar.es',
        'sumo.ly',
        'rebrand.ly',
        'covfefe.bz',
        'trib.al',
        'yhoo.it',
        't.co',
        'shr.lc',
        'po.st',
        'dld.bz',
        'bitly.com',
        'crfrm.us',
        'flip.it',
        'mf.tt',
        'wp.me',
        'voat.co',
        'zurl.co',
        'fw.to',
        'mol.im',
        'read.bi',
        'disq.us',
        'tmsnrt.rs',
        'usat.ly',
        'aje.io',
        'sc.mp',
        'gop.cm',
        'crwd.fr',
        'zpr.io',
        'scq.io',
        'trib.in',
        'owl.li',
        'youtu.be',
    ]

    urls_table = pd.read_csv(os.path.join(
        config["PATHS"]["INTERMEDIATE_DATA_DIR"], "tweet_url_table.csv"),
                             usecols=["tweet_id", "url"])

    urls_tweet_id = dict()
    for ix, row in urls_table.iterrows():
        url = row["url"]
        tweet_id = row["tweet_id"]
        domain = extract_top_domain(url)
        if domain in short_link_services:
            urls_tweet_id[url] = tweet_id

    print("No. urls to expand: " + str(urls_tweet_id.__len__()))

    q = queue.Queue()

    def expand_domain(short_url):
        expanded_url = infer_base_url(short_url)
        top_domain = extract_top_domain(expanded_url)
        q.put([short_url, expanded_url, top_domain])
        print("Working on {}, {} ".format(short_url, len(q.queue)))

    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        executor.map(expand_domain, list(urls_tweet_id.keys()))

    res_df = pd.DataFrame(list(q.queue),
                          columns=['short_url', 'expanded_url', 'top_domain'])

    print("Updating links")
    expanded_urls_dict = dict()
    for ix, row in res_df.iterrows():
        old = row["short_url"]
        new = row["expanded_url"]
        if old == new:  # it wasn't expanded: let's try with urlexpander
            try:
                new_v2 = urlexpander.expand(old)
                if new_v2:
                    new = new_v2
            except Exception:
                pass
        expanded_urls_dict[old] = new
    pkl.dump(
        expanded_urls_dict,
        open(
            os.path.join(config["PATHS"]["INTERMEDIATE_DATA_DIR"],
                         "urls_expanded.pkl"), "wb"))