Example 1
def extract_webpage_info(url, content):
    # Extract info from webpage

    record = {}

    # Get domain
    parsed = urlparse(url)
    record['kraken:domain'] = parsed.netloc
    record['kraken:urlPath'] = parsed.path
    # Path components, minus the leading empty string before the first '/'
    record['kraken:urlPaths'] = parsed.path.split('/')[1:]

    # Extract info
    extracted = extraction.Extractor().extract(content, source_url=url)

    # Get base info
    record['@type'] = 'schema:webpage'
    record['@id'] = url
    record['schema:name'] = url
    record['schema:url'] = url
    record['schema:headline'] = extracted.title
    record['schema:text'] = extracted.description
    record['schema:primaryImageOfPage'] = extracted.image
    record['kraken:feeds'] = extracted.feed
    record['kraken:tentacle'] = '1001 - Extractor'

    return record
Example 2
def new():
    form = NewPostitForm()
    if form.validate_on_submit():
        try:
            html = requests.get(form.url.data).text
            extracted = extraction.Extractor().extract(
                html, source_url=form.url.data)

            postit = Postit(
                extracted.url,
                extracted.title,
                extracted.description,
                extracted.image,
                form.content.data,
                flask_login.current_user,
            )
            db.session.add(postit)
            db.session.commit()

            return flask.redirect(flask.url_for("index"))
        except ValueError:
            # Invalid URLs (requests raises ValueError subclasses such as
            # MissingSchema) fall through and the form is shown again
            pass

    return render_template("new.html",
                           title="Frigo | Nouveau post-it",
                           form=form)
Example 3
    def fetch_metadata_from_target(self):
        try:
            headers = {'User-Agent': str(ua_chrome)}
            r = requests.get(self.target_url, headers=headers)
            content = r.text
            extracted = extraction.Extractor().extract(content, source_url=self.target_url)
            changed = False
            if extracted.title:
                self.destination_title = extracted.title
                if not self.name:
                    self.name = extracted.title
                if not self.title:
                    self.title = extracted.title
                changed = True
            if extracted.description:
                self.destination_description = extracted.description
                if not self.description:
                    self.description = extracted.description
                changed = True
            if extracted.images:
                found_image = False
                now = timezone.now()
                for i in extracted.images:
                    try:
                        print(i)
                        image_request = requests.get(i)
                        source = Image.open(io.BytesIO(image_request.content))
                        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
                        thumb = ImageOps.fit(source, (400, 300), Image.LANCZOS, 0, (0.5, 0.5))
                        thumb_buffer = BytesIO()
                        # Save as JPEG to match the "-thumb.jpg" filename (PNG ignores quality)
                        thumb.convert("RGB").save(thumb_buffer, format="JPEG", quality=60)
                        thumb_buffer.seek(0)

                        self.thumbnail_image_source.save(
                            "%s%s-source.jpg" % (
                                self.hashid,
                                now
                            ),
                            ContentFile(image_request.content)
                        )
                        self.thumbnail_image.save(
                            "%s%s-thumb.jpg" % (
                                self.hashid,
                                now
                            ),
                            ContentFile(thumb_buffer.getvalue())
                        )
                        found_image = True
                        break
                    except Exception:
                        import traceback
                        traceback.print_exc()

                if found_image:
                    changed = True
            if changed:
                self.save()
        except Exception:
            # Network or parsing failures leave existing metadata untouched
            pass
Example 4
def get_extract(html, url):
    e = extraction.Extractor().extract(html, source_url=url)
    return {
        'title': e.title,
        'description': e.description,
        'image': e.image,
        'url': e.url if e.url else url
    }
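A minimal usage sketch for the helper above, assuming the requests package is available; the URL, timeout value, and variable names are illustrative assumptions rather than part of the original example:

import requests

page_url = "https://example.com/"  # hypothetical target page
html = requests.get(page_url, timeout=10).text
meta = get_extract(html, page_url)
print(meta['title'], meta['description'], meta['image'])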
Example 5
    def getMetadata(self):
        html = requests.get(self.url).text
        extracted = extraction.Extractor().extract(html, source_url=self.url)

        self.title = extracted.title

        if extracted.image:
            self.image = Image(self.title, extracted.image)
Example 6
    def post(self, request, *args, **kwargs):

        if request.is_ajax():

            context = dict()

            urlText = request.POST["url-search"]

            headers = {'User-Agent': 'Chrome/41.0.2228.0 Safari/537.36'}
            cookies = dict(cookies_are='working')
            session = requests.session()

            html = session.get(urlText, headers=headers, cookies=cookies).text
            extracted = extraction.Extractor().extract(html,
                                                       source_url=urlText)

            images = [img for img in extracted.images]

            parsed_uri = urlparse(urlText)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

            if "authors" in extracted._unexpected_values:
                context["author"] = extracted._unexpected_values["authors"][0]
            elif "author" in extracted._unexpected_values:
                context['author'] = extracted._unexpected_values["author"]

            filimage = getImages(html)

            # Build lists (not lazy filter objects) so they can be concatenated below
            image_groupA = [pic for pic in filimage
                            if not pic.startswith('data')]
            image_groupB = [pic for pic in extracted.images
                            if not pic.startswith('data')]

            # combine lists
            filteredImages = image_groupA + image_groupB

            #check duplicate
            cleanedImages = list(filteredImages)
            if extracted.image not in cleanedImages:
                cleanedImages.insert(0, extracted.image)
            else:
                cleanedImages.remove(extracted.image)
                cleanedImages.insert(1, extracted.image)

            context["images"] = cleanedImages
            context["imagesthumb"] = cleanedImages
            context["image"] = extracted.image
            context["title"] = extracted.title
            context["description"] = extracted.description
            context["domain"] = domain
            context["url"] = urlText

            html = render(request, self.template, context)
            return html
        else:
            return HttpResponse(status=400)
Example 7
    def fetch(self):
        html = requests.get(self.url).text
        if html:
            extracted = extraction.Extractor().extract(
                html, source_url=self.url)
            self.title = extracted.title
            self.description = extracted.description
            # Call now() so a datetime value is stored on the model
            self.updated_at = datetime.datetime.now()
            return True
        return False
Example 8
def extract_webpage_links(url, content):

    # Extract info
    extracted = extraction.Extractor().extract(content, source_url=url)

    records = []

    for link in extracted.urls:
        record = {}
        record['@type'] = 'schema:webpage'
        record['schema:url'] = link
        record['kraken:tentacle'] = '1002 - Extractor'
        records.append(record)

    return records
Example 9
    def get(self, id):

        # Append imageid to url to get image
        imageurl = url + str(id)

        # Getting html text from the url
        html = requests.get(imageurl).text

        # Using extractor to get the title and image size
        extracted = extraction.Extractor().extract(html, source_url=imageurl)

        # Setting the SSL context to make the get request
        gcontext = ssl.SSLContext()

        # Create request object
        response = urlopen(extracted.image, context=gcontext)

        # Getting the response code from the request
        responseCode = response.getcode()

        # Getting the image response in bytes
        responseBytes = response.read()

        # Getting an Image instance object from the extracted Image bytes data
        img = Image.open(BytesIO(responseBytes))

        # Slicing the image name from the image url
        # and assigning it to the img filename property
        img.filename = extracted.image[25:]

        print(extracted.titles)
        # Constructing the json response to be served
        res = jsonify({
            "message": "success",
            "data": {
                "title": extracted.titles[1],
                "filename": img.filename,
                "size": {
                    "bytes": str(len(img.fp.read()))
                },
                "dimensions": {
                    "width": img.size[0],
                    "height": img.size[1]
                }
            }
        })
        res.status_code = responseCode
        return res
Example 10
def extract_webpage_feeds(url, content):

    # Extract info
    extracted = extraction.Extractor().extract(content, source_url=url)

    records = []

    for feed in extracted.feeds:
        record = {}
        record['@type'] = 'schema:image'
        record['schema:url'] = feed
        record['kraken:tentacle'] = '1004 - Extractor'

        records.append(record)

    return records
Example 11
def get_title(article):
    if article.title in ['', '-', None]:
        # '':cbc, '-':townhall
        html = requests.get(article.url).text
        extracted_title = extraction.Extractor().extract(
            html, source_url=article.url).title

        if extracted_title in ['', '-', None]:
            if article.description == '':
                return article.pub
            else:
                return article.description

        else:
            return extracted_title

    else:
        return article.title
Example 12
    def setUp(self):
        self.extractor = extraction.Extractor()
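A minimal sketch of how a test might use this fixture, assuming the standard unittest module; the HTML string, the test name, and the expected title are illustrative assumptions, not part of the original example:

import unittest

import extraction


class ExtractorTest(unittest.TestCase):
    def setUp(self):
        self.extractor = extraction.Extractor()

    def test_title_from_head_tag(self):
        # Hypothetical fixture page; any HTML with a <title> element would do
        html = "<html><head><title>Hello</title></head><body></body></html>"
        extracted = self.extractor.extract(html, source_url="http://example.com/")
        self.assertEqual(extracted.title, "Hello")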
Example 13
def getURL2(proxyHost, requestType):
    hostNo = proxyHost.split(":")[0]
    portNo = proxyHost.split(":")[1]

    global statusCode
    try:
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }

        hostNo = proxyHost.split(":")[0]
        portNo = proxyHost.split(":")[1]

        session = requesocks.session()
        session.timeout = timeoutTime
        if urlType == "https":
            urlPosition = urlList[0]
            session.proxies = {'https': 'https://' + hostNo + ':' + portNo}
        if urlType == "http":
            urlPosition = urlList[1]
            session.proxies = {'http': 'http://' + hostNo + ':' + portNo}
        url = urlPosition[0]
        urlTitle = urlPosition[1]

        if requestType == "head":
            r = session.get(url)
            extracted = extraction.Extractor().extract(r.text, source_url=url)
            if urlTitle not in extracted.title:
                statusCode = "503"
            else:
                statusCode = "200"
        elif requestType == "get":
            r = session.head(url)
            statusCode = str(r.status_code)

        result2 = proxyHost + "\t" + urlType + "\t" + statusCode

        if statusCode != "200":
            if skipSocks == False:
                result1 = testSocks4(proxyHost, urlType)
                if "503" in str(result1):
                    result = testSocks5(proxyHost, urlType)
                    return result2 + "\n" + results1 + "\n" + result
                else:
                    return result1
        else:
            return proxyHost + "\t" + urlType + "\t" + statusCode

    except requests.exceptions.ConnectionError as e:
        return proxyHost + "\t" + urlType + "\t503"

    except Exception as e:
        result2 = proxyHost + "\t" + urlType + "\t503"
        result1 = testSocks4(proxyHost, urlType)
        if "503" in str(result1):
            if not skipSocks:
                result = testSocks5(proxyHost, urlType)
                return result2 + "\n" + result1 + "\n" + result
            else:
                if options.v and result1 is not None and not optionSilent:
                    print(result1)
                return result1
Example 14
def extract(url):
    html = requests.get(url).text
    extracted = extraction.Extractor().extract(html, source_url=url)
    print(extracted)
    return extracted
Example 15
    temp = line.split(',')
    url = temp[0].strip(' \n')
    tweet = ' '.join(temp[1:])
    if url.strip()[-3:] == 'pdf':
        continue
    try:
        html = requests.get(
            url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=6).text
    except:
        print("------------------PROBLEM---------------------")
        continue
    try:
        extracted = extraction.Extractor().extract(html, source_url=url)
    except:
        pass
    try:
        title = extracted.title
    except:
        title = '+++'
    try:
        desc = extracted.description
    except:
        desc = '+++'
    try:
        lastmod = str(urlopen(url).info().getdate('date'))
        lastmod = lastmod.replace(',', ':')
    except:
        lastmod = '+++'
Example 16
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///database.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['USER_ENABLE_EMAIL'] = True
app.secret_key = 'secretkeyisunique_1'

app.config['MAIL_SERVER'] = 'smtp.googlemail.com'
app.config['MAIL_PORT'] = 465
app.config['MAIL_USE_SSL'] = True
app.config['MAIL_USERNAME'] = '******'
# app.config['MAIL_PASSWORD'] = os.getenv('MAIL_PASSWORD')

db = SQLAlchemy(app)
mail = Mail(app)

ext = extraction.Extractor()

# -------------------------------------------

# Objects


class User(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(80), unique=True, nullable=False)
    email = db.Column(db.String(120), unique=True, nullable=False)
    passwordhash = db.Column(db.String(120), nullable=False)


# Categories of websites
class Category(db.Model):
Example 17
def update_report():
    #     logger.info(str('updated report task called.'))

    myclient = MongoClient()
    nexus = myclient["nexus"]
    tweets = nexus["tweets"]
    users = nexus["users"]
    reports = nexus["reports"]

    users = nexus["users"].find({})
    queries = []
    for user in users:
        print(user['sub'])
        queries.append({'$match': {"user.sub": user['sub']}})
    reps = nexus.reports.aggregate(queries)

    hashtags_list = []
    for r in reps:
        hashtags_list.append(r['hashtags'])

    unique_data = [
        list(x) for x in set(tuple(sorted(x)) for x in hashtags_list)
    ]

    print('unique hashtag list:', unique_data)

    for idx, hashtags in enumerate(unique_data):
        print('fetching report for:', unique_data[idx])
        queries = []
        for hashtag in hashtags:
            if hashtag[0] == '#':
                hashtag = hashtag[1:]
                queries.append({"entities.hashtags.text": hashtag})
        if len(queries) == 0:
            continue
        query = [{'$match': {'$or': queries}}]
        tweets = nexus.tweets.aggregate(query)

        tweet_list = []
        for tweet in tweets:
            tweet_list.append(tweet)

        tweets_df = pd.DataFrame(tweet_list)

        all_hashtags = []
        for e in tweets_df.entities:
            hashtags = [t['text'] for t in e['hashtags']]
            all_hashtags = all_hashtags + hashtags

        from collections import Counter
        c = Counter(all_hashtags)
        most_common_tuples = c.most_common()
        sorted_keys = sorted(c, key=c.get, reverse=True)

        hashtag_dict = {}
        for t in most_common_tuples:
            hashtag_dict[t[0]] = t[1]

        hashtag_wordclouds = []
        for t in most_common_tuples:
            hashtag_wordclouds.append({'text': t[0], 'value': t[1]})

        for t in most_common_tuples:
            tweets_df['#' + t[0]] = False

        for i, e in enumerate(tweets_df.entities):
            hashtags = [t['text'] for t in e['hashtags']]
            for hashtag in hashtags:
                tweets_df.at[i, '#' + hashtag] = True

        def apply_func(x):
            if not isinstance(x, float):
                if 'full_text' in x:
                    return x['full_text']
                else:
                    return float('nan')
            else:
                return float('nan')

        tweets_df['full_text'] = tweets_df['extended_tweet'].apply(
            lambda x: apply_func(x))
        tweets_df.full_text.fillna(tweets_df.text, inplace=True)

        import re
        from collections import Counter

        import extraction
        import favicon
        import requests

        url_list = []
        for text in tweets_df['text'].values.tolist():
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                text)
            url_list += urls

        url_list = list(filter(lambda url: len(url) > 13, url_list))

        c = Counter(url_list)
        most_common_urls = c.most_common()
        sorted_urls = sorted(c, key=c.get, reverse=True)

        news_articles = []
        twitter_domain_url = 'https://twitter.com'

        url_list = []
        for e in tweets_df.entities:
            for url_obj in e['urls']:
                if url_obj['expanded_url'][:19] != 'https://twitter.com':
                    url_list.append(url_obj['expanded_url'])

        c = Counter(url_list)
        most_common_urls = c.most_common()
        sorted_urls = sorted(c, key=c.get, reverse=True)

        for i, url in enumerate(sorted_urls):
            print('count value:', i)
            if i == 20:
                break
            try:
                news_article_dict = {}

                html = requests.get(url).text
                extracted = extraction.Extractor().extract(html,
                                                           source_url=url)
                icon_url = favicon.get(url)[0][0]

                if url[:19] != twitter_domain_url:
                    #                     print('title:', extracted.title)
                    #                     print('description:', extracted.description)
                    #                     print(extracted.image, url, icon_url)

                    news_article_dict['title'] = extracted.title
                    news_article_dict['description'] = extracted.description
                    news_article_dict['favicon'] = icon_url
                    news_article_dict['image'] = extracted.image
                    news_article_dict['url'] = url
                    news_article_dict['share_count'] = most_common_urls[i][1]

                    news_articles.append(news_article_dict)

            except Exception:
                # Skip links that fail to fetch or parse
                print(url)

        # Accessing Twitter from the App created in my account
        def autorize_twitter_api():
            """
            This function gets the consumer key, consumer secret key, access token
            and access token secret given by the app created in your Twitter account
            and authenticates them with Tweepy.
            """
            # Get access and costumer key and tokens
            auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)

            return auth

        api = tweepy.API(wait_on_rate_limit_notify=True,
                         wait_on_rate_limit=True,
                         auth_handler=autorize_twitter_api())

        retweeted_ids = []
        for rt in tweets_df['retweeted_status'].values:
            if not isinstance(rt, float):
                retweeted_ids.append(rt['id'])

        c = Counter(retweeted_ids)
        most_common_tuples = c.most_common()
        sorted_keys = sorted(c, key=c.get, reverse=True)
        most_common_tuples

        ids = []
        for t in most_common_tuples:
            ids.append(t[0])
        ids = ids[:20]

        statuses = api.statuses_lookup(ids)

        statuses_list = []
        for status in statuses:
            t_obj = {
                'text':
                status._json['text'],
                'id':
                status._json['id'],
                'tweet_link':
                'https://twitter.com/i/web/status/' + status._json['id_str'],
                'user_screen_name':
                status._json['user']['screen_name'],
                'json':
                status._json
            }

            t = (t_obj, status._json['retweet_count'])
            statuses_list.append(t)

        s = sorted(statuses_list, key=lambda x: x[1])
        #         s.reverse()

        viral_tweets = [x[0] for x in s]

        # Create a second dataframe to put important information
        tweets_final = pd.DataFrame(columns=[
            "created_at", "id", "in_reply_to_screen_name",
            "in_reply_to_status_id", "in_reply_to_user_id", "retweeted_id",
            "retweeted_screen_name", "user_mentions_screen_name",
            "user_mentions_id", "text", "user_id", "screen_name",
            "followers_count"
        ])

        # Columns that are going to be the same
        equal_columns = ["created_at", "id", "text"]
        tweets_final[equal_columns] = tweets_df[equal_columns]

        # Get the basic information about user
        def get_basics(tweets_final):
            print(tweets_df["user"])
            tweets_final["screen_name"] = tweets_df["user"].apply(
                lambda x: x["screen_name"])
            tweets_final["user_id"] = tweets_df["user"].apply(
                lambda x: x["id"])
            tweets_final["followers_count"] = tweets_df["user"].apply(
                lambda x: x["followers_count"])
            return tweets_final

        # Get the user mentions
        def get_usermentions(tweets_final):
            # Inside the 'entities' tag, find 'user_mentions' and take its 'screen_name' and 'id'
            tweets_final["user_mentions_screen_name"] = tweets_df[
                "entities"].apply(lambda x: x["user_mentions"][0][
                    "screen_name"] if x["user_mentions"] else np.nan)
            tweets_final["user_mentions_id"] = tweets_df["entities"].apply(
                lambda x: x["user_mentions"][0]["id_str"]
                if x["user_mentions"] else np.nan)
            return tweets_final

        # Get retweets
        def get_retweets(tweets_final):
            # Inside the 'retweeted_status' tag, find 'user' and take its 'screen_name' and 'id'
            tweets_final["retweeted_screen_name"] = tweets_df[
                "retweeted_status"].apply(lambda x: x["user"]["screen_name"]
                                          if x is not np.nan else np.nan)
            tweets_final["retweeted_id"] = tweets_df["retweeted_status"].apply(
                lambda x: x["user"]["id_str"] if x is not np.nan else np.nan)
            return tweets_final

        # Get the information about replies
        def get_in_reply(tweets_final):
            # Just copy the 'in_reply' columns to the new dataframe
            tweets_final["in_reply_to_screen_name"] = tweets_df[
                "in_reply_to_screen_name"]
            tweets_final["in_reply_to_status_id"] = tweets_df[
                "in_reply_to_status_id"]
            tweets_final["in_reply_to_user_id"] = tweets_df[
                "in_reply_to_user_id"]
            return tweets_final

        # Lastly fill the new dataframe with the important information
        def fill_df(tweets_final):
            get_basics(tweets_final)
            get_usermentions(tweets_final)
            get_retweets(tweets_final)
            get_in_reply(tweets_final)
            return tweets_final

        # Get the interactions between the different users
        def get_interactions(row):
            # From every row of the original dataframe
            # First we obtain the 'user_id' and 'screen_name'
            user = row["user_id"], row["screen_name"]
            # Be careful if there is no user id
            if user[0] is None:
                return (None, None), []

            # The interactions are going to be a set of tuples
            interactions = set()

            # Add all interactions
            # First, we add the interactions corresponding to replies adding the id and screen_name
            interactions.add(
                (row["in_reply_to_user_id"], row["in_reply_to_screen_name"]))
            # After that, we add the interactions with retweets
            interactions.add(
                (row["retweeted_id"], row["retweeted_screen_name"]))
            # And later, the interactions with user mentions
            interactions.add(
                (row["user_mentions_id"], row["user_mentions_screen_name"]))

            # Discard if user id is in interactions
            interactions.discard((row["user_id"], row["screen_name"]))
            # Discard all not existing values
            interactions.discard((None, None))
            # Return user and interactions
            return user, interactions

        tweets_final = fill_df(tweets_final)

        tweets_final = tweets_final.where((pd.notnull(tweets_final)), None)

        graph = nx.Graph()

        for index, tweet in tweets_final.iterrows():
            user, interactions = get_interactions(tweet)
            user_id, user_name = user
            tweet_id = int(tweet["id"])
            # tweet_sent = tweet["sentiment"]
            for interaction in interactions:
                int_id, int_name = interaction
                graph.add_edge(user_id, int_id, tweet_id=tweet_id)

                graph.node[user_id]["name"] = user_name
                graph.node[user_id]["text"] = tweet['text']
                graph.node[int_id]["name"] = int_name
                graph.node[int_id]["text"] = tweet['text']

        degrees = [val for (node, val) in graph.degree()]

        largest_subgraph = max(nx.connected_component_subgraphs(graph),
                               key=len)

        graph_centrality = nx.degree_centrality(largest_subgraph)

        max_de = max(graph_centrality.items(), key=itemgetter(1))

        graph_closeness = nx.closeness_centrality(largest_subgraph)

        max_clo = max(graph_closeness.items(), key=itemgetter(1))

        graph_betweenness = nx.betweenness_centrality(largest_subgraph,
                                                      normalized=True,
                                                      endpoints=False)

        max_bet = max(graph_betweenness.items(), key=itemgetter(1))

        all_bet = sorted(graph_betweenness.items(), key=itemgetter(1))

        all_bet.reverse()

        all_de = sorted(graph_centrality.items(), key=itemgetter(1))
        all_de.reverse()

        ids = []
        for de in all_de:
            #     print(graph.node[de[0]])
            ids.append(de[0])
        #     print(de[0])
        #     print(graph.node[de[0]]['name'])

        ids = ids[:10]

        users = api.lookup_users(user_ids=ids)

        user_list = []
        for user in users:
            user_list.append(user._json)

        ##############################################
        influencers = user_list
        ##############################################

        #         print(hashtag_wordclouds)
        #         print(news_articles)
        #         print(viral_tweets)
        #         print(influencers)

        hashtags = unique_data[idx]

        hashtags_len = len(hashtags)
        #         print('hashtags:', hashtags)
        #         hashtags = ['#'+hashtag for hashtag in hashtags]

        rprts = nexus.reports.find(
            {"hashtags": {
                "$size": hashtags_len,
                "$all": hashtags
            }})

        for r in rprts:
            report_id = r['id']
            print('report_id:', r['id'])
            print('report:', r)

            r['hashtag_wordclouds'] = hashtag_wordclouds
            r['news_articles'] = news_articles
            r['viral_tweets'] = viral_tweets
            r['influencers'] = influencers

            print('updating reports...')
            nexus.reports.update({'id': report_id}, r, upsert=True)
Example 18
def insert_notebook(url, screenshot=True, nb=None):
    """
    Returns
    -------
    dict {'success': True/False}
    """
    # TODO: do ajax-based async
    from web.models import Notebook

    # sanitize url
    url = url.replace('https', 'http')

    is_nbviewer = False
    try:
        url = unshorten_url(url)
        r = requests.get(url)
        if 'text/html' in r.headers['content-type']:
            # check that it's a notebook
            # Read the response body for the substring check below
            tmp_html = urlopen(url).read()
            is_nbviewer = ("Notebook on nbviewer" in tmp_html)
        if is_nbviewer:
            html_url = url
        else:
            html_url = urlparse.urljoin('http://nbviewer.ipython.org',
                                        transform_ipynb_uri(url))
        print('Downloading %s' % html_url)
        html = urlopen(html_url).read()
    except (urllib2.HTTPError, urllib2.URLError, socket.timeout, ssl.SSLError,
            requests.exceptions.SSLError,
            requests.sessions.InvalidSchema) as e:
        if nb is not None:
            nb.failures_access += 1
        print('Failed in downloading', e)
        return {'status': 'failure', 'reason': 'Failed accessing the notebook'}

    extracted = extraction.Extractor().extract(html, source_url=html_url)
    if len(extracted.titles) > 1:
        title = extracted.titles[1]
    else:
        title = extracted.descriptions[1]
    words_title = title.split(' ')
    if len(words_title) > 20:
        title = ' '.join(words_title[:20]) + ' ...'
    if len(extracted.descriptions) > 1:
        description = extracted.descriptions[1]
    else:
        description = ''
    words_description = description.split(' ')
    if len(words_description) > 40:
        description = ' '.join(words_description[:40]) + ' ...'

    # some more sanitation
    if title.startswith('This web site does not host'):
        # this is the nbviewer default title
        title = 'No title'
    title = title.strip(u'¶')

    #similar = Notebook.objects.filter(title=title, description=description)
    #if len(Notebook.objects.filter(title=title, description=description)) > 0:
    #return {'status': 'failure', 'reason': 'duplicate document', 'pk': similar[0].pk}

    if nb is None:
        obj, created = Notebook.objects.get_or_create(url=url)
    else:
        obj = nb
        created = False
    # screenshot
    if screenshot:
        out = make_screenshots(html_url, obj.pk)
        if out['status'] == 'failure':
            if created:
                obj.delete()
            else:
                obj.failures_access += 1
            return out
        else:
            obj.thumb_img = out['thumb']

    # XXX remove assert with error messages
    assert len(title) < 500
    obj.title = title
    assert len(description) < 2000
    obj.description = description
    assert len(html_url) < 1000
    obj.html_url = html_url
    assert len(url) < 1000
    obj.url = url
    obj.full_html = html

    obj.last_accessed_date = datetime.now().date()
    obj.save()
    return {'status': 'success', 'pk': obj.pk, 'created': created}
Example 19
    with open(fname, 'r') as file:
        for line in file:
            urls.append(line[:-1])
    print(f"Exracted {len(urls)} URLs from '{fname}'")

    savefile = False
    if outname is not None:
        file = open(outname, 'w')
        savefile = True

    for i, url in enumerate(urls):
        try:
            print(f"Fetching {i+1} of {len(urls)}: {url} ...")
            r = requests.get(url)
            html_data = r.text
            extracted = extraction.Extractor().extract(html_data)

            title, desc, link = [extracted.title,
                                 extracted.description, extracted.url]
            title = title + "\n" if title is not None else ""
            desc = desc + "\n" if desc is not None else ""
            link = link + f"\n[{url}]\n" if link is not None else f"[{url}]\n"

            print(title)
            if savefile:
                file.write(title + desc + link + "\n")
            else:
                print(desc)
                print(link)
                print("\n")
        except RequestException as e:
Example 20
def getURL1(proxyHost, requestType, urlType):
    global timeoutTime
    #print "Testing socks proxy: http://"+proxyHost
    import socks
    import socket
    import urllib2
    hostNo = proxyHost.split(":")[0]
    portNo = proxyHost.split(":")[1]

    #urlList = []
    #urlList.append(["https://www.tracemyip.org/","Trace My IP"])
    #urlList.append(["https://www.wikipedia.org/","Wikipedia"])
    #urlList.append(["http://whatismyipaddress.com/","What Is My IP Address?"])

    global statusCode
    try:
        #print "Testing http proxy: http://"+proxyHost
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }

        hostNo = proxyHost.split(":")[0]
        portNo = proxyHost.split(":")[1]

        session = requesocks.session()
        session.timeout = timeoutTime
        if urlType == "https":
            urlPosition = urlList[0]
            session.proxies = {'https': 'https://' + hostNo + ':' + portNo}
        if urlType == "http":
            urlPosition = urlList[1]
            session.proxies = {'http': 'http://' + hostNo + ':' + portNo}
        url = urlPosition[0]
        urlTitle = urlPosition[1]

        if requestType == "get":
            r = session.get(url)
            #try:
            extracted = extraction.Extractor().extract(r.text, source_url=url)
            if urlTitle not in extracted.title:
                return proxyHost + "\t" + urlType + "\t503"
        elif requestType == "head":
            r = session.head(url)
        statusCode = str(r.status_code)

        result2 = proxyHost + "\t" + urlType + "\t" + statusCode

        if statusCode != "200":
            result1 = testSocks4(proxyHost, urlType)
            if "503" in str(result1):
                #if options.v:
                #	print result
                result = testSocks5(proxyHost, urlType)
                return result2 + "\n" + results1 + "\n" + result
            else:
                return result1
        else:
            return proxyHost + "\t" + urlType + "\t" + statusCode

    except requests.exceptions.ConnectionError as e:
        return proxyHost + "\t" + urlType + "\t503"

    except Exception as e:
        result2 = proxyHost + "\t" + urlType + "\t503"

        #if options.v:
        #	print proxyHost+"\t"+urlType+"\t503"
        result1 = testSocks4(proxyHost, urlType)
        if "503" in str(result1):
            #if options.v:
            #	print result
            result = testSocks5(proxyHost, urlType)
            #if options.v:
            #	print result
            return result2 + "\n" + result1 + "\n" + result
        else:
            if options.v:
                print(result1)
            return result1
Example 21
def extractd(url):
    print(type(url))
    html = requests.get(url['url']).text
    extracted = extraction.Extractor().extract(html, source_url=url['url'])
    return extracted
    def post(self, site):
        linkID = request.args.get('id', None)

        title = ''
        description = ''

        if 'title' in request.form:
            title = request.form['title']
        if 'description' in request.form:
            description = request.form['description']
        url = request.form['url']
        print('here')
        public = False
        if request.form.get('public') is not None:
            public = True

        session = Session()

        link = session.query(Link).filter_by(site=site).filter_by(
            id=linkID).first()
        if link is None:
            link = Link(site=site)
        else:
            link.title = title
            link.description = description

        if link.url != url:
            # Do content generation
            link.url = url

            # Fetch website html content
            urlsession = requests.Session()
            retry = Retry(connect=3, backoff_factor=2.0)
            adapter = HTTPAdapter(max_retries=retry)
            urlsession.mount('http://', adapter)
            urlsession.mount('https://', adapter)
            html = urlsession.get(link.url).text

            # Extract title, description, and preview image from meta-tags
            extracted = extraction.Extractor().extract(html, source_url=url)

            title = extracted.title or ''
            descrip = extracted.description or ''
            imgurl = extracted.image or ''

            # Fetch image data data and resize for s3
            # .content is bytes; bail out on an empty body
            img_data = requests.get(imgurl).content
            if not img_data:
                return url
            img = Image.open(io.BytesIO(img_data))
            width, height = img.size

            filename = secure_filename(alphaNumericID())

            imgSize = (1000, 1000)
            originalImage = resizeIOImage(img_data, (width, height))
            resizedImage = resizeIOImage(img_data, imgSize)

            # Upload to s3
            uploadImage(originalImage, "%soriginal" % filename)
            uploadImage(resizedImage, filename)

            original_url = "{}{}original.jpeg".format(S3_LOCATION, filename)
            large_url = "{}{}.jpeg".format(S3_LOCATION, filename)

            # Save website content
            link.title = title
            link.description = descrip
            link.source_url = original_url
            link.large_url = large_url

        link.public = public

        session.add(link)
        session.commit()
        linkID = link.id
        session.close()

        track_activity('Updated news link', linkID, 'link', site)

        return redirect(url_for('Links_view', id=linkID, site=site))