def extract_webpage_info(url, content):
    # Extract info from webpage
    record = {}

    # Get domain
    parsed = urlparse(url)
    record['kraken:domain'] = parsed.netloc
    record['kraken:urlPath'] = parsed.path
    record['kraken:urlPaths'] = []
    try:
        record['kraken:urlPaths'] = parsed.path.split('/')[1:]
    except Exception:
        pass

    # Extract info
    extracted = extraction.Extractor().extract(content, source_url=url)

    # Get base info
    record['@type'] = 'schema:webpage'
    record['@id'] = url
    record['schema:name'] = url
    record['schema:url'] = url
    record['schema:headline'] = extracted.title
    record['schema:text'] = extracted.description
    record['schema:primaryImageOfPage'] = extracted.image
    record['kraken:feeds'] = extracted.feed
    record['kraken:tentacle'] = '1001 - Extractor'

    return record
def new():
    form = NewPostitForm()
    if form.validate_on_submit():
        try:
            html = requests.get(form.url.data).text
            extracted = extraction.Extractor().extract(
                html, source_url=form.url.data)
            postit = Postit(
                extracted.url,
                extracted.title,
                extracted.description,
                extracted.image,
                form.content.data,
                flask_login.current_user,
            )
            db.session.add(postit)
            db.session.commit()
            return flask.redirect(flask.url_for("index"))
        except ValueError:
            pass
    return render_template("new.html", title="Frigo | Nouveau post-it", form=form)
def fetch_metadata_from_target(self):
    try:
        headers = {'User-Agent': str(ua_chrome)}
        r = requests.get(self.target_url, headers=headers)
        content = r.text
        extracted = extraction.Extractor().extract(content, source_url=self.target_url)
        changed = False
        if extracted.title:
            self.destination_title = extracted.title
            if not self.name:
                self.name = extracted.title
            if not self.title:
                self.title = extracted.title
            changed = True
        if extracted.description:
            self.destination_description = extracted.description
            if not self.description:
                self.description = extracted.description
            changed = True
        if extracted.images:
            found_image = False
            now = timezone.now()
            for i in extracted.images:
                try:
                    print(i)
                    image_request = requests.get(i)
                    source = Image.open(io.BytesIO(image_request.content))
                    thumb = ImageOps.fit(source, (400, 300), Image.ANTIALIAS, 0, (0.5, 0.5))
                    thumb_buffer = BytesIO()
                    thumb.save(thumb_buffer, format="PNG", quality=60)
                    thumb_buffer.seek(0)
                    self.thumbnail_image_source.save(
                        "%s%s-source.jpg" % (self.hashid, now),
                        ContentFile(image_request.content)
                    )
                    self.thumbnail_image.save(
                        "%s%s-thumb.jpg" % (self.hashid, now),
                        ContentFile(thumb_buffer.getvalue())
                    )
                    found_image = True
                    break
                except:
                    import traceback
                    traceback.print_exc()
            if found_image:
                changed = True
        if changed:
            self.save()
    except:
        pass
def get_extract(html, url):
    e = extraction.Extractor().extract(html, source_url=url)
    return {
        'title': e.title,
        'description': e.description,
        'image': e.image,
        'url': e.url if e.url else url
    }
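A minimal usage sketch for the get_extract helper above; the URL and the requests call are illustrative assumptions, not part of the original snippet.

# Hedged example: fetch a page and pass its HTML to get_extract() defined above.
# The URL is a placeholder assumption.
import requests

page_url = "https://example.com/"
html = requests.get(page_url, timeout=10).text
meta = get_extract(html, page_url)
print(meta['title'], meta['description'], meta['image'], meta['url'])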
def getMetadata(self):
    html = requests.get(self.url).text
    extracted = extraction.Extractor().extract(html, source_url=self.url)
    self.title = extracted.title
    if extracted.image:
        self.image = Image(self.title, extracted.image)
def post(self, request, *args, **kwargs):
    if request.is_ajax():
        context = dict()
        urlText = request.POST["url-search"]
        headers = {'User-Agent': 'Chrome/41.0.2228.0 Safari/537.36'}
        cookies = dict(cookies_are='working')
        session = requests.session()
        html = session.get(urlText, headers=headers, cookies=cookies).text
        extracted = extraction.Extractor().extract(html, source_url=urlText)
        images = [img for img in extracted.images]
        parsed_uri = urlparse(urlText)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        if "authors" in extracted._unexpected_values:
            context["author"] = extracted._unexpected_values["authors"][0]
        elif "author" in extracted._unexpected_values:
            context['author'] = extracted._unexpected_values["author"]
        filimage = getImages(html)
        # use list comprehensions so the two groups can be concatenated
        image_groupA = [pic for pic in filimage if not pic.startswith('data')]
        image_groupB = [pic for pic in extracted.images if not pic.startswith('data')]
        # combine lists
        filteredImages = image_groupA + image_groupB
        # check duplicate
        cleanedImages = list(filteredImages)
        if extracted.image not in cleanedImages:
            cleanedImages.insert(0, extracted.image)
        else:
            cleanedImages.remove(extracted.image)
            cleanedImages.insert(1, extracted.image)
        context["images"] = cleanedImages
        context["imagesthumb"] = cleanedImages
        context["image"] = extracted.image
        context["title"] = extracted.title
        context["description"] = extracted.description
        context["domain"] = domain
        context["url"] = urlText
        html = render(request, self.template, context)
        return html
    else:
        return HttpResponse(status=400)
def fetch(self):
    html = requests.get(self.url).text
    if html:
        extracted = extraction.Extractor().extract(html, source_url=self.url)
        self.title = extracted.title
        self.description = extracted.description
        self.updated_at = datetime.datetime.now()
        return True
    return False
def extract_webpage_links(url, content):
    # Extract info
    extracted = extraction.Extractor().extract(content, source_url=url)

    records = []
    for link in extracted.urls:
        record = {}
        record['@type'] = 'schema:webpage'
        record['schema:url'] = link
        record['kraken:tentacle'] = '1002 - Extractor'
        records.append(record)

    return records
def get(self, id):
    # Append imageid to url to get image
    imageurl = url + str(id)
    # Getting html text from the url
    html = requests.get(imageurl).text
    # Using extractor to get the title and image size
    extracted = extraction.Extractor().extract(html, source_url=imageurl)
    # Setting the ssl context to make the get request
    gcontext = ssl.SSLContext()
    # Create request object
    response = urlopen(extracted.image, context=gcontext)
    # Getting the response code from the request
    responseCode = response.getcode()
    # Getting the image response in bytes
    responseBytes = response.read()
    # Getting an Image instance object from the extracted image bytes data
    img = Image.open(BytesIO(responseBytes))
    # Slicing the image name from the image url
    # and assigning it to the img filename property
    img.filename = extracted.image[25:]
    print(extracted.titles)
    # Constructing the json response to be served
    res = jsonify({
        "message": "success",
        "data": {
            "title": extracted.titles[1],
            "filename": img.filename,
            "size": {
                "bytes": str(len(img.fp.read()))
            },
            "dimensions": {
                "width": img.size[0],
                "height": img.size[1]
            }
        }
    })
    res.status_code = responseCode
    return res
def extract_webpage_feeds(url, content):
    # Extract info
    extracted = extraction.Extractor().extract(content, source_url=url)

    records = []
    for feed in extracted.feeds:
        record = {}
        record['@type'] = 'schema:image'
        record['schema:url'] = feed
        record['kraken:tentacle'] = '1004 - Extractor'
        records.append(record)

    return records
def get_title(article):
    if article.title in ['', '-', None]:  # '': cbc, '-': townhall
        html = requests.get(article.url).text
        extracted_title = extraction.Extractor().extract(
            html, source_url=article.url).title
        if extracted_title in ['', '-', None]:
            if article.description == '':
                return article.pub
            else:
                return article.description
        else:
            return extracted_title
    else:
        return article.title
def setUp(self):
    self.extractor = extraction.Extractor()
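A hedged sketch of a test method that could pair with the setUp above; the sample HTML, the source URL, and the expected title are illustrative assumptions, not taken from the original suite.

# Hedged example test to pair with setUp(); the HTML sample, URL, and expected
# title below are assumptions for illustration only.
def test_extracts_title_from_title_tag(self):
    html = "<html><head><title>Example Title</title></head><body></body></html>"
    extracted = self.extractor.extract(html, source_url="http://example.com/")
    self.assertEqual(extracted.title, "Example Title")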
def getURL2(proxyHost, requestType):
    hostNo = proxyHost.split(":")[0]
    portNo = proxyHost.split(":")[1]
    global statusCode
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        hostNo = proxyHost.split(":")[0]
        portNo = proxyHost.split(":")[1]
        session = requesocks.session()
        session.timeout = timeoutTime
        if urlType == "https":
            urlPosition = urlList[0]
            session.proxies = {'https': 'https://' + hostNo + ':' + portNo}
        if urlType == "http":
            urlPosition = urlList[1]
            session.proxies = {'http': 'http://' + hostNo + ':' + portNo}
        url = urlPosition[0]
        urlTitle = urlPosition[1]
        if requestType == "head":
            r = session.get(url)
            extracted = extraction.Extractor().extract(r.text, source_url=url)
            if urlTitle not in extracted.title:
                statusCode = "503"
            else:
                statusCode = "200"
        elif requestType == "get":
            r = session.head(url)
            statusCode = str(r.status_code)
        result2 = proxyHost + "\t" + urlType + "\t" + statusCode
        if statusCode != "200":
            if skipSocks == False:
                result1 = testSocks4(proxyHost, urlType)
                if "503" in str(result1):
                    result = testSocks5(proxyHost, urlType)
                    return result2 + "\n" + result1 + "\n" + result
                else:
                    return result1
        else:
            return proxyHost + "\t" + urlType + "\t" + statusCode
    except requests.exceptions.ConnectionError as e:
        return proxyHost + "\t" + urlType + "\t503"
    except Exception as e:
        result2 = proxyHost + "\t" + urlType + "\t503"
        result1 = testSocks4(proxyHost, urlType)
        if "503" in str(result1):
            if skipSocks == False:
                result = testSocks5(proxyHost, urlType)
                return result2 + "\n" + result1 + "\n" + result
        else:
            if options.v:
                if result1 != None:
                    if optionSilent == False:
                        print(result1)
            return result1
def extract(url):
    html = requests.get(url).text
    extracted = extraction.Extractor().extract(html, source_url=url)
    print(extracted)
    return extracted
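A hedged sketch of how a caller might consume the Extracted object returned above; the URL is a placeholder assumption. The library exposes both singular properties (best candidate or None) and list-valued ones (all candidates).

# Hedged example of reading the Extracted object returned by extract();
# the URL is a placeholder assumption.
result = extract("https://example.com/")
print(result.title)        # best title candidate, or None
print(result.description)  # best description candidate, or None
print(result.image)        # best image URL, or None
print(result.titles)       # all title candidates, ordered by preference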
temp = line.split(',')
url = temp[0].strip(' \n')
tweet = ' '.join(temp[1:])
if url.strip()[-3:] == 'pdf':
    continue
try:
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=6).text
except:
    print("------------------PROBLEM---------------------")
    continue
try:
    extracted = extraction.Extractor().extract(html, source_url=url)
except:
    pass
try:
    title = extracted.title
except:
    title = '+++'
try:
    desc = extracted.description
except:
    desc = '+++'
try:
    lastmod = str(urlopen(url).info().getdate('date'))
    lastmod = lastmod.replace(',', ':')
except:
    lastmod = '+++'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///database.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['USER_ENABLE_EMAIL'] = True
app.secret_key = 'secretkeyisunique_1'
app.config['MAIL_SERVER'] = 'smtp.googlemail.com'
app.config['MAIL_PORT'] = 465
app.config['MAIL_USE_SSL'] = True
app.config['MAIL_USERNAME'] = '******'
# app.config['MAIL_PASSWORD'] = os.getenv('MAIL_PASSWORD')
db = SQLAlchemy(app)
mail = Mail(app)
ext = extraction.Extractor()

# -------------------------------------------
# Objects
class User(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(80), unique=True, nullable=False)
    email = db.Column(db.String(120), unique=True, nullable=False)
    passwordhash = db.Column(db.String(120), nullable=False)

# Categories of websites
class Category(db.Model):
def update_report():
    # logger.info(str('updated report task called.'))
    myclient = MongoClient()
    nexus = myclient["nexus"]
    tweets = nexus["tweets"]
    users = nexus["users"]
    reports = nexus["reports"]
    users = nexus["users"].find({})
    queries = []
    for user in users:
        print(user['sub'])
        queries.append({'$match': {"user.sub": user['sub']}})
    reps = nexus.reports.aggregate(queries)
    hashtags_list = []
    for r in reps:
        hashtags_list.append(r['hashtags'])
    unique_data = [
        list(x) for x in set(tuple(sorted(x)) for x in hashtags_list)
    ]
    print('unique hashtag list:', unique_data)
    for idx, hashtags in enumerate(unique_data):
        print('fetching report for:', unique_data[idx])
        queries = []
        for hashtag in hashtags:
            if hashtag[0] == '#':
                hashtag = hashtag[1:]
            queries.append({"entities.hashtags.text": hashtag})
        if len(queries) == 0:
            continue
        query = [{'$match': {'$or': queries}}]
        tweets = nexus.tweets.aggregate(query)
        tweet_list = []
        for tweet in tweets:
            tweet_list.append(tweet)
        tweets_df = pd.DataFrame(tweet_list)

        all_hashtags = []
        for e in tweets_df.entities:
            hashtags = [t['text'] for t in e['hashtags']]
            all_hashtags = all_hashtags + hashtags
        from collections import Counter
        c = Counter(all_hashtags)
        most_common_tuples = c.most_common()
        sorted_keys = sorted(c, key=c.get, reverse=True)
        hashtag_dict = {}
        for t in most_common_tuples:
            hashtag_dict[t[0]] = t[1]
        hashtag_wordclouds = []
        for t in most_common_tuples:
            hashtag_wordclouds.append({'text': t[0], 'value': t[1]})
        for t in most_common_tuples:
            tweets_df['#' + t[0]] = False
        for i, e in enumerate(tweets_df.entities):
            hashtags = [t['text'] for t in e['hashtags']]
            for hashtag in hashtags:
                tweets_df.at[i, '#' + hashtag] = True

        def apply_func(x):
            if not isinstance(x, float):
                if 'full_text' in x:
                    return x['full_text']
                else:
                    return float('nan')
            else:
                return float('nan')

        tweets_df['full_text'] = tweets_df['extended_tweet'].apply(
            lambda x: apply_func(x))
        tweets_df.full_text.fillna(tweets_df.text, inplace=True)

        import re
        from collections import Counter
        import requests
        import extraction
        import favicon

        url_list = []
        for text in tweets_df['text'].values.tolist():
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                text)
            url_list += urls
        url_list = list(filter(lambda url: len(url) > 13, url_list))
        c = Counter(url_list)
        most_common_urls = c.most_common()
        sorted_urls = sorted(c, key=c.get, reverse=True)

        news_articles = []
        twitter_domain_url = 'https://twitter.com'
        url_list = []
        for e in tweets_df.entities:
            for url_obj in e['urls']:
                if url_obj['expanded_url'][:19] != 'https://twitter.com':
                    url_list.append(url_obj['expanded_url'])
        c = Counter(url_list)
        most_common_urls = c.most_common()
        sorted_urls = sorted(c, key=c.get, reverse=True)
        for i, url in enumerate(sorted_urls):
            print('count value:', i)
            if i == 20:
                break
            try:
                news_article_dict = {}
                html = requests.get(url).text
                extracted = extraction.Extractor().extract(html, source_url=url)
                icon_url = favicon.get(url)[0][0]
                if url[:19] != twitter_domain_url:
                    # print('title:', extracted.title)
                    # print('description:', extracted.description)
                    # print(extracted.image, url, icon_url)
                    news_article_dict['title'] = extracted.title
                    news_article_dict['description'] = extracted.description
                    news_article_dict['favicon'] = icon_url
                    news_article_dict['image'] = extracted.image
                    news_article_dict['url'] = url
                    news_article_dict['share_count'] = most_common_urls[i][1]
                    news_articles.append(news_article_dict)
            except:
                print(url)

        # Accessing twitter from the App created in my account
        def autorize_twitter_api():
            """
            This function gets the consumer key, consumer secret key, access token
            and access token secret given by the app created in your Twitter account
            and authenticates them with Tweepy.
            """
            # Get access and consumer key and tokens
            auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)
            return auth

        api = tweepy.API(wait_on_rate_limit_notify=True,
                         wait_on_rate_limit=True,
                         auth_handler=autorize_twitter_api())

        retweeted_ids = []
        for rt in tweets_df['retweeted_status'].values:
            if not isinstance(rt, float):
                retweeted_ids.append(rt['id'])
        c = Counter(retweeted_ids)
        most_common_tuples = c.most_common()
        sorted_keys = sorted(c, key=c.get, reverse=True)
        most_common_tuples
        ids = []
        for t in most_common_tuples:
            ids.append(t[0])
        ids = ids[:20]
        statuses = api.statuses_lookup(ids)
        statuses_list = []
        for status in statuses:
            t_obj = {
                'text': status._json['text'],
                'id': status._json['id'],
                'tweet_link': 'https://twitter.com/i/web/status/' + status._json['id_str'],
                'user_screen_name': status._json['user']['screen_name'],
                'json': status._json
            }
            t = (t_obj, status._json['retweet_count'])
            statuses_list.append(t)
        s = sorted(statuses_list, key=lambda x: x[1])
        # s.reverse()
        viral_tweets = [x[0] for x in s]

        # Create a second dataframe to put important information
        tweets_final = pd.DataFrame(columns=[
            "created_at", "id", "in_reply_to_screen_name", "in_reply_to_status_id",
            "in_reply_to_user_id", "retweeted_id", "retweeted_screen_name",
            "user_mentions_screen_name", "user_mentions_id", "text", "user_id",
            "screen_name", "followers_count"
        ])
        # Columns that are going to be the same
        equal_columns = ["created_at", "id", "text"]
        tweets_final[equal_columns] = tweets_df[equal_columns]

        # Get the basic information about user
        def get_basics(tweets_final):
            print(tweets_df["user"])
            tweets_final["screen_name"] = tweets_df["user"].apply(
                lambda x: x["screen_name"])
            tweets_final["user_id"] = tweets_df["user"].apply(
                lambda x: x["id"])
            tweets_final["followers_count"] = tweets_df["user"].apply(
                lambda x: x["followers_count"])
            return tweets_final

        # Get the user mentions
        def get_usermentions(tweets_final):
            # Inside the tag 'entities' will find 'user mentions' and will get 'screen name' and 'id'
            tweets_final["user_mentions_screen_name"] = tweets_df["entities"].apply(
                lambda x: x["user_mentions"][0]["screen_name"] if x["user_mentions"] else np.nan)
            tweets_final["user_mentions_id"] = tweets_df["entities"].apply(
                lambda x: x["user_mentions"][0]["id_str"] if x["user_mentions"] else np.nan)
            return tweets_final

        # Get retweets
        def get_retweets(tweets_final):
            # Inside the tag 'retweeted_status' will find 'user' and will get 'screen name' and 'id'
            tweets_final["retweeted_screen_name"] = tweets_df["retweeted_status"].apply(
                lambda x: x["user"]["screen_name"] if x is not np.nan else np.nan)
            tweets_final["retweeted_id"] = tweets_df["retweeted_status"].apply(
                lambda x: x["user"]["id_str"] if x is not np.nan else np.nan)
            return tweets_final

        # Get the information about replies
        def get_in_reply(tweets_final):
            # Just copy the 'in_reply' columns to the new dataframe
            tweets_final["in_reply_to_screen_name"] = tweets_df["in_reply_to_screen_name"]
            tweets_final["in_reply_to_status_id"] = tweets_df["in_reply_to_status_id"]
            tweets_final["in_reply_to_user_id"] = tweets_df["in_reply_to_user_id"]
            return tweets_final

        # Lastly fill the new dataframe with the important information
        def fill_df(tweets_final):
            get_basics(tweets_final)
            get_usermentions(tweets_final)
            get_retweets(tweets_final)
            get_in_reply(tweets_final)
            return tweets_final

        # Get the interactions between the different users
        def get_interactions(row):
            # From every row of the original dataframe
            # First we obtain the 'user_id' and 'screen_name'
            user = row["user_id"], row["screen_name"]
            # Be careful if there is no user id
            if user[0] is None:
                return (None, None), []
            # The interactions are going to be a set of tuples
            interactions = set()
            # Add all interactions
            # First, we add the interactions corresponding to replies adding the id and screen_name
            interactions.add(
                (row["in_reply_to_user_id"], row["in_reply_to_screen_name"]))
            # After that, we add the interactions with retweets
            interactions.add(
                (row["retweeted_id"], row["retweeted_screen_name"]))
            # And later, the interactions with user mentions
            interactions.add(
                (row["user_mentions_id"], row["user_mentions_screen_name"]))
            # Discard if user id is in interactions
            interactions.discard((row["user_id"], row["screen_name"]))
            # Discard all not existing values
            interactions.discard((None, None))
            # Return user and interactions
            return user, interactions

        tweets_final = fill_df(tweets_final)
        tweets_final = tweets_final.where((pd.notnull(tweets_final)), None)

        graph = nx.Graph()
        for index, tweet in tweets_final.iterrows():
            user, interactions = get_interactions(tweet)
            user_id, user_name = user
            tweet_id = int(tweet["id"])
            # tweet_sent = tweet["sentiment"]
            for interaction in interactions:
                int_id, int_name = interaction
                graph.add_edge(user_id, int_id, tweet_id=tweet_id)
                graph.node[user_id]["name"] = user_name
                graph.node[user_id]["text"] = tweet['text']
                graph.node[int_id]["name"] = int_name
                graph.node[int_id]["text"] = tweet['text']

        degrees = [val for (node, val) in graph.degree()]
        largest_subgraph = max(nx.connected_component_subgraphs(graph), key=len)
        graph_centrality = nx.degree_centrality(largest_subgraph)
        max_de = max(graph_centrality.items(), key=itemgetter(1))
        graph_closeness = nx.closeness_centrality(largest_subgraph)
        max_clo = max(graph_closeness.items(), key=itemgetter(1))
        graph_betweenness = nx.betweenness_centrality(largest_subgraph,
                                                      normalized=True,
                                                      endpoints=False)
        max_bet = max(graph_betweenness.items(), key=itemgetter(1))
        all_bet = sorted(graph_betweenness.items(), key=itemgetter(1))
        all_bet.reverse()
        all_de = sorted(graph_centrality.items(), key=itemgetter(1))
        all_de.reverse()
        ids = []
        for de in all_de:
            # print(graph.node[de[0]])
            ids.append(de[0])
            # print(de[0])
            # print(graph.node[de[0]]['name'])
        ids = ids[:10]
        users = api.lookup_users(user_ids=ids)
        user_list = []
        for user in users:
            user_list.append(user._json)

        ##############################################
        influencers = user_list
        ##############################################

        # print(hashtag_wordclouds)
        # print(news_articles)
        # print(viral_tweets)
        # print(influencers)

        hashtags = unique_data[idx]
        hashtags_len = len(hashtags)
        # print('hashtags:', hashtags)
        # hashtags = ['#'+hashtag for hashtag in hashtags]
        rprts = nexus.reports.find(
            {"hashtags": {
                "$size": hashtags_len,
                "$all": hashtags
            }})
        for r in rprts:
            report_id = r['id']
            print('report_id:', r['id'])
            print('report:', r)
            r['hashtag_wordclouds'] = hashtag_wordclouds
            r['news_articles'] = news_articles
            r['viral_tweets'] = viral_tweets
            r['influencers'] = influencers
            print('updating reports...')
            nexus.reports.update({'id': report_id}, r, upsert=True)
def insert_notebook(url, screenshot=True, nb=None):
    """
    Returns
    -------
    dict
        {'success': True/False}
    """
    # TODO: do ajax-based async
    from web.models import Notebook

    # sanitize url
    url = url.replace('https', 'http')
    is_nbviewer = False
    try:
        url = unshorten_url(url)
        r = requests.get(url)
        if 'text/html' in r.headers['content-type']:
            # check that it's a notebook
            tmp_html = urlopen(url)
            is_nbviewer = ("Notebook on nbviewer" in tmp_html)
        if is_nbviewer:
            html_url = url
        else:
            html_url = urlparse.urljoin('http://nbviewer.ipython.org',
                                        transform_ipynb_uri(url))
        print('Downloading %s' % html_url)
        html = urlopen(html_url)
    except (urllib2.HTTPError, urllib2.URLError, socket.timeout, ssl.SSLError,
            requests.exceptions.SSLError, requests.sessions.InvalidSchema) as e:
        if nb is not None:
            nb.failures_access += 1
        print('Failed in downloading', e)
        return {'status': 'failure', 'reason': 'Failed accessing the notebook'}

    extracted = extraction.Extractor().extract(html, source_url=html_url)
    if len(extracted.titles) > 1:
        title = extracted.titles[1]
    else:
        title = extracted.descriptions[1]
    words_title = title.split(' ')
    if len(words_title) > 20:
        title = ' '.join(words_title[:20]) + ' ...'
    if len(extracted.descriptions) > 1:
        description = extracted.descriptions[1]
    else:
        description = ''
    words_description = description.split(' ')
    if len(words_description) > 40:
        description = ' '.join(words_description[:40]) + ' ...'

    # some more sanitation
    if title.startswith('This web site does not host'):
        # this is the nbviewer default title
        title = 'No title'
    title = title.strip(u'¶')

    #similar = Notebook.objects.filter(title=title, description=description)
    #if len(Notebook.objects.filter(title=title, description=description)) > 0:
    #    return {'status': 'failure', 'reason': 'duplicate document', 'pk': similar[0].pk}

    if nb is None:
        obj, created = Notebook.objects.get_or_create(url=url)
    else:
        obj = nb
        created = False

    # screenshot
    if screenshot:
        out = make_screenshots(html_url, obj.pk)
        if out['status'] == 'failure':
            if created:
                obj.delete()
            else:
                obj.failures_access += 1
            return out
        else:
            obj.thumb_img = out['thumb']

    # XXX remove assert with error messages
    assert len(title) < 500
    obj.title = title
    assert len(description) < 2000
    obj.description = description
    assert len(html_url) < 1000
    obj.html_url = html_url
    assert len(url) < 1000
    obj.url = url
    obj.full_html = html
    obj.last_accessed_date = datetime.now().date()
    obj.save()

    return {'status': 'success', 'pk': obj.pk, 'created': created}
with open(fname, 'r') as file:
    for line in file:
        urls.append(line[:-1])
print(f"Extracted {len(urls)} URLs from '{fname}'")

savefile = False
if outname is not None:
    file = open(outname, 'w')
    savefile = True

for i, url in enumerate(urls):
    try:
        print(f"Fetching {i+1} of {len(urls)}: {url} ...")
        r = requests.get(url)
        html_data = r.text
        extracted = extraction.Extractor().extract(html_data)
        title, desc, link = [extracted.title, extracted.description, extracted.url]
        title = title + "\n" if title is not None else ""
        desc = desc + "\n" if desc is not None else ""
        link = link + f"\n[{url}]\n" if link is not None else f"[{url}]\n"
        print(title)
        if savefile:
            file.write(title + desc + link + "\n")
        else:
            print(desc)
            print(link)
            print("\n")
    except RequestException as e:
def getURL1(proxyHost, requestType, urlType):
    global timeoutTime
    #print "Testing socks proxy: http://"+proxyHost
    import socks
    import socket
    import urllib2
    hostNo = proxyHost.split(":")[0]
    portNo = proxyHost.split(":")[1]
    #urlList = []
    #urlList.append(["https://www.tracemyip.org/","Trace My IP"])
    #urlList.append(["https://www.wikipedia.org/","Wikipedia"])
    #urlList.append(["http://whatismyipaddress.com/","What Is My IP Address?"])
    global statusCode
    try:
        #print "Testing http proxy: http://"+proxyHost
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        hostNo = proxyHost.split(":")[0]
        portNo = proxyHost.split(":")[1]
        session = requesocks.session()
        session.timeout = timeoutTime
        if urlType == "https":
            urlPosition = urlList[0]
            session.proxies = {'https': 'https://' + hostNo + ':' + portNo}
        if urlType == "http":
            urlPosition = urlList[1]
            session.proxies = {'http': 'http://' + hostNo + ':' + portNo}
        url = urlPosition[0]
        urlTitle = urlPosition[1]
        if requestType == "get":
            r = session.get(url)
            #try:
            extracted = extraction.Extractor().extract(r.text, source_url=url)
            if urlTitle not in extracted.title:
                return proxyHost + "\t" + urlType + "\t503"
        elif requestType == "head":
            r = session.head(url)
        statusCode = str(r.status_code)
        result2 = proxyHost + "\t" + urlType + "\t" + statusCode
        if statusCode != "200":
            result1 = testSocks4(proxyHost, urlType)
            if "503" in str(result1):
                #if options.v:
                #    print result
                result = testSocks5(proxyHost, urlType)
                return result2 + "\n" + result1 + "\n" + result
            else:
                return result1
        else:
            return proxyHost + "\t" + urlType + "\t" + statusCode
    except requests.exceptions.ConnectionError as e:
        return proxyHost + "\t" + urlType + "\t503"
    except Exception as e:
        result2 = proxyHost + "\t" + urlType + "\t503"
        #if options.v:
        #    print proxyHost+"\t"+urlType+"\t503"
        result1 = testSocks4(proxyHost, urlType)
        if "503" in str(result1):
            #if options.v:
            #    print result
            result = testSocks5(proxyHost, urlType)
            #if options.v:
            #    print result
            return result2 + "\n" + result1 + "\n" + result
        else:
            if options.v:
                print(result1)
            return result1
def extractd(url):
    print(type(url))
    html = requests.get(url['url']).text
    extracted = extraction.Extractor().extract(html, source_url=url['url'])
    return extracted
def post(self, site):
    linkID = request.args.get('id', None)
    title = ''
    description = ''
    if 'title' in request.form:
        title = request.form['title']
    if 'description' in request.form:
        description = request.form['description']
    url = request.form['url']
    print('here')
    public = False
    if request.form.get('public') != None:
        public = True

    session = Session()
    link = session.query(Link).filter_by(site=site).filter_by(id=linkID).first()
    if link is None:
        link = Link(site=site)
    else:
        link.title = title
        link.description = description

    if link.url != url:
        # Do content generation
        link.url = url

        # Fetch website html content
        urlsession = requests.Session()
        retry = Retry(connect=3, backoff_factor=2.0)
        adapter = HTTPAdapter(max_retries=retry)
        urlsession.mount('http://', adapter)
        urlsession.mount('https://', adapter)
        html = urlsession.get(link.url).text

        # Extract title, description, and preview image from meta-tags
        extracted = extraction.Extractor().extract(html, source_url=url)
        title = extracted.title or ''
        descrip = extracted.description or ''
        imgurl = extracted.image or ''

        # Fetch image data and resize for s3
        img_data = requests.get(imgurl).content
        if img_data is None:
            return url
        img = Image.open(io.BytesIO(img_data))
        width, height = img.size
        filename = secure_filename(alphaNumericID())
        imgSize = (1000, 1000)
        originalImage = resizeIOImage(img_data, (width, height))
        resizedImage = resizeIOImage(img_data, imgSize)

        # Upload to s3
        uploadImage(originalImage, "%soriginal" % filename)
        uploadImage(resizedImage, filename)
        original_url = "{}{}original.jpeg".format(S3_LOCATION, filename)
        large_url = "{}{}.jpeg".format(S3_LOCATION, filename)

        # Save website content
        link.title = title
        link.description = descrip
        link.source_url = original_url
        link.large_url = large_url

    link.public = public
    session.add(link)
    session.commit()
    linkID = link.id
    session.close()

    track_activity('Updated news link', linkID, 'link', site)
    return redirect(url_for('Links_view', id=linkID, site=site))