class TextFromURL:
    """Extracts page text either through the AlchemyAPI text service or by
    scraping a specific HTML element with BeautifulSoup."""

    def __init__(self):
        # Single AlchemyAPI client shared by all extraction calls.
        self.alchemy_api = AlchemyAPI()

    def extract_text(self, url_param):
        """
        Extract the text contained in the page referenced by the URL using
        the AlchemyAPI text service.

        :param url_param: URL of the page to process
        :return: the parsed text (UTF-8 encoded), or '' when the call fails
        """
        self.url = url_param
        # Fix: initialise the result so the attribute is always defined;
        # the original raised AttributeError on the error path. Also
        # removed a dead json.dumps(...) call whose result was discarded.
        self.parsed_text = ''
        self.response = self.alchemy_api.text('url', self.url)
        if self.response['status'] == 'OK':
            self.parsed_text = self.response['text'].encode('utf-8')
        else:
            print('Error in text extraction call: ', self.response['statusInfo'])
        return self.parsed_text

    def extract_text_by_element_id(self, url_param, html_element_id):
        """
        Extract text from a webpage based on the ID of an element within
        the HTML code, using BeautifulSoup to parse the page.

        :param url_param: URL of the page to process
        :param html_element_id: ID of the HTML element where the desired
            text is located
        :return: the text content of that element
        """
        # NOTE(review): urllib.urlopen is the Python 2 API — on Python 3
        # this would be urllib.request.urlopen; confirm target runtime.
        f = urllib.urlopen(url_param)
        try:
            website = f.read()
        finally:
            # Fix: the original leaked the open URL handle.
            f.close()
        soup = BeautifulSoup(website, 'html.parser')
        self.parsed_text = soup.find(id=html_element_id).get_text()
        return self.parsed_text
def render_article(request):
    """Django view: render an article, fetching its content through
    AlchemyAPI on first access and caching it on the model.

    If the article already has a content field, render it as-is;
    otherwise call the Alchemy text/title services, save the result,
    and render that.

    :param request: POST request carrying the article id in 'articleData'
    :return: rendered 'article.html' response
    """
    article_id = request.POST['articleData']
    # Fix: .get() raises a descriptive DoesNotExist instead of a bare
    # IndexError from filter(...)[0] when the id is unknown.
    article = Article.objects.get(id=article_id)
    # Fix: removed debug prints — article.content.encode(...) crashed
    # with AttributeError whenever content was still None/empty.
    if article.content:
        return render_to_response('article.html',
                                  {'id': article.id,
                                   'data': article.content,
                                   'titleText': article.title})
    # First access: pull text and title from AlchemyAPI.
    # (Fix: dropped the unused alchemyapi.author(...) call.)
    alchemyapi = AlchemyAPI()
    response = alchemyapi.text('url', article.url)
    titleData = alchemyapi.title('url', article.url)
    # Encode once and reuse — the original encoded each value twice.
    content = response['text'].encode('utf-8')
    title = titleData['title'].encode('utf-8')
    article.content = content
    article.title = title
    article.save()
    return render_to_response('article.html',
                              {'id': article.id,
                               'data': content,
                               'titleText': title})
def _extract_content_alchemy(self, url):
    """Fetch the readable text of *url* via the AlchemyAPI text service.

    Returns the UTF-8 encoded text, or an empty string when the service
    reports anything other than an OK status.
    """
    api = AlchemyAPI()
    result = api.text('url', url)
    if result['status'] != 'OK':
        return ''
    return result['text'].encode('utf8')
def connect_alchemy(url):
    """Connect to AlchemyAPI, extract the page text for *url*, and tag it.

    :param url: URL of the page to analyse
    :return: list of keyword dicts found in the page text; [] on failure
    """
    from alchemyapi import AlchemyAPI
    alchemyapi = AlchemyAPI()
    resp = alchemyapi.text('url', url)
    # Robustness: bail out early instead of indexing a failed response.
    if resp.get('status') != 'OK':
        return []
    response = alchemyapi.keywords("text", resp['text'])
    if response.get('status') != 'OK':
        return []
    # Fix: the original bound the result to a misspelled, unused local
    # ('keywors') and returned nothing.
    return response["keywords"]
def textScrape(url):
    """Scrape the readable text of *url* using the AlchemyAPI text service.

    Before running this function, you must cd into the folder this
    function is stored in and run:

        python alchemyapi.py <YOUR_API_KEY>

    After doing this, the alchemyapi will use your api key and function
    normally. DO NOT POST THE FILE IT CREATES TO GIT.

    :param url: URL of the page to scrape
    :return: the extracted text as a plain string ('' on failure)
    """
    from alchemyapi import AlchemyAPI
    alchemyapi = AlchemyAPI()
    response = alchemyapi.text('url', url)
    # Robustness: a failed call has no 'text' key — return '' instead of
    # raising KeyError.
    if response.get('status') != 'OK':
        return ''
    # Fix: the original json.dumps()-ed the text, which merely wrapped it
    # in quotes and escape sequences; return the raw text itself.
    return str(response['text'])
class App:
    """Pulls page text through AlchemyAPI and annotates it with concepts
    and keywords, optionally printing a DuckDuckGo definition for each."""

    def __init__(self):
        self.alchemyapi = AlchemyAPI()
        self.raw_text = ''
        self.concepts = None
        self.keywords = None

    def parse_url(self, url=None):
        """Fetch the page text for *url* and cache it on the instance."""
        response = self.alchemyapi.text('url', url)
        if response['status'] != 'OK':
            print('Error in text extraction call: ', response['statusInfo'])
        else:
            self.raw_text = response['text'].encode('utf-8')

    def extract_concepts(self):
        """Run concept tagging over the cached text."""
        response = self.alchemyapi.concepts('text', self.raw_text)
        if response['status'] != 'OK':
            print('Error in concept tagging call: ', response['statusInfo'])
        else:
            self.concepts = response['concepts']

    def extract_keywords(self):
        """Run keyword extraction (with sentiment) over the cached text."""
        response = self.alchemyapi.keywords('text', self.raw_text, {'sentiment': 1})
        if response['status'] != 'OK':
            print('Error in keyword extraction call: ', response['statusInfo'])
        else:
            self.keywords = response['keywords']

    def define_concepts(self):
        """Print a DuckDuckGo zero-click definition for every concept."""
        for item in self.concepts:
            meaning = duckduckgo.get_zci(item['text'])
            print('%s -> %s' % (item['text'], meaning))
            print('')

    def define_keywords(self):
        """Print a DuckDuckGo zero-click definition for every keyword."""
        for item in self.keywords:
            meaning = duckduckgo.get_zci(item['text'])
            print('%s -> %s' % (item['text'], meaning))
            print('')
class Alchemy(object):
    """Thin AlchemyAPI wrapper used when reading Atom/RSS feed pages."""

    def __init__(self):
        # One shared AlchemyAPI client for all calls.
        self.alchemy_api = AlchemyAPI()

    def processa_html(self, link):
        """Return the cleaned text extracted from the page at *link*."""
        resposta = self.alchemy_api.text('url', link)
        return resposta['text']

    def obtem_titulo(self, link):
        """Return the title of the page at *link*."""
        resposta = self.alchemy_api.title('url', link)
        return resposta['title']

    def obtem_entidades(self, texto):
        """Return the entities found in *texto*, with sentiment enabled."""
        return self.alchemy_api.entities('text', texto, {'sentiment': 1})
# Targeted sentiment: html and url flavors succeed; an unknown flavor or
# a missing target must be rejected.
response = alchemyapi.sentiment_targeted('html', test_html, 'language')
assert response['status'] == 'OK'
response = alchemyapi.sentiment_targeted('url', test_url, 'Congress')
assert response['status'] == 'OK'
# invalid flavor
response = alchemyapi.sentiment_targeted('random', test_url, 'Congress')
assert response['status'] == 'ERROR'
# missing target
response = alchemyapi.sentiment_targeted('text', test_text, None)
assert response['status'] == 'ERROR'
print('Targeted sentiment tests complete!')
print('')

# Text extraction: only works for html and url content.
print('Checking text . . . ')
response = alchemyapi.text('text', test_text)
assert response['status'] == 'ERROR'
response = alchemyapi.text('html', test_html)
assert response['status'] == 'OK'
response = alchemyapi.text('url', test_url)
assert response['status'] == 'OK'
print('Text tests complete!')
print('')

# Raw text extraction: likewise html/url only.
print('Checking raw text . . . ')
response = alchemyapi.text_raw('text', test_text)
assert response['status'] == 'ERROR'
response = alchemyapi.text_raw('html', test_html)
# Pause so the user can read the previous section, then show the banner.
wait = raw_input('press enter to continue')
for _ in range(3):
    print('')
print('############################################')
print('# Text Extraction Example #')
print('############################################')
print('')
print('')

print('Processing url: ', demo_url)
print('')

# Extract the page text and dump both the raw response and the text.
response = alchemyapi.text('url', demo_url)
if response['status'] == 'OK':
    print('## Response Object ##')
    print(json.dumps(response, indent=4))
    print('')
    print('## Text ##')
    print('text: ', response['text'].encode('utf-8'))
    print('')
else:
    print('Error in text extraction call: ', response['statusInfo'])

wait = raw_input('press enter to continue')
print('')
import pymongo
import newspaper
import json

# Connect to the local MongoDB instance.
connection = pymongo.MongoClient("mongodb://localhost")

alchemyapi = AlchemyAPI()

# Target article URL (a newspaper.build() source could supply this
# dynamically instead of hard-coding one page).
url = 'http://www.cnn.com/interactive/2015/08/health/elizabeth-explains-allergies/'

# Pull the readable text of the page through AlchemyAPI.
response = alchemyapi.text('url', url)
article_text = ""
if response['status'] != 'OK':
    print('Error in text extraction call: ', response['statusInfo'])
else:
    article_text = response['text'].encode('utf-8')
# Targeted sentiment: all three valid flavors succeed; an unknown flavor
# or a missing target must come back as an error.
response = alchemyapi.sentiment_targeted('text', test_text, 'heart')
assert response['status'] == 'OK'
response = alchemyapi.sentiment_targeted('html', test_html, 'language')
assert response['status'] == 'OK'
response = alchemyapi.sentiment_targeted('url', test_url, 'Congress')
assert response['status'] == 'OK'
# invalid flavor
response = alchemyapi.sentiment_targeted('random', test_url, 'Congress')
assert response['status'] == 'ERROR'
# missing target
response = alchemyapi.sentiment_targeted('text', test_text, None)
assert response['status'] == 'ERROR'
print('Targeted sentiment tests complete!')
print('')

# Text extraction: only works for html and url content.
print('Checking text . . . ')
response = alchemyapi.text('text', test_text)
assert response['status'] == 'ERROR'
response = alchemyapi.text('html', test_html)
assert response['status'] == 'OK'
response = alchemyapi.text('url', test_url)
assert response['status'] == 'OK'
print('Text tests complete!')
print('')

# Raw text extraction: likewise html/url only.
print('Checking raw text . . . ')
response = alchemyapi.text_raw('text', test_text)
assert response['status'] == 'ERROR'
response = alchemyapi.text_raw('html', test_html)
assert response['status'] == 'OK'
response = alchemyapi.text_raw('url', test_url)
import StringIO
import re
import csv

alchemyapi = AlchemyAPI()

# Fix: compile once — the original recompiled this pattern on every page.
# It strips alphabetic runs, leaving the numeric table data.
pattern = re.compile(r'[A-Za-z]{1,}')

BASE_URL = 'http://zipatlas.com/us/zip-code-comparison/unemployment-rate.'
TOTAL_PAGES = 318

# Fix: open the output file once; the original reopened it in append
# mode for every one of the 318 pages.
with open('unemployment.txt', 'a') as datafile:
    for num in range(1, TOTAL_PAGES + 1):
        # Page 1 has no page number in its URL; pages 2..318 do.
        page_url = BASE_URL if num == 1 else BASE_URL + str(num) + '.'
        page_url = page_url + 'htm'

        print('Checking text . . . ' + str(num))
        # Use AlchemyAPI to extract the text in the webpage.
        response = alchemyapi.text('url', page_url)
        print('Text tests complete!' + str(num))

        # Robustness: a failed call has no 'text' key — the original
        # handed None to StringIO. Treat it as an empty page.
        text = StringIO.StringIO(response.get('text') or '')

        # Remove alphabetic characters from each line and append the rest.
        for line in iter(text):
            datafile.write(pattern.sub('', line))
# NOTE(review): fragment of a larger fetch loop — relies on i, j,
# response and doc_response being set by the enclosing code (not in view).

# Default the sentiment fields so they are defined even when the
# sentiment call failed.
doc_sentiment = None
doc_score = None
doc_mixed = None
if doc_response['status'] == 'OK':
    doc_sentiment = doc_response['docSentiment']['type']
    # 'score' and 'mixed' are optional keys in the docSentiment payload.
    if 'score' in doc_response['docSentiment']:
        doc_score = doc_response['docSentiment']['score']
    if 'mixed' in doc_response['docSentiment']:
        doc_mixed = doc_response['docSentiment']['mixed']
else:
    print j + i + 1, 'Error in sentiment analysis call: ', doc_response[
        'statusInfo']

# Getting plain text of the web page
content_text = ""
text_response = alchemyapi.text('url', response['items'][i]['link'])
if text_response['status'] == 'OK':
    content_text = text_response['text']
else:
    print j + i + 1, 'Error in text extraction call: ', text_response[
        'statusInfo']

# Getting current data and time
date_time = datetime.datetime.now()

# Store data into database
title = response['items'][i]['title']
link = response['items'][i]['link']

# Debug code to show size of web page HTML and text
# print sys.getsizeof(content_HTML)
class Extraction:
    """Fetches an article's text, author and title through AlchemyAPI and
    splits the text into quote-free sentences for later analysis."""

    def __init__(self, url):
        self.alchemyAPI = AlchemyAPI()
        self.alchemyAPI.outputMode = 'json'
        self.url = url
        # must call extraction after initialization

    """ Goes through all URL processing routines for the constructor-specified URL """
    def processText(self):
        text = self.__extractText(self.url)
        self.sentences = self.__sbdText(text)
        self.author = self.__extractAuthor(self.url)
        self.title = self.__extractTitle(self.url)

    """ Calls AlchemyAPI to extract the text from the given article """
    def __extractText(self, url):
        if url is None or url == "":
            raise InputException("Invalid URL")
        response = self.alchemyAPI.text('url', url)
        # NOTE(review): warn()s on failure but still indexes the payload —
        # a failed call may KeyError on 'text' below.
        if response['status'] != 'OK':
            warn(response['statusInfo'])
        return response['text'].encode('utf-8')

    """ Calls AlchemyAPI to extract the author of the article. """
    def __extractAuthor(self, url):
        if url is None or url == "":
            raise InputException("Invalid URL")
        response = self.alchemyAPI.author('url', url)
        if response['status'] != 'OK':
            warn(response['statusInfo'])
        return response['author'].encode('utf-8')

    """ Gets the article title with """
    def __extractTitle(self, url):
        if url is None or url == "":
            raise InputException("Invalid URL")
        response = self.alchemyAPI.title('url', url)
        if response['status'] != 'OK':
            warn(response['statusInfo'])
        return response['title'].encode('utf-8')

    """ Applies a sentence boundary disambiguation algorithm to the extracted article text. We then have access to the individual sentences of the article. From there any quotes are removed, so sentiment analysis is performed on the writer's additions only. """
    def __sbdText(self, extractedText):
        import re
        # Verbose-mode pattern: whitespace and #-comments inside it are
        # ignored by re.VERBOSE, so only the tokens matter.
        sentenceEnders = re.compile(r"""
            # Split sentences on whitespace between them.
            (?:               # Group for two positive lookbehinds.
              (?<=[.!?])      # Either an end of sentence punct,
            | (?<=[.!?]['"])  # or end of sentence punct and quote.
            )                 # End group of two positive lookbehinds.
            (?<! Mr\. )       # Don't end sentence on "Mr."
            (?<! Mrs\. )      # Don't end sentence on "Mrs."
            (?<! Jr\. )       # Don't end sentence on "Jr."
            (?<! Dr\. )       # Don't end sentence on "Dr."
            (?<! Prof\. )     # Don't end sentence on "Prof."
            (?<! Sr\. )       # Don't end sentence on "Sr."
            \s+               # Split on whitespace between sentences.
            """, re.IGNORECASE | re.VERBOSE)
        sentenceList = sentenceEnders.split(extractedText)
        """ remove any quotes by recognizing ascii/unicode double sentences. any quotes within sentences are left, because this paraphrasing/choice is still somewhat indicative of possible bias """
        # Iterate over a copy so removal doesn't skip elements.
        for sentence in list(sentenceList):
            # NOTE(review): sentence[:3] matching "“" assumes Python 2
            # byte strings (3-byte UTF-8 curly quote) — confirm runtime.
            if sentence[:3] == "“" or sentence[:1] == '"':
                # “ = unicode representation of slanted double quote
                sentenceList.remove(sentence)
        return sentenceList
# Blank lines plus a banner to introduce the next demo section.
for _ in range(3):
    print('')
print('############################################')
print('# Text Extraction Example #')
print('############################################')
print('')
print('')

print('Processing url: ', demo_url)
print('')

# Extract the page text and dump both the raw response and the text.
response = alchemyapi.text('url', demo_url)
if response['status'] == 'OK':
    print('## Response Object ##')
    print(json.dumps(response, indent=4))
    print('')
    print('## Text ##')
    print('text: ', response['text'].encode('utf-8'))
    print('')
else:
    print('Error in text extraction call: ', response['statusInfo'])

print('')
print("Error in targeted sentiment analysis call: ", response["statusInfo"]) print("") print("") print("") print("############################################") print("# Text Extraction Example #") print("############################################") print("") print("") print("Processing url: ", demo_url) print("") response = alchemyapi.text("url", demo_url) if response["status"] == "OK": print("## Response Object ##") print(json.dumps(response, indent=4)) print("") print("## Text ##") print("text: ", response["text"].encode("utf-8")) print("") else: print("Error in text extraction call: ", response["statusInfo"]) print("") print("")