Ejemplo n.º 1
0
class TextFromURL:
    """Extracts page text, either via the AlchemyAPI text service or by
    scraping a specific HTML element with BeautifulSoup."""

    def __init__(self):
        # One AlchemyAPI client shared by all extraction calls.
        self.alchemy_api = AlchemyAPI()

    def extract_text(self, url_param):
        """
        This method takes a URL and extracts the text contained in the page referenced by the URL using the alchemy
        text service.

        :param url_param: URL of the page to process
        :return: Returns the parsed text (UTF-8 encoded), or '' when the API call fails
        """
        self.url = url_param
        self.response = self.alchemy_api.text('url', self.url)
        if self.response['status'] == 'OK':
            self.parsed_text = self.response['text'].encode('utf-8')
        else:
            # Reset to an empty result so callers never receive stale text
            # (previously this returned whatever an earlier call had stored,
            # or raised AttributeError on the very first failure).
            self.parsed_text = ''
            print('Error in text extraction call: ', self.response['statusInfo'])
        return self.parsed_text

    def extract_text_by_element_id(self, url_param, html_element_id):
        """
        This method extracts text from a webpage based on the ID of an element within the HTML code. It uses beautiful
        soup to parse the page and extract the content

        :param url_param: URL of the page to fetch
        :param html_element_id: ID of the HTML element where the desired text is located
        :return: text content of the matched element
        """
        f = urllib.urlopen(url_param)
        website = f.read()
        soup = BeautifulSoup(website, 'html.parser')
        self.parsed_text = soup.find(id=html_element_id).get_text()
        return self.parsed_text
Ejemplo n.º 2
0
def render_article(request):
	"""Render an article page, lazily fetching content via AlchemyAPI.

	If the article already has a content field it is rendered as-is;
	otherwise the text and title are pulled from the article's URL through
	AlchemyAPI, saved on the model, and then rendered.
	"""
	article_id = request.POST['articleData']
	article = Article.objects.filter(id=article_id)[0]

	# NOTE: the old debug prints called article.content.encode('utf-8')
	# unconditionally, which crashed with AttributeError whenever content
	# was None (the exact case the branch below is meant to handle).
	if article.content:
		# Content already cached on the model - render directly.
		return render_to_response('article.html', {'id': article.id, 'data': article.content, 'titleText': article.title})
	else:
		testURL = article.url
		# Create AlchemyAPI Object and fetch text + title for the URL.
		# (The previous unused author() call was dropped - its result was
		# never read and cost an extra API round trip.)
		alchemyapi = AlchemyAPI()
		response = alchemyapi.text('url', testURL)
		titleData = alchemyapi.title('url', testURL)
		article.content = response['text'].encode('utf-8')
		article.title = titleData['title'].encode('utf-8')
		article.save()

		# Reuse the freshly-saved fields instead of re-encoding the raw
		# API responses a second time.
		return render_to_response('article.html', {'id': article.id, 'data': article.content, 'titleText': article.title})
Ejemplo n.º 3
0
 def _extract_content_alchemy(self, url):
     """Return the plain text of *url* via the AlchemyAPI text service.

     Returns an empty string when the API reports any status other
     than 'OK'.
     """
     api = AlchemyAPI()
     result = api.text('url', url)
     if result['status'] != 'OK':
         return ''
     return result['text'].encode('utf8')
Ejemplo n.º 4
0
def connect_alchemy(url):
	"""Extract the page text of *url* with AlchemyAPI and tag its keywords.

	:param url: page to analyse
	:return: list of keyword dicts, or [] when either API call fails
	"""
	from alchemyapi import AlchemyAPI
	alchemyapi = AlchemyAPI()

	resp = alchemyapi.text('url', url)
	# Guard the lookup: on failure the response has no 'text' key and the
	# original code raised KeyError.
	if resp.get('status') != 'OK':
		return []

	response = alchemyapi.keywords("text", resp['text'])
	if response.get('status') != 'OK':
		return []

	# The original stored this in a misspelled, unused local ('keywors')
	# and returned None; return the keywords so callers can use them.
	return response["keywords"]
Ejemplo n.º 5
0
def connect_alchemy(url):
    """Extract the page text of *url* with AlchemyAPI and tag its keywords.

    :param url: page to analyse
    :return: list of keyword dicts, or [] when either API call fails
    """
    from alchemyapi import AlchemyAPI
    alchemyapi = AlchemyAPI()

    resp = alchemyapi.text('url', url)
    # Guard the lookup: on failure the response has no 'text' key and the
    # original code raised KeyError.
    if resp.get('status') != 'OK':
        return []

    response = alchemyapi.keywords("text", resp['text'])
    if response.get('status') != 'OK':
        return []

    # The original stored this in a misspelled, unused local ('keywors')
    # and returned None; return the keywords so callers can use them.
    return response["keywords"]
def textScrape(url):
	"""Return the page text of *url*, JSON-encoded as a string.

	Before running this function, you must cd into the folder this function is stored in and run: python alchemyapi.py <YOUR_API_KEY>
	After doing this, the alchmyapi will use your api key and function normally. DO NOT POST THE FILE IT CREATES TO GIT.
	"""
	# Docstring moved to the top of the function so it is the real
	# __doc__ (it previously sat after the imports as a bare string).
	from alchemyapi import AlchemyAPI
	import json

	rawText = alchemyapi.text('url', url)['text']
	# json.dumps already returns a str, so the extra str() wrapper the
	# original applied was a no-op and has been removed.
	return json.dumps(rawText)
Ejemplo n.º 7
0
class App:
    """Pipeline around AlchemyAPI: fetch page text, derive concepts and
    keywords from it, and look up definitions via DuckDuckGo."""

    def __init__(self):
        self.alchemyapi = AlchemyAPI()
        self.raw_text = ''      # UTF-8 encoded page text set by parse_url()
        self.concepts = None    # list of concept dicts once extracted
        self.keywords = None    # list of keyword dicts once extracted

    def parse_url(self, url=None):
        """Fetch and store the plain text of *url* via the Alchemy text service."""
        text_response = self.alchemyapi.text('url', url)
        if text_response['status'] == 'OK':
            self.raw_text = text_response['text'].encode('utf-8')
        else:
            print('Error in text extraction call: ', text_response['statusInfo'])

    def extract_concepts(self):
        """Run concept tagging over the stored text and cache the result."""
        concept_response = self.alchemyapi.concepts('text', self.raw_text)
        if concept_response['status'] == 'OK':
            self.concepts = concept_response['concepts']
        else:
            print('Error in concept tagging call: ', concept_response['statusInfo'])

    def extract_keywords(self):
        """Run keyword extraction (with sentiment enabled) over the stored text."""
        keyword_response = self.alchemyapi.keywords('text', self.raw_text, {'sentiment': 1})
        if keyword_response['status'] == 'OK':
            self.keywords = keyword_response['keywords']
        else:
            print('Error in keyword extraction call: ', keyword_response['statusInfo'])

    def define_concepts(self):
        """Print a DuckDuckGo definition for each extracted concept."""
        # Guard: self.concepts is still None if extract_concepts() has not
        # run (or failed); the original raised TypeError iterating None.
        for concept in self.concepts or []:
            definition = duckduckgo.get_zci(concept['text'])
            print('%s -> %s' % (concept['text'], definition))
            print('')

    def define_keywords(self):
        """Print a DuckDuckGo definition for each extracted keyword."""
        # Same None guard as define_concepts().
        for keyword in self.keywords or []:
            definition = duckduckgo.get_zci(keyword['text'])
            print('%s -> %s' % (keyword['text'], definition))
            print('')
Ejemplo n.º 8
0
class Alchemy(object):
    """Calls the AlchemyAPI for reading Atom/RSS feeds."""

    def __init__(self):
        # Single AlchemyAPI client shared by every helper method.
        self.alchemy_api = AlchemyAPI()

    def processa_html(self, link):
        """Return the clean article text extracted from the URL *link*."""
        resposta = self.alchemy_api.text('url', link)
        return resposta['text']

    def obtem_titulo(self, link):
        """Return the page title extracted from the URL *link*."""
        resposta = self.alchemy_api.title('url', link)
        return resposta['title']

    def obtem_entidades(self, texto):
        """Return the entities found in *texto*, with sentiment analysis enabled."""
        return self.alchemy_api.entities('text', texto, {'sentiment': 1})
Ejemplo n.º 9
0
# NOTE(review): fragment of a larger regression script - alchemyapi,
# test_text, test_html and test_url come from the enclosing scope.
# Each call pairs an input flavor with a target and asserts the status
# the API is expected to return.
response = alchemyapi.sentiment_targeted('html', test_html, 'language');
assert(response['status'] == 'OK')
response = alchemyapi.sentiment_targeted('url', test_url, 'Congress');
assert(response['status'] == 'OK')
response = alchemyapi.sentiment_targeted('random', test_url, 'Congress');
assert(response['status'] == 'ERROR') 	#invalid flavor
response = alchemyapi.sentiment_targeted('text', test_text,  None);
assert(response['status'] == 'ERROR') 	#missing target
print('Targeted sentiment tests complete!')
print('')



#Text
# The text-extraction endpoint rejects the 'text' flavor, hence the
# expected ERROR on the first call below.
print('Checking text . . . ')
response = alchemyapi.text('text', test_text);
assert(response['status'] == 'ERROR')	#only works for html and url content
response = alchemyapi.text('html', test_html);
assert(response['status'] == 'OK')
response = alchemyapi.text('url', test_url);
assert(response['status'] == 'OK')
print('Text tests complete!')
print('')



#Text Raw
# Same flavor restriction applies to the raw-text endpoint.
print('Checking raw text . . . ')
response = alchemyapi.text_raw('text', test_text);
assert(response['status'] == 'ERROR')	#only works for html and url content
response = alchemyapi.text_raw('html', test_html);
Ejemplo n.º 10
0
# NOTE(review): fragment of a larger interactive demo - alchemyapi,
# demo_url and json come from the enclosing scope. raw_input indicates
# this script targets Python 2.
wait = raw_input('press enter to continue')

print('')
print('')
print('')
print('############################################')
print('#   Text Extraction Example                #')
print('############################################')
print('')
print('')

print('Processing url: ', demo_url)
print('')

# Call the text-extraction endpoint for the demo URL.
response = alchemyapi.text('url', demo_url)

if response['status'] == 'OK':
    # Dump the full response, then just the extracted text.
    print('## Response Object ##')
    print(json.dumps(response, indent=4))

    print('')
    print('## Text ##')
    print('text: ', response['text'].encode('utf-8'))
    print('')
else:
    print('Error in text extraction call: ', response['statusInfo'])

wait = raw_input('press enter to continue')

print('')
Ejemplo n.º 11
0
import pymongo
import newspaper
import json

# connect to database
connection = pymongo.MongoClient("mongodb://localhost")
# NOTE(review): AlchemyAPI is imported elsewhere in the original project.
alchemyapi = AlchemyAPI()

# db = connection.test
# alchemyData = db.alchemyData

# cnn_paper = newspaper.build('http://cnn.com', memoize_articles=False) # building source
# url = cnn_paper.articles[1].url	# getting url
# Hard-coded sample article URL (the newspaper-based discovery above is
# commented out).
url = 'http://www.cnn.com/interactive/2015/08/health/elizabeth-explains-allergies/'

# Extract the article text; article_text stays empty on failure.
response = alchemyapi.text('url', url)
article_text = ""

if response['status'] == 'OK':
    # print('## Response Object ##')
    # print(json.dumps(response, indent=4))

    # print('')
    # print('## Text ##')
    # print('text: ', response['text'].encode('utf-8'))
    article_text = response['text'].encode('utf-8')
    # print('')
else:
    print('Error in text extraction call: ', response['statusInfo'])

Ejemplo n.º 12
0
# NOTE(review): fragment of a larger regression script - alchemyapi,
# test_text, test_html and test_url come from the enclosing scope.
# Each call pairs an input flavor with a target and asserts the status
# the API is expected to return.
response = alchemyapi.sentiment_targeted('text', test_text, 'heart')
assert (response['status'] == 'OK')
response = alchemyapi.sentiment_targeted('html', test_html, 'language')
assert (response['status'] == 'OK')
response = alchemyapi.sentiment_targeted('url', test_url, 'Congress')
assert (response['status'] == 'OK')
response = alchemyapi.sentiment_targeted('random', test_url, 'Congress')
assert (response['status'] == 'ERROR')  #invalid flavor
response = alchemyapi.sentiment_targeted('text', test_text, None)
assert (response['status'] == 'ERROR')  #missing target
print('Targeted sentiment tests complete!')
print('')

#Text
# The text-extraction endpoint rejects the 'text' flavor, hence the
# expected ERROR on the first call below.
print('Checking text . . . ')
response = alchemyapi.text('text', test_text)
assert (response['status'] == 'ERROR')  #only works for html and url content
response = alchemyapi.text('html', test_html)
assert (response['status'] == 'OK')
response = alchemyapi.text('url', test_url)
assert (response['status'] == 'OK')
print('Text tests complete!')
print('')

#Text Raw
# Same flavor restriction applies to the raw-text endpoint.
print('Checking raw text . . . ')
response = alchemyapi.text_raw('text', test_text)
assert (response['status'] == 'ERROR')  #only works for html and url content
response = alchemyapi.text_raw('html', test_html)
assert (response['status'] == 'OK')
response = alchemyapi.text_raw('url', test_url)
Ejemplo n.º 13
0
import StringIO
import re
import csv

# NOTE(review): the StringIO module means this script targets Python 2.
alchemyapi = AlchemyAPI()

# Matches runs of letters; used to strip the prose and keep the numeric
# table data. Compiled once, outside the loop (it is loop-invariant).
pattern = re.compile(r'[A-Za-z]{1,}')

# Scrape the zipatlas.com unemployment-rate comparison tables; the
# listing spans 318 pages in total.
for num in range(1, 318 + 1):
	# Build the page URL: the first page has no page number in its name
	# (".htm" instead of ".2.htm", ".3.htm", ...).
	test_url = 'http://zipatlas.com/us/zip-code-comparison/unemployment-rate.'
	if num > 1:
		test_url = test_url + str(num) + '.'
	test_url = test_url + 'htm'
	print('Checking text . . . ' + str(num))
	# use alchemyapi to extract the text in the webpage
	response = alchemyapi.text('url', test_url)
	#assert(response['status'] == 'OK')
	print('Text tests complete!' + str(num))
	# Wrap the extracted text in a file-like object so it can be
	# iterated line by line; response.get('text') is None on failure.
	text = StringIO.StringIO(response.get('text'))
	# Append the letter-stripped lines to the output file. The original
	# mixed tabs and spaces in this suite (a TabError under Python 3);
	# the indentation is now consistent.
	with open('unemployment.txt', 'a') as datafile:
		for line in iter(text):
			newline = pattern.sub('', line)
			datafile.write(newline)
Ejemplo n.º 14
0
        # NOTE(review): fragment of a larger Python 2 loop body - j, i,
        # doc_response, response, alchemyapi and datetime come from the
        # enclosing scope.
        doc_sentiment = None
        doc_score = None
        doc_mixed = None
        if doc_response['status'] == 'OK':
            doc_sentiment = doc_response['docSentiment']['type']
            # 'score' and 'mixed' are optional fields of docSentiment,
            # so each is guarded before being read.
            if 'score' in doc_response['docSentiment']:
                doc_score = doc_response['docSentiment']['score']
            if 'mixed' in doc_response['docSentiment']:
                doc_mixed = doc_response['docSentiment']['mixed']
        else:
            print j + i + 1, 'Error in sentiment analysis call: ', doc_response[
                'statusInfo']

        # Getting plain text of the web page
        # (content_text stays empty when the extraction call fails).
        content_text = ""
        text_response = alchemyapi.text('url', response['items'][i]['link'])
        if text_response['status'] == 'OK':
            content_text = text_response['text']
        else:
            print j + i + 1, 'Error in text extraction call: ', text_response[
                'statusInfo']

        # Getting current data and time
        date_time = datetime.datetime.now()

        # Store data into database
        title = response['items'][i]['title']
        link = response['items'][i]['link']

        # Debug code to show size of web page HTML and text
        #		print sys.getsizeof(content_HTML)
Ejemplo n.º 15
0
class Extraction:
    # Wraps AlchemyAPI text/author/title extraction for a single URL and
    # splits the extracted text into quote-free sentences.
    def __init__(self, url):
        """Store *url* and configure an AlchemyAPI client with JSON output."""
        self.alchemyAPI = AlchemyAPI()
        self.alchemyAPI.outputMode = 'json'
        self.url = url
        # must call extraction after initialization

    """
    Goes through all URL processing routines for the constructor-specified URL
    """
    def processText(self):
        # Populates self.sentences, self.author and self.title from the
        # constructor-supplied URL.
        text = self.__extractText(self.url)
        self.sentences = self.__sbdText(text)
        self.author    = self.__extractAuthor(self.url)
        self.title     = self.__extractTitle(self.url)


    """
    Calls AlchemyAPI to extract the text from the given article
    """
    def __extractText(self, url):
        if url is None or url == "":
            raise InputException("Invalid URL")

        # A non-OK status is only warned about; the 'text' lookup below
        # may then raise KeyError.
        response = self.alchemyAPI.text('url', url)
        if response['status'] != 'OK':
            warn(response['statusInfo'])

        return response['text'].encode('utf-8')

    """
    Calls AlchemyAPI to extract the author of the article.
    """
    def __extractAuthor(self, url):
        if url is None or url == "":
            raise InputException("Invalid URL")

        response = self.alchemyAPI.author('url', url)
        if response['status'] != 'OK':
            warn(response['statusInfo'])

        return response['author'].encode('utf-8')

    """
    Gets the article title with
    """
    def __extractTitle(self, url):
        if url is None or url == "":
            raise InputException("Invalid URL")

        response = self.alchemyAPI.title('url', url)
        if response['status'] != 'OK':
            warn(response['statusInfo'])
        return response['title'].encode('utf-8')


    """
    Applies a sentence boundary disambiguation algorithm to the extracted
    article text. We then have access to the individual sentences of the article.
    From there any quotes are removed, so sentiment analysis is performed on the writer's
    additions only.
    """
    def __sbdText(self, extractedText):
        import re
        # Splits on inter-sentence whitespace, guarded by lookbehinds so
        # common honorific abbreviations do not end a sentence.
        sentenceEnders = re.compile(r"""
            # Split sentences on whitespace between them.
            (?:               # Group for two positive lookbehinds.
              (?<=[.!?])      # Either an end of sentence punct,
            | (?<=[.!?]['"])  # or end of sentence punct and quote.
            )                 # End group of two positive lookbehinds.
            (?<!  Mr\.   )    # Don't end sentence on "Mr."
            (?<!  Mrs\.  )    # Don't end sentence on "Mrs."
            (?<!  Jr\.   )    # Don't end sentence on "Jr."
            (?<!  Dr\.   )    # Don't end sentence on "Dr."
            (?<!  Prof\. )    # Don't end sentence on "Prof."
            (?<!  Sr\.   )    # Don't end sentence on "Sr."
            \s+               # Split on whitespace between sentences.
            """,
        re.IGNORECASE | re.VERBOSE)
        sentenceList = sentenceEnders.split(extractedText)

        """
        remove any quotes by recognizing ascii/unicode double sentences.
        any quotes within sentences are left, because this paraphrasing/choice
        is still somewhat indicative of possible bias
        """
        # NOTE(review): the [:3] slice only matches the curly quote when
        # sentences are UTF-8 byte strings (the curly quote is 3 bytes) -
        # i.e. this assumes Python 2 and the .encode('utf-8') upstream.
        # Under Python 3 the [:3] comparison never matches; confirm the
        # target interpreter before changing this.
        for sentence in list(sentenceList):
            if sentence[:3] == "“" or sentence[:1] == '"': # “ = unicode representation of slanted double quote
                sentenceList.remove(sentence)

        return sentenceList
Ejemplo n.º 16
0


print('')
print('')
print('')
print('############################################')
print('#   Text Extraction Example                #')
print('############################################')
print('')
print('')

print('Processing url: ', demo_url)
print('')

response = alchemyapi.text('url',demo_url)

if response['status'] == 'OK':
	print('## Response Object ##')
	print(json.dumps(response, indent=4))

	print('')
	print('## Text ##')
	print('text: ', response['text'].encode('utf-8'))
	print('')
else:
	print('Error in text extraction call: ', response['statusInfo'])



print('')
Ejemplo n.º 17
0
    # NOTE(review): this first line is the tail of an else-branch whose
    # 'if' lies outside this fragment.
    print("Error in targeted sentiment analysis call: ", response["statusInfo"])


print("")
print("")
print("")
print("############################################")
print("#   Text Extraction Example                #")
print("############################################")
print("")
print("")

print("Processing url: ", demo_url)
print("")

# Call the text-extraction endpoint for the demo URL (alchemyapi,
# demo_url and json come from the enclosing scope).
response = alchemyapi.text("url", demo_url)

if response["status"] == "OK":
    # Dump the full response, then just the extracted text.
    print("## Response Object ##")
    print(json.dumps(response, indent=4))

    print("")
    print("## Text ##")
    print("text: ", response["text"].encode("utf-8"))
    print("")
else:
    print("Error in text extraction call: ", response["statusInfo"])


print("")
print("")