Example #1
def load_positivity():
    '''
    Loads positivity data from the Stanford IMDB dataset into memory as two lists: one of text and the
    other of binary labels (1 = positive, 0 = negative).
    
    This function concatenates the presorted testing and training data into one group.
    
    Output:
        quotes (text data), positivity (binary data)
    
    '''
    
    quotes, positivity = [], []
    
    for path in ['../Data/Training_Data/positivity/train/', './Data/Training_Data/positivity/test/']:
        # load all positive quotes
        for filename in os.listdir(path + 'pos/'):
            if(re.search(u'.txt', filename) is not None):
                with open(path + 'pos/' + filename) as f:
                    data="".join(line.rstrip() for line in f)
                quotes.append(plaintext(data))
                positivity.append(1)
        
        # load all negative quotes
        for filename in os.listdir(path + 'neg/'):
            if(re.search(u'.txt', filename) is not None):
                with open(path + 'neg/' + filename) as f:
                    data="".join(line.rstrip() for line in f)
                quotes.append(plaintext(data))
                positivity.append(0)
    
    return quotes, np.array(positivity)
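
For context, a minimal sketch of the imports the loader above appears to assume (the dataset paths are the original author's and will differ); the last line simply illustrates what pattern.web's plaintext() does to markup.

import os
import re
import numpy as np
from pattern.web import plaintext

print(plaintext('<p>A <b>great</b> movie.</p>'))  # -> 'A great movie.'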
Example #2
def get_pattern_data(search_param):

    twitter = Twitter(language='en')

    for tweet in twitter.search(search_param, cached=True):
        print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))

    g = Graph()
    for i in range(10):
        for result in twitter.search(search_param, start=i + 1, count=50):
            s = result.text.lower()
            s = plaintext(s)
            s = parsetree(s)
            p = '{NP} (VP) ' + search_param + ' {NP}'
            for m in search(p, s):
                x = m.group(1).string  # NP left
                y = m.group(2).string  # NP right
                if x not in g:
                    g.add_node(x)
                if y not in g:
                    g.add_node(y)
                g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75))  # R,G,B,A

    #if len(g)>0:
    #   g = g.split()[0] # Largest subgraph.

    for n in g.sorted()[:40]:  # Sort by Node.weight.
        n.fill = (0, 0.5, 1, 0.75 * n.weight)

    g.export('data', directed=False, weighted=0.6)
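
The snippet above spans several pattern submodules; a sketch of the imports it presumably relies on (the original file's import block is not shown, so treat the module list as an assumption):

from pattern.web import Twitter, plaintext
from pattern.en import parsetree
from pattern.search import search
from pattern.graph import Graph

# Graph.export() writes an HTML/canvas visualisation of the graph into the given folder.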
Example #3
	def construct_threads(self):
		for i in self.dump.by_tag("div.thread"):
			cur_thread = msg_classes.Thread()
			cur_thread.p1 = self.p1
			thread_exists = False
			if plaintext(i.by_tag("span.profile fn")[0].content) == self.p1: 
				cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[1].content)
			else:
				cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[0].content)
			# TODO if p1 and p2 have the same name, error!
			# assert cur_thread.p1 != cur_thread.p2 
			for e in i.by_tag("div.message"):
				cur_thread.add_message(
						plaintext(e.by_tag("div.from")[0].content).encode("utf-8"), 
						e.by_tag("abbr.time published")[0].attributes['title'].encode("utf-8"),
						plaintext(e.by_tag("div.msgbody")[0].content).encode("utf-8")
						)
			cur_thread.construct_conversations() 
			for t in self.threads:
				if t.p2 == cur_thread.p2:
					thread_exists = True 
					t.combine(cur_thread)

			if not thread_exists:
				self.threads.append(cur_thread) 
Example #4
def get_pattern_data(search_param):
   
   twitter = Twitter(language='en') 
   
   for tweet in twitter.search(search_param, cached=True):
      print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8'))
   

   g = Graph()
   for i in range(10):
      for result in twitter.search(search_param, start=i+1,count=50):
         s = result.text.lower() 
         s = plaintext(s)
         s = parsetree(s)
         p = '{NP} (VP) ' +search_param+ ' {NP}'
         for m in search(p, s):
            x = m.group(1).string # NP left
            y = m.group(2).string # NP right
            if x not in g:
               g.add_node(x)
            if y not in g:
               g.add_node(y)
            g.add_edge(g[x], g[y], stroke=(0,0,0,0.75)) # R,G,B,A

   #if len(g)>0:   
   #   g = g.split()[0] # Largest subgraph.

   for n in g.sorted()[:40]: # Sort by Node.weight.
      n.fill = (0, 0.5, 1, 0.75 * n.weight)

   g.export('data', directed=False, weighted=0.6)
Example #5
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom("h3 a")[0].content)
    body = plaintext(dom("#contents")[0].content)
    return [title, body]
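
A small, self-contained illustration of the DOM selectors used above, assuming only pattern.web: dom('h3 a') and dom('#contents') return lists of matching elements, and plaintext() strips the remaining markup.

from pattern.web import DOM, plaintext

html = '<h3><a href="#">Patent title</a></h3><div id="contents">Body <b>text</b></div>'
dom = DOM(html)
print(plaintext(dom('h3 a')[0].content))       # -> 'Patent title'
print(plaintext(dom('#contents')[0].content))  # -> 'Body text'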
Example #6
def google_search(match,targetfile):
    engine = Google(license=None)
    for i in range(1,10):
        for result in engine.search(match, type=SEARCH, start=i):
              print plaintext(result.description)
              targetfile.write(plaintext(result.description))
              targetfile.write('\n')
Example #7
def google_search(match, targetfile):
    engine = Google(license=None)
    for i in range(1, 10):
        for result in engine.search(match, type=SEARCH, start=i):
            print plaintext(result.description)
            targetfile.write(plaintext(result.description))
            targetfile.write('\n')
Example #8
 def searchQueriesWithPatterns(self, queries):
     # Faster, but still need to refine extraction patterns
     results = []
     for q, w1, d in queries:
         print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
         print q
         print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
         for r in self._engine.search(q, count=50):
             # Each result is given a preliminary score based on the weight of the query
             # that retrieved it and the pattern that was matched to it
             for p, w2 in self.getPatterns(q):
                 if d == 'L':
                     m = re.search('(.*?)' + p + '.*\.', plaintext(r.txt),
                                   re.IGNORECASE)
                 else:
                     m = re.search(p + '(.*)', plaintext(r.txt),
                                   re.IGNORECASE)
                 if m:
                     print plaintext(r.txt)
                     print "-------------------------------------------------"
                     print p, "generated", m.group(1)
                     print "================================================="
                     results.append((m.group(1), w1 + w2))
                     break
     return results
Example #9
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom('h3 a')[0].content)
    body = plaintext(dom('#contents')[0].content)
    return [title, body]
Example #10
def get_array(sentences, raw_query, work_range, similarity_threshold):
    reply_unsorted = []
    reply_sorted = []
    sort_linker = {}
    sort_values = []
    index = 0
    for sentence in sentences[work_range['start']:work_range['end']]:
        similarity_value = symmetric_sentence_similarity(sentence, raw_query)
        if similarity_value > similarity_threshold:
            sort_values.append(similarity_value)
            try:  # append the raw sentence; on encoding errors fall back to plaintext() below
                reply_unsorted.append(sentence.encode('utf-8'))  # old
                sort_linker[similarity_value] = sentence.encode('utf-8')  # old
                # `reply_unsorted.append(plaintext(sentence).encode('utf-8'))  # old
                # sort_linker[similarity_value] = plaintext(sentence).encode('utf-8') # old
                # reply_unsorted.append(str(plaintext(sentence)))   # oldest
                # sort_linker[similarity_value] = str(plaintext(sentence))  # oldest
            except (UnicodeDecodeError, UnicodeEncodeError):
                reply_unsorted.append(plaintext(sentence).encode('utf-8'))
                sort_linker[similarity_value] = plaintext(sentence).encode(
                    'utf-8')
            finally:
                index += 1

    sort_values = sorted(sort_values,
                         reverse=False)  # alternatively sort_values.sort()
    reply_sorted = [sort_linker[index] for index in sort_values]

    return reply_sorted
Example #11
def main():
	table = Datasheet()
	tel = ''
	street = ''
	locality = ''
	title = ''
	for i in range(3):
		page = i+1
		url = 	URL("http://torino.paginegialle.it/pgol/4-veterinari/3-torino/p-%s?mr=50" % page)
		print "collecting from %s" % url
		connection = url.open()
		doc = Document( connection.read() )
		items = doc.by_class('item_sx')
		row = []
		for j, item in enumerate(items):
			divs = item.by_class('address')
			try:	
				title = item.by_class('item_head')[0].by_tag('a')[0].content
			except IndexError, e:
				print >> sys.stderr, "%s" % j, e
				pass
			for z, div in enumerate(divs):
				if div != None:
					try:
						street = div.by_class('street-address')[0].content
						locality = div.by_class('locality')[0].content
						tel = div.by_class('tel')[0].by_class('value')[0].content
					except IndexError, e:
						print >> sys.stderr, "%s" % z, e
						pass
					save = "%s, %s %s, %s \n" % ( plaintext(title), plaintext(street).replace(",", ""), plaintext(locality).replace('(TO)', ''), plaintext(tel).replace(",", "") )
					print >> sys.stderr, save
					row.append(save)
Example #12
 def plainTextConverter(self, link, metodo="SinEtiquetas"):
     reload(sys)
     sys.setdefaultencoding('utf-8')
     url = URL(link)
     txtContent = ""
     try:
         if url.mimetype in MIMETYPE_PDF:
             document = open('temp.pdf', 'w')
             document.close()
             download = url.download()
             document = open('temp.pdf', 'a')
             document.write(download)
             document.close()
             #txtContent=os.system('pdf2txt.py temp.pdf')
             txtContent = commands.getoutput('pdf2txt.py temp.pdf')
         else:
             page = URL(url).download(user_agent='Mozilla/5')
             if metodo == "mantenerEtiquetas":
                 txtContent = plaintext(page,
                                        keep={
                                            'title': [],
                                            'h1': [],
                                            'h2': [],
                                            'strong': []
                                        })
             else:
                 txtContent = plaintext(page, keep={})
     except:
         pass
     return txtContent
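
A small sketch of the keep parameter used in plainTextConverter(): by default plaintext() strips every tag, while a dict such as {'h1': []} preserves those tags (with no attributes) in the output; exact whitespace in the result can vary between pattern versions.

from pattern.web import plaintext

html = u'<h1>Heading</h1><p>Body <b>text</b>.</p>'
print(plaintext(html, keep={}))           # every tag stripped
print(plaintext(html, keep={'h1': []}))   # the <h1>...</h1> tags are preserved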
Example #13
    def exctract_values(self, dom, myInfo):
        for a in dom.by_tag("a.denomination-links pj-lb pj-link"
                            ):  # First <a class="title"> in entry.
            myInfo.names.append(self.decode_if_unicode(plaintext(a.content)))

        #adresses
        for a in dom.by_tag("a.adresse pj-lb pj-link"
                            ):  # First <a class="title"> in entry.
            myInfo.adresses.append(self.decode_if_unicode(plaintext(
                a.content)))
            numbers = re.findall(r'\d+',
                                 self.decode_if_unicode(plaintext(a.content)))
            myInfo.districts.append(numbers[-1])

        #telephones
        for a in dom.by_tag("ul.main-contact-container clearfix"
                            ):  # First <a class="title"> in entry.
            contact = []
            for e in a.by_tag("div.tel-zone noTrad"):
                contact.append(
                    self.decode_if_unicode(plaintext(e.content))[-14:])
                '''
				telephone_number=re.findall(r'\d+',self.decode_if_unicode(plaintext(a.content)))
				telephone_string=""
				for s in telephone_number:
					telephone_string=telephone_string+str(s)
				contact.append(telephone_string)
				'''
            myInfo.contacts.append(contact)
Example #14
def loadPage(numPage):
    #Load the content from the given page
    url = URL(URL_YAHOO + str(numPage))
    dom = DOM(url.download(cached=True))
    for row in dom(ROWS_PATH)[1:]:
        #pprint.pprint(plaintext(row(CELLS_PATH)[0].content))
        TICKETS.append({"symbol": plaintext(row(CELLS_PATH)[0].content), "name": plaintext(row(CELLS_PATH)[1].content) })
    pprint.pprint(str(numPage + 1) + "/" + str(NUM_PAGES))
Example #15
def blogsData(blogs,time,db):		
	
	d = feedparser.parse(blogs)	
	
	dicS = {}
	dicS['rssLink'] = blogs
	dicS['titleBlog'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'")
	dicS['descriptionBlog'] = d['feed']['description'].encode('utf-8').replace("\xe2\x80\x99","'")
	dicS['updated'] = d['feed']['updated'].encode('utf-8')	
	db.sources.update({"titleBlog":dicS['titleBlog']},dicS,upsert=True)
		
	for item in d['entries']:
		dic = {}
		dic['titlePost'] = item.title.encode('utf-8').replace("\xe2\x80\x99","'")
		if db.articles.find({"titlePost":dic['titlePost']}).count() == 0:
			dic['titleBlog'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'")
			dic['descriptionBlog'] = d['feed']['description'].encode('utf-8').replace("\xe2\x80\x99","'")		
			dic['link'] = item.link.encode('utf-8')
			dic['date']=datetime.datetime.utcnow()
			if "author" in item:
				dic['author'] = item.author.encode('utf-8').replace("(","").replace(")","")
			#Detag content, define parsing rules for outfits and
			#set nltk process
			if "content" in item:
				text = plaintext(item.content[0]['value'])
				dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ")
				dom = DOM(item.content)
				imagesUrl = []
				for e in dom('img'):
					imagesUrl.append(e.attributes.get('src','').encode('utf-8'))
				dic['images'] = imagesUrl
				dic['entities']=nlpModules.extract_entities_regex(dic['titlePost']+" "+text.encode('utf-8'))
				sentiment=nlpModules.sentimentAnalysis(text.encode('utf-8'))
				dic['sentimentScore']=sentiment[0]
				dic['sentimentCategory']=sentiment[1]
			elif item.summary:
				text = plaintext(item.summary)
				dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ")
				dom = DOM(item.description)
				imagesUrl = []
				for e in dom('img'):
					imagesUrl.append(e.attributes.get('src','').encode('utf-8'))
				dic['images'] = imagesUrl
				dic['entities']=nlpModules.extract_entities_regex(dic['titlePost']+". "+text.encode('utf-8'))
				sentiment=nlpModules.sentimentAnalysis(text.encode('utf-8'))
				dic['sentimentScore']=sentiment[0]
				dic['sentimentCategory']=sentiment[1]
			if item.published:
				dic['published'] = item.published.encode('utf-8')			
			if "tags" in item:
				tags = []
				for tag in  item.tags:
					tags.append(tag.term.encode('utf-8'))
				dic['tags'] = tags	
			#db.articles.insert(dic)
			db.articles.update({"titlePost":dic['titlePost']},dic,upsert=True)
Example #16
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.


    series = []

    # Loops over top 5 imdb series
    for index in dom.by_tag("tr.even detailed")[:5]:

        actors = []
        genres = []
        serie = []

        # Extracts the required fields of the html
        for td in index.by_tag("td.number")[:1]:
            ranking = unicode(plaintext(td.content)) # Extract ranking
        for td in index.by_tag("td.title")[:1]:
            for a in td.by_tag("a")[:1]:
                title = unicode(plaintext(a.content)) # Extract title
            for span in td.by_tag("span.credit")[:1]:
                for a in span.by_tag("a"):
                    actors.append(unicode(plaintext(a.content))) # Extract actors
            for span in td.by_tag("span.genre")[:1]:
                for a in span.by_tag("a"):
                    genres.append(unicode(plaintext(a.content))) # Extract genres
            for span in td.by_tag("span.runtime")[:1]:
                runtime = unicode(plaintext(span.content)) # Extract runtime with minute
                runtime_split = split_string(runtime, ' ') # Split number from minute
                runtime_num = runtime_split[0]

        # append required fields to serie list
        serie.append(title)
        serie.append(ranking)
        serie.append(genres)
        serie.append(actors)
        serie.append(runtime_num)

        # appends serie to series
        series.append(serie)
        
    return series
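
These assignment snippets are normally driven by first downloading the listing page into a DOM (Example #25 below does this inline via a module-level TARGET_URL); a hedged driver sketch, with the query URL left as a placeholder:

from pattern.web import URL, DOM

TARGET_URL = URL('http://www.imdb.com/search/title')  # placeholder; the assignment supplies the real query string
dom = DOM(TARGET_URL.download(cached=True))
for row in extract_tvseries(dom):
    print(row)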
Example #17
def inflect(word, language="italian"):

    inflections = {}
    url = "http://en.wiktionary.org/wiki/" + word.replace(" ", "_") 
    dom = DOM(URL(url).download(throttle=10, cached=True))

    pos = ""

    # Search the header that marks the start for the given language:
    # <h2><span class="mw-headline" id="Italian">Italian</span></h2>

    e = dom("#" + language)[0].parent

    while e is not None: # e = e.next_sibling

        if e.type == "element":

            if e.tag == "hr": # Horizontal line = next language.
                break

            if e.tag == "h3": # <h3>Adjective [edit]</h3>
                pos = plaintext(e.content.lower())
                pos = pos.replace("[edit]", "").strip()[:3].rstrip("ouer") + "-"

            # Parse inflections, using regular expressions.

            s = plaintext(e.content)

            # affetto m (f affetta, m plural affetti, f plural affette)

            if s.startswith(word):

                for gender, regexp, i in (
                  ("m" , r"(" + word + r") m", 1),
                  ("f" , r"(" + word + r") f", 1),
                  ("m" , r"(" + word + r") (mf|m and f)", 1),
                  ("f" , r"(" + word + r") (mf|m and f)", 1),
                  ("m" , r"masculine:? (\S*?)(,|\))", 1),
                  ("f" , r"feminine:? (\S*?)(,|\))", 1),
                  ("m" , r"(\(|, )m(asculine)? (\S*?)(,|\))", 3),
                  ("f" , r"(\(|, )f(eminine)? (\S*?)(,|\))", 3),
                  ("mp", r"(\(|, )m(asculine)? plural (\S*?)(,|\))", 3),
                  ("fp", r"(\(|, )f(eminine)? plural (\S*?)(,|\))", 3),
                  ( "p", r"(\(|, )plural (\S*?)(,|\))", 2),
                  ( "p", r"m and f plural (\S*?)(,|\))", 1)):
                    m = re.search(regexp, s, re.I)
                    if m is not None:
                        # {"adj-m": "affetto", "adj-fp": "affette"}
                        inflections[pos + gender] = m.group(i)

            #print s

        e = e.next_sibling

    return inflections
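
A usage sketch for the Wiktionary scraper above, with the imports it needs; the example word and the returned keys (part-of-speech prefix plus gender code) follow the inline comment in the function and are illustrative only.

import re
from pattern.web import URL, DOM, plaintext

# inflect('affetto', language='Italian')
# -> e.g. {'adj-m': 'affetto', 'adj-f': 'affetta', 'adj-mp': 'affetti', 'adj-fp': 'affette'}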
Example #18
def blogsData(blogs):	

	d = feedparser.parse(blogs)
	db = get_db('dev-itoutfits')
	index_name = 'content'
	conn = get_db_es(index_name)
	
	for item in d['entries']:
		dic = {}		
		dic['titlePost'] = item.title.encode('utf-8').replace("\xe2\x80\x99","'")
		num =db.content.find({"titlePost":dic['titlePost']}).count()
		if num == 0:
			#print dic['titlePost'], num
			dic['titlePostUrl'] = item.title.encode('utf-8').replace("\xe2\x80\x99","'").replace(" ", "")
			dic['titleBlogUrl'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'").replace(" ", "")
			dic['rssLink'] = blogs
			dic['titleBlog'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'")
			dic['updated'] = d['feed']['updated'].encode('utf-8')		
			dic['descriptionBlog'] = d['feed']['description'].encode('utf-8').replace("\xe2\x80\x99","'")				
			dic['link'] = item.link.encode('utf-8')
			dic['date']=datetime.datetime.utcnow()
			#Detag content, define parsing rules for outfits and
			#set nltk process
			if item.content[0]:
				text = plaintext(item.content[0]['value'])
				dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ")
				dom = DOM(item.content)
				imagesUrl = []
				for e in dom('img'):
					imagesUrl.append(e.attributes.get('src','').encode('utf-8'))
				dic['images'] = imagesUrl
			elif item.description:
				text = plaintext(item.description)
				dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ")
				dom = DOM(item.description)
				imagesUrl = []
				for e in dom('img'):
					imagesUrl.append(e.attributes.get('src','').encode('utf-8'))
				dic['images'] = imagesUrl
			if item.published:
				dic['published'] = item.published.encode('utf-8')
			
			if "tags" in item:
				tags = []
				for tag in  item.tags:
					tags.append(tag.term.encode('utf-8'))
				dic['tags'] = tags	
			try:
				#print dic		
				db.content.update({"titlePost":dic['titlePost']},dic,upsert=True)
				#type_name="article"
				#conn.index(json.dumps(dic), index_name, type_name)
			except ValueError:
				pass
Example #19
def extract_tvseries(dom):
    '''
    Extract a list of highest rated TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Rating
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''
    
    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RATED TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.
    tv_series =[]
    series_list = []
    genre_list =[]
    runtime_list = []
    rating_list= []
    table = []
    actor_row = []
    actors_list = []
    
    
    for e in dom.get_elements_by_tagname("div.lister-item-content")[:20]: # Top 20 entries
        for h3 in e('h3 a'):
            series_list.append(encode(plaintext(h3.content)))
            
        for p in e.get_elements_by_tagname("p.text-muted"):
            for span in e.get_elements_by_tagname("span.runtime"):     # Runtime
                runtime_list.append(plaintext(span.content)[:2])
            for span in e.get_elements_by_tagname("span.genre"):        # Genre
                genre_list.append(plaintext(span.content))
                
        for div in e.get_elements_by_tagname("div.ratings-bar"):
            for div in e.get_elements_by_tagname("div.inline-block ratings-imdb-rating"):
                rating_list.append(plaintext(div.content))                            #rating
        
        for p in e('p a'):      # for loop for scraping actors
            s = " "
            actor_row.append(plaintext(p.content))  # actor names are separated strings, this joins the string into one element for the table
            
        actors_list.append(s.join(actor_row))
        actor_row [:] = []
         

    for series, rating, genre, actors, runtime in zip(series_list, rating_list, genre_list, actors_list, runtime_list):
        table = (series, rating, genre, actors, runtime)
        tv_series.append(table)         # creating the table with all information
    
    return tv_series
Example #20
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.

    tvseries = []

    for table_row in dom.by_tag("tr")[1:51]: # The first table row is redundant, so start from index 1.
        
        # Default values in case something is missing.
        title = "-"
        rating = "-"
        actors = "-"
        genre = "-"
        runtime = "-"

        for table_cell in table_row.by_tag("td.title"):
            for a in table_cell.by_tag("a")[:1]:
                # Obtain the title.
                title = unicode(plaintext(a.content))
            for rating_span in table_cell.by_tag("span.rating-rating"):
                # Obtain the rating.
                rating = unicode(plaintext(rating_span.content))
                rating = rating.split("/")[0]
            for credit_span in table_cell.by_tag("span.credit"):
                # Obtain the actors/actresses.
                actors = unicode(plaintext(credit_span.content))
                actors = actors.split(": ")[1]
            for genre_span in table_cell.by_tag("span.genre"):
                # Obtain the genre(s).
                genre = unicode(plaintext(genre_span.content))
                genre = ", ".join(genre.split(" | "))
            for runtime_span in table_cell.by_tag("span.runtime"):
                # Obtain the runtime.
                runtime = unicode(plaintext(runtime_span.content))
                runtime = runtime.split(" ")[0]

            tvseries_item = [title, rating, genre, actors, runtime]
            tvseries.append(tvseries_item)

    return tvseries
Example #21
def get_craigslist_postings():
    postings = json.load(open('results.json', 'r'))

    # filter cross postings
    filtered = {}
    for boat_name, posts in postings.items():
        founds = []
        urls = []
        for post in posts:
            _hash = post.split('/')[-2]
            if _hash not in founds:
                founds.append(_hash)
                urls.append(post)
        filtered[boat_name] = urls


    for boat_name, posts in filtered.items():
        for post in posts:
            r = requests.get(post)
            w = web.Element(r.content)
            body = w.by_id('postingbody')
            content = body.content
            links = body.by_tag('a')
            if links:
                content = content.replace(links[0].content, '')
            c = w.by_class("print-qrcode-label")
            if c:
                content = content.replace(c[0].content, '')
            
            content = web.plaintext(content)

            formatted_attrs = {}
            attrs = w.by_class('attrgroup')[0].by_tag('span')
            for attr in attrs:
                values = web.plaintext(attr.content).split(': ')
                if len(values) == 2:
                    key = values[0].replace(' ', '_').replace('/','')
                    value = values[1]
                    formatted_attrs[key] = value
            price = web.plaintext(w.by_class('price')[0].content.replace('$',''))
            post_data = {
                'body': {
                    'source': content,
                    'link': post,
                    'attrs': formatted_attrs,
                    'boat': boat_name,
                    'price': price
                },
                "index": "listings",
                'doc_type': 'listing'
            }
            res = es.index(**post_data)
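
A sketch of the surrounding setup this Craigslist scraper seems to assume: requests and pattern.web for fetching and parsing, plus an Elasticsearch client bound to es for the final index() call; the names and the results.json layout are the author's.

import json
import requests
from pattern import web                     # used above as web.Element / web.plaintext
from elasticsearch import Elasticsearch     # assumption: 'es' is an elasticsearch-py client

es = Elasticsearch()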
Example #22
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''
    tv_series = []
    
    
    for e in dom.by_tag("td.title")[:250]: #amount of titles
        tv_serie = []
        for a in e.by_tag("a")[:1]: #search for title
            title = plaintext(a.content)
            tv_serie.append(title)
            print title
            
            
        for value in e.by_class("value"): #search for ranking
            value = plaintext(value.content)
            tv_serie.append(value)
            
        for genre in e.by_class("genre"): #search for genre
            genre = plaintext(genre.content)
            genre = genre.replace(' | ',',')
            tv_serie.append(genre)
            
        for credit in e.by_class("credit"): #search for actors/actresses
            credit = plaintext(credit.content)
            credit = credit.replace('With: ','')
            credit = credit.replace(', ',',')
            tv_serie.append(credit)
            
        for runtime in e.by_class("runtime"): #search for runtime
            runtime = plaintext(runtime.content)
            runtime = runtime.replace(' mins.','')
            tv_serie.append(runtime)
        tv_series.append(tv_serie)




    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.

    return tv_series  # replace this line as well as appropriate
Example #23
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.
    
      
    komma = ", "    # comma to put between actors and genres
    imdb = []       # imdb as a list
    
    for data in dom.by_tag("td.title"):  # look up each series' data in the table
        gegevens = []
        title = data.by_tag("a")[0]                    # take the first <a> tag, which holds the title
        title.content.encode('ascii', 'ignore') # avoids a unicode error
        gegevens.append(title.content)
                
        rating = data.by_tag("span.value")[0]        # look up the rating
        gegevens.append(plaintext(rating.content))  # add the rating to the record
                
        for genres in data.by_tag("span.genre"):
            soort = []                                  # create a list for the genres
            for genre in genres.by_tag("a"):            # look up each genre
                soort.append(plaintext(genre.content))  # add the genre to the list
            seq = komma.join(soort)                     # join the list into a string
            gegevens.append(seq)                        # add the genre string to the record
                
        for actors in data.by_tag("span.credit"):
            acteurs = []                                # create a list for the actors
            for actor in actors.by_tag("a"):            # look up each actor
                actor.content.encode('ascii', 'ignore')         # avoids a unicode error
                acteurs.append(plaintext(actor.content))# add each actor found to the list
            sq = komma.join(acteurs)                    # join the list into a string
            gegevens.append(sq)                         # add the actor string to the record
                
        runtime = data.by_tag("span.runtime")[0]    # look up the runtime
       
        gegevens.append(runtime.content.partition(' ')[0]) # add the runtime (number only) to the record
               
        imdb.append(gegevens)   # append this record to the imdb list
            
    return imdb # return the list of records
Example #24
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # Create list for series
    series = []

    # Loop over the series and substract the needed information
    for e in dom.by_tag(".title"):
        serie = []

        # Substract title of the serie
        serie.append(plaintext(e.by_tag('a')[0].content).encode('utf-8'))

        # Substract ranking of the serie
        serie.append(plaintext(e.by_tag(".value")[0].content).encode('utf-8'))

        # Substract genres of the serie
        genres = []
        for a in e.by_tag(".genre")[:1]:
            for b in a.by_tag("a"):
                genres.append(plaintext(b.content).encode('utf-8'))
        serie.append(', '.join(genres))

        # Substract actors of the serie
        actors = []
        for a in e.by_tag("span.credit")[:1]:
            for b in a.by_tag("a"):
                actors.append(plaintext(b.content).encode('utf-8'))
        serie.append(', '.join(actors))

        # If runtime is known: append runtime of serie
        # If runtime is not known: set runtime to zero
        if e.by_tag('.runtime'):
            serie.append(
                plaintext(e.by_tag('.runtime')[0].content).replace(
                    ' mins.', "").encode('utf-8'))
        else:
            serie.append('0')
        series.append(serie)

    # Returns a list of series
    return series
Example #25
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RANKING TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.
    dom = DOM(TARGET_URL.download(cached=True))
    # Get top 50 results
    for e in dom.by_tag("td.title"):
        # get title
        for a in e.by_tag("a")[:1]:
            title = plaintext(a.content)
            print title
            print

        # get ranking
        for td in e.by_tag("span.value")[:1]:
            ranking = plaintext(td.content)
            print ranking
            print

        # get genre
        for span in e.by_tag("span.genre")[:1]:
            genre = plaintext(span.content)
            print genre
            print

        # get actors/actresses
        for span in e.by_tag("span.credit")[:1]:
            actors = plaintext(span.content)
            print actors
            print

        # get runtime (number)
        for span in e.by_tag("span.runtime")[:1]:
            runtime = plaintext(span.content)
            print runtime
            print

        # create a dictionary of all the retrieved info
        showlist[e] = {title, ranking, genre, actors, runtime}
Example #26
 def __init__(self, data, url="", contenidoBd=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(
             url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         if (contenidoBd != ""):
             self.contenidoConEtiquetas = contenidoBd
             self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
         else:
             self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))),
                       stemmer=PORTER)
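
The class methods above and below mix pattern's web, en and vector layers; a sketch of the imports they appear to assume (UrlToPlainText is the project's own helper, apparently the class that owns plainTextConverter() in Example #12):

from pattern.web import URL, plaintext
from pattern.en import parse, Sentence
from pattern.vector import count, words, PORTER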
Example #27
 def descargarContenido(self, url):
     """Download the content of web documents, whether HTML pages or PDFs."""
     try:
         unaUrl = URL(url)
         if "pdf" in extension(unaUrl.page):
             return self.descargarPDF(unaUrl)
         else:
             return plaintext(unaUrl.download())
     except Exception as e:
         try:
             return plaintext(self.urlLibDescarga(url))
         except Exception as e:
             print "except " + str(e)
             print url
Example #28
def extract_tvseries(dom):
    '''
    Extract a list of highest rated TV series from DOM (of IMDB page).
    Each TV series entry should contain the following fields:
    - TV Title
    - Rating
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    series = []

    # The parent of all fields
    super_parent = dom.by_tag("div.lister-item-content")
    for item in super_parent:

        # The genre(s).
        genre = plaintext(item.by_tag("span.genre")[0].content)
        print genre

        # The runtime
        runtime = plaintext(item.by_tag("span.runtime")[0].content)
        print runtime

        # The rating.
        rating = plaintext(item.by_tag("span.lister-item-index")[0].content)
        print rating

        # The title.
        title = plaintext(
            item.by_tag("a")[0].content.encode('ascii', 'ignore'))
        print title

        # The actors
        actors = plaintext(
            item.by_tag("p.")[2].content.encode('ascii', 'ignore'))
        actors = actors[6:]
        print actors

        print "_______________________________"
        series.append([title, rating, genre, actors, runtime])

    # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE
    # HIGHEST RATED TV-SERIES
    # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE
    # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT.

    return series  # replace this line as well as appropriate
Example #29
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''
    # Trying to make a replacer for some symbols
    #replaceLetters = ["î","û","ô","ê","â"]

    # Making the list in which we store all the data
    showData = []

    for e in dom.by_tag(
            "td.title")[:50]:  # This lets us select the individual shows
        # Here we store the data for every individual show, so we can add it to the overall list
        show = []
        # We now get the data for individual shows, and we add every attribute to our list
        for titles in e.by_tag("a")[:1]:  # Title of a series
            title = plaintext(titles.content)
            #            for ch in replaceLetters:
            #                if ch in title:
            #                    title = title.replace(ch,'')
            show.append(title)
        for ratings in e.by_class("value"):  # Rating for a series
            rating = plaintext(ratings.content)
            show.append(rating)
        for genres in e.by_class("genre"):  # Genre of a series
            genre = plaintext(genres.content)
            genre = genre.replace(" | ", ',')  # Cleaning our output
            show.append(genre)
        for names in e.by_class("credit"):  # Main actors of a series
            name = plaintext(names.content)
            name = name.replace("With: ", '')  # Cleaning our output
            show.append(name)
        for runtimes in e.by_class("runtime"):  # Runtime of a series
            runtime = plaintext(runtimes.content)
            runtime = runtime.replace("mins.", '')  # Cleaning our output
            show.append(runtime)
        # Adding all the data of this individual show to the total list
        showData.append(show)

    # Returning our complete list of all the data for the shows
    return showData
Example #30
def extract_tvseries(dom):
    # dom = DOM (url.download(cached=True))

    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''
    # upcoming 2 for loops implemented from the 12-dom.py example in the pattern library

    # Create lists and strings to store the data

    tvseries = []


    # In the IMDB HTML(tree), look for the td class 'title' which contains all the needed information
    for e in dom.by_tag("td.title")[:50]:
        Movie = []
        # In the branch 'a' get the title of the movie
        for title in e.by_tag("a")[:1]:
            # Add the title name to the string 'Titles'
            title = plaintext(title.content).encode('utf-8')
            Movie.append(title)
        # In the branch 'ranking', get the ranking of the movie
        for ranking in e.by_tag("div.user_rating"):
            # Add the rankings to the string 'Rankings'
            Movie.append(str(plaintext(ranking.content))[11:-4])
        # Create list with Genres, comma separated (string)
        for genre in e.by_tag("span.genre"):
            Movie.append(str(plaintext(genre.content)))
            # Replace separator | with ,
            # genre = genre.replace('|', ',')
        # Create list with Actors, comma separated (string)
        for actor in e.by_tag("span.credit"):
            actor = plaintext(actor.content).encode('utf-8')[6:]
            Movie.append(actor)
        # Create list with Runtime (numeric)
        for time in e.by_tag("span.runtime"):
            Movie.append(int(str(plaintext(time.content))[:2]))

        tvseries.append(Movie)

    return tvseries
Example #31
 def get_title_index(self, dom, plain_text):
     title_el = dom('title')
     if title_el:
         # get title text
         title = plaintext(title_el[0].source)
         # usually title in <title> tag is a little bit different from
         # actual article title - publishers tend to add website name or author name either to the
         # end or to the beginning of the title, for example:
         #
         # Insurers: Despite deadline, Obamacare glitches persist - CNN.com
         # India's Dating Sites Skip Straight to the Wedding - P. Nash Jenkins - The Atlantic
         #
         # But it will be separated by either : or -, so split the title on such
         # separators and keep the longest part.
         title_parts = re.split('\:|\-|\|', title)
         # find title in tagless text
         # part_idx = 1 if len(title_parts) > 2 else 0
         part_idx = 0
         part_len = len(title_parts[0])
         for idx, part in enumerate(title_parts):
             if len(part) > part_len:
                 part_idx = idx
                 part_len = len(part)
         title_idx = [m.start() for m in re.finditer(title_parts[part_idx].strip(), plain_text)]
         return title_idx
     return []
Example #32
    def getTwits(self, keyWord):
        if len(keyWord) == 0:
            keyWord = u'"gündem"'
            self.lineEdit.setText(keyWord)
        self.alText = u''
        try:
            tList = self.twitter.search(keyWord,
                                        start=self.prevId,
                                        count=10,
                                        cached=False)

        except:
            message = "Twitter Aram Limiti Lütfen Biraz Bekleyin"
            QtGui.QMessageBox.information(self.dialog, "Information",
                                          "Python rocks!")

        for tweet in tList:
            self.listWidget.addItem(
                QtGui.QListWidgetItem(cleanTweet(tweet.text)))
            self.twIds.append(tweet.id)
            self.listWidget.setCurrentRow(self.listWidget.count() - 1)
            tweet.text = self.filterRT(tweet.text)
            tweet.text = self.filterLink(tweet.text)
            self.alText = self.alText + plaintext(tweet.text) + u' '
            self.prevId = tweet.id
Example #33
def get_artist_docs(name):

    default_dir = basedir + name
    rap_docs = ""

    # get a list of all the files in default dir
    for f in os.listdir(default_dir):
        # go to that dir
        os.chdir(default_dir)
        # open the file
        fi = open(f, 'r')
        # print "reading " + f
        # slurp
        page = fi.read()

        # parse the raw HTML into a DOM tree
        dom = DOM(page)

        # we look at the page and get that the thing we want is in the .lyrics div.
        if dom and dom('.lyrics'):
            lyrics = dom('.lyrics')[0]
        else:
            continue

        p = plaintext(lyrics.content)
        rap_docs += p

    return rap_docs
Example #34
def search_bbc(result, term, howmany, rurl, page):
    # convenient text only search
    if not rurl == None and "www.bbc.co.uk" in rurl:
        query = rurl  # for pagination support
    else:
        query = "http://www.bbc.co.uk/search/news/" + term.lower() + "?text=on"

    dom = get_dom(query)

    if "BBC" not in result:
        result["BBC"] = []

    for a in dom.by_class("title"):
        title = plaintext(a.content)
        link = a.attributes["href"]
        content = heuristic_scrape(link)
        score = sentiment(content)
        if len(result["BBC"]) < howmany:
            result["BBC"].append((title, "NEG" if score < 0 else "POS"))
            print "BBC", title, link, "NEG" if score < 0 else "POS"
            if len(result["BBC"]) % 20 == 0 and len(result["BBC"]) < howmany:  # 20 articles per page
                print "flip"
                next_page = (
                    "http://www.bbc.co.uk/search/news/"
                    + term
                    + "?page="
                    + str(page + 1)
                    + "&text=on&dir=fd&news="
                    + str(len(result["BBC"]) + 1)
                    + "&news_av=1"
                )
                return search_bbc(result, term, howmany, next_page, page + 1)
        else:
            break
    return result
Example #35
def load_support():
    '''
    Loads support / oppose data from the political speeches dataset into memory as two lists: one of text and
    the other of binary labels (1 = support, 0 = oppose).
    
    This function combines the presorted testing and training sets into one group.
    
    Output:
        quotes (text data), support (binary data)
    
    '''
    
    quotes, support = [], []
    
    for path in ['../Data/Training_Data/support_oppose/data_stage_one/training_set/', 
                 '../Data/Training_Data/support_oppose/data_stage_one/test_set/']:
        # load all quotes
        for filename in os.listdir(path):
            if(re.search(u'.txt', filename) is not None):
                with open(path+filename) as f:
                    data="".join(line.rstrip() for line in f)
                quotes.append(plaintext(data))
                
                # determine if last letter is a Y or N for support binary data
                if filename.split('.txt')[0][-1] == 'Y':
                    support.append(1)
                else:
                    support.append(0)
    
    return quotes, np.array(support)
Example #36
def obtain_names(div_element):
    '''
    Scrapes a <div> element from a movie's web page. The <div> element must contain names.

    On a movie's web page, names of directors, writers, and actors are all in a 
    separate <div>, for example (for directors):
    -------------------------------------------------------------------------------
    <div class="txt-block" itemprop="director" itemscope itemtype="http://schema.org/Person">
        <h4 class="inline">Director:</h4>
        <a href="/name/nm0001104/?ref_=tt_ov_dr"
        itemprop='url'>
            <span class="itemprop" itemprop="name">Frank Darabont</span>
        </a>
    </div>
    -------------------------------------------------------------------------------
    The snippet above is from the web page of the movie 'The Shawshank Redemption'. 

    Args:
        div_element: pattern.web.Element instance representing a <div>
                    containing names.

    Returns:
        A string with all the names in the <div>, semicolon separated if several.
    '''

    names_list = []
    for span in div_element.by_tag("span.itemprop"):
        name = unicode(plaintext(span.content))
        names_list.append(name)
    return ";".join(names_list)
Example #37
    def research_on(self, what, where):

        url = URL(
            "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
            what + "&ou=" + where + "&proximite=0")
        dom = DOM(url.download(cached=True))

        for a in dom.by_tag("div.main-title pj-on-autoload "):
            for e in a.by_tag("span.denombrement"):
                number_of_results = int(
                    self.decode_if_unicode(plaintext(e.content))[:3])

        number_of_page_results = number_of_results / 20
        if (number_of_results % 20 > 0):
            number_of_page_results += 1

        self.exctract_values(dom, self.myInfo)

        for i in range(2, number_of_page_results + 1):
            url = URL(
                "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" +
                what + "&ou=" + where + "&proximite=0+"
                "&page=" + str(i))
            dom = DOM(url.download(cached=True))
            self.exctract_values(dom, self.myInfo)

        self.myInfo.sort_and_merge()
Example #38
def process_EN():
  '''Processes the English RSS feeds, locates entities and returns a list of (URI, DICT) tuples, where DICT contains the entry's title, URL, plain text and list of entities.'''
  EN_RSS_LIST = [
    (u'Lifehacker', 'http://feeds.gawker.com/lifehacker/vip'),
    (u'The Verge', 'http://www.theverge.com/rss/index.xml'),
    (u'Zen Habits', 'http://feeds.feedburner.com/zenhabits?format=xml')
    ]
  items = []
  for feed in EN_RSS_LIST:
    feedlist = []
    # fetch the feed
    for result in reader.search(feed[1])[:10]:
      clean_text = plaintext(result.text)
      response = alchemyapi.entities('text', result.text)
      # parse the entities
      entities = []
      for entity in response['entities']:
        if entity.has_key('disambiguated'):
          dbpedia_uri = entity['disambiguated']['dbpedia']
        else:
          dbpedia_uri = None
        entities.append((entity['text'], dbpedia_uri))

      feedlist.append(dict(title=result.title, url=result.url, text=clean_text, entities=entities))
    items.append(dict(site=feed[0], feedlist=feedlist))
  return items
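
process_EN() depends on two objects not shown above: a feed reader and an AlchemyAPI client. A hedged sketch of that setup, assuming pattern.web's Newsfeed as the reader and the old alchemyapi Python SDK for entity extraction:

from pattern.web import Newsfeed, plaintext
from alchemyapi import AlchemyAPI   # assumption: the retired alchemyapi_python SDK

reader = Newsfeed()
alchemyapi = AlchemyAPI()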
Example #39
def get_artist_docs(name):

    default_dir = basedir + name
    rap_docs = ""

    # get a list of all the files in default dir

    for f in os.listdir(default_dir):
        print f
        # go to that dir
        os.chdir(default_dir)
        # open the file
        fi = open(f, 'r')
        # print "reading " + f
        # slurp
        page = fi.read()

        # parse the raw HTML into a DOM tree
        dom = DOM(page)

        # we look at the page and get that the thing we want is in the .lyrics div.
        if dom and dom('.lyrics'):
            lyrics =  dom('.lyrics')[0]
        else:
            continue

        p = plaintext(lyrics.content)
        rap_docs += p

    return rap_docs
Example #40
def gettweets(searchterms):
    tweetlist = []
    from pattern.web import Twitter, plaintext
    twitter = Twitter(language='en')
    for tweet in twitter.search(searchterms, cached=False):
        tweetlist.append(plaintext(tweet.text))
    return tweetlist
Example #41
def search_bbc(result, term, howmany, rurl, page):
    #convenient text only search
    if not rurl == None and 'www.bbc.co.uk' in rurl:
        query = rurl #for pagination support
    else:
        query = 'http://www.bbc.co.uk/search/news/'+term.lower()+'?text=on'

    dom = get_dom(query)

    if 'BBC' not in result:
        result['BBC'] = []
    
    for a in dom.by_class('title'):
        title = plaintext(a.content)
        link = a.attributes['href']
        content = heuristic_scrape(link)
        score = sentiment(content)
        if len(result['BBC']) < howmany:
            result['BBC'].append((title, 'NEG' if score < 0 else 'POS'))
            print 'BBC', title, link, 'NEG' if score < 0 else 'POS'
            if len(result['BBC']) % 20 == 0 and len(result['BBC']) < howmany: #20 articles per page
                print 'flip'
                next_page = 'http://www.bbc.co.uk/search/news/'+term+ \
                            '?page='+str(page+1)+'&text=on&dir=fd&news='+ \
                            str(len(result['BBC'])+1)+'&news_av=1'
                return search_bbc(result, term, howmany, next_page, page+1)
        else:
            break
    return result     
Example #42
def get_around_words(el, text, count=10):
    if el is None:
        return []

    parent = el.parent
    before = []
    after = []
    max_depth = 10
    source = ''
    idx = -1
    while parent is not None and max_depth:
        source = plaintext(parent.source, linebreaks=1).lower().replace('\n', ' ')
        idx = source.find(text)
        after, before = get_words_before_after(source, after, before, idx, text)
        if len(before) >= count and len(after) >= count:
            break
        parent = parent.parent
        max_depth -= 1
    # substitute date and time with recognizable tokens
    norm_source = re.sub(TIME_RE, 'TIME ', source)
    norm_source = re.sub(POINT_IN_TIME_RE, 'TIME ', norm_source)
    norm_source = re.sub(DATE_RE, 'DATE ', norm_source)
    after, before = get_words_before_after(norm_source, after, before, idx, text)

    return before[-count:], after[:count]
Example #43
 def __init__(self,url, query):
     super(documento, self).__init__()
     self.url = url
     self.urlObjet = URL('http://www.clips.ua.ac.be')
     self.html = self.urlObjet.download(user_agent='Mozilla/5.0')
     self.contenido = plaintext(self.html, keep=[], replace=blocks, linebreaks=2, indentation=False)
     self.elemento = Element(self.html)
Example #44
def getWordList(url):
    word_list = []
    #raw data
    #source_code = requests.get(url)
    #convert to text
    #plain_text = source_code.text
    #lxml format
    # soup = BeautifulSoup(plain_text,'lxml')

    htmlString = get(url).text
    webText = plaintext(htmlString)

    #find the words in paragraph tag
    #for text in webText.findAll('p'):
    #if text.text is None:
    #    continue
    #content
    # content = text.text
    #lowercase and split into an array
    words = webText.lower().split()

    #for each word
    for word in words:
        #remove non-chars
        cleaned_word = clean_word(word)
        #if there is still something there
        if len(cleaned_word) > 0:
            #add it to our word list
            word_list.append(cleaned_word)

    return word_list
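
getWordList() calls two names defined elsewhere in that project: requests' get() and a clean_word() helper. A minimal sketch under the assumption that clean_word() simply drops non-letter characters (the original may differ):

from requests import get
from pattern.web import plaintext

def clean_word(word):
    # hypothetical stand-in for the project's clean_word(): keep letters only
    return ''.join(ch for ch in word if ch.isalpha())

# word_list = getWordList('https://example.com/')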
Example #45
 def get_elements_with_short_text(self, dom, tag_name, plain_text, title_idx):
     """
     Get all potential candidates elements by tagName. Filter out elements with
     more than 9 words and with distance to title more than 300 characters
     """
     elements = []
     for el in dom.by_tag(tag_name):
         l = 0
         el_plain_text = plaintext(el.source, keep=TAGS)
         title_dist = self.get_distance_to_title(tag_name,
                                                 el_plain_text,
                                                 plain_text, title_idx)
         is_valid = False
         for tdist in title_dist:
             if tdist < 300 and len(el_plain_text) < 300:
                 is_valid = True
                 break
         if not is_valid:
             continue
         for child in el.children:
             if issubclass(child.__class__, Text):
                 l += len(filter(len, child.source.strip().split(' ')))
                 if l > 9 or l == 0:
                     break
         if l <= 9:
             elements.append(el)
     return elements
Example #46
def search_cnn(result, term, howmany):
    RESULTS_URL = 'http://searchapp.cnn.com/cnn-search/query.jsp?query='+term+ \
              '&ignore=article|mixed&start=1&npp='+str(howmany)+'|'+str(howmany)+'|'+str(howmany)+'&s=all&type=all'+ \
              '&sortBy=date&primaryType=mixed&csiID=csi1'

    print "Getting CNN JSON blob"
    dom = get_dom(RESULTS_URL)
    print "Blob retrieved"
    
    #I do not understand this bizarre data structure
    for jscode in dom.by_id('jsCode'):
        search = json.loads(jscode.source())
        
    result['CNN'] = []
    results = search['results']
    titles = []
    for resultset in results:
        for article in resultset:
            title = plaintext(article['title'])
            link = article['url']
            if 'http://' in link and not title in titles: #exludes video, dupes
                content = heuristic_scrape(link)
                score = sentiment(content)
                if len(result['CNN']) < howmany:
                    result['CNN'].append((title, 'NEG' if score < 0 else 'POS'))
                    titles.append(title)
                    print 'CNN', title, link, 'NEG' if score < 0 else 'POS'
                else:
                    print 'too many'
                    break
    return result
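A hedged usage sketch of search_cnn; get_dom, heuristic_scrape and sentiment are helpers defined elsewhere in that project, so this only illustrates the calling convention and the shape of the result:

results = search_cnn({}, 'economy', 5)   # hypothetical query term and result count
for title, polarity in results['CNN']:
    print(title + ' ' + polarity)        # polarity is 'POS' or 'NEG'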
Ejemplo n.º 47
0
def google_search(targetword, itemlist, targetpath):
    resultnum = 0
    engine = Google(license=None)
    file = codecs.open(targetpath, 'a', 'utf-8')
    patt = ur'\w+'  # word tokens (the original ur'\W+' matched only non-word characters)
    for item in itemlist:
        for i in range(1, 5):
            for result in engine.search(item, type=SEARCH, start=i):

                url = URL(result.url)
                text = url.download(unicode=True)

                text = plaintext(text)
                text = correctPersianString(text)
                text = text.replace('\n', ' ')
                lines = text.split('.')
                for line in lines:
                    if targetword in line:
                        # strip punctuation before extracting tokens
                        for punc in punclist:
                            if punc in line:
                                line = line.replace(punc, ' ')

                        match = re.findall(patt, line, re.UNICODE)
                        output = ' '.join(match)
                        resultnum += 1

                        print output
                        file.write(output)
                        file.write('\n')
    print str(resultnum) + " found in web"
    file.close()
Ejemplo n.º 48
0
def plain(text):
    '''
    Converts text to plain text, strips accents via NFKD normalization,
    and returns an ASCII-only string.
    '''
    return unicodedata.normalize('NFKD',
                                 plaintext(text.encode('utf-8'))).encode(
                                     'ascii', 'ignore')
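A minimal sketch of what plain() produces, assuming Python 2 and the unicodedata / pattern.web imports used by the snippet:

print(plain(u'<p>Caf\xe9 &amp; cr\xe8me</p>'))  # roughly: 'Cafe & creme' (tags stripped, entities decoded, accents dropped)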
Ejemplo n.º 49
0
def obtain_data(url):
	'''
	Scrape the Wikipedia page.

	Args:
		url: pattern.web.URL instance pointing to the Wikipedia page

	Returns:
		A list of lists, where each sublist represents a data point. Each
		sublist contains two elements: a string with the name of the country,
		and a string with the size of the population of that country. 
	'''

	# Create a DOM of the URL.
	html = url.download(cached=True)
	dom = DOM(html)

	data_points = []

	for countries_table in dom.by_tag("table.wikitable sortable"):
		for table_row in countries_table.by_tag("tr")[1:]:	# The first row is the header, so start at index 1.
			table_row_content = []
			# Obtain the content of the row.
			for table_row_cell in table_row.by_tag("td"):
				table_row_cell_content = unicode(plaintext(table_row_cell.content))
				table_row_content.append(table_row_cell_content)
			# Obtain the country name and the population size.
			country = table_row_content[1].split("[")[0].split(" (")[0]
			population = "".join(table_row_content[2].split(","))
			data_point = [country, population]
			data_points.append(data_point)

	return data_points
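A hedged usage sketch of obtain_data; the Wikipedia page is an assumption, and the cell indices inside obtain_data expect the country name in the second column and the population in the third:

from pattern.web import URL

url = URL('https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)')  # assumed page
for country, population in obtain_data(url)[:5]:
    print(country + ' ' + population)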
Ejemplo n.º 50
0
    def _parse_item(self, item_element):
        item = {}
        for element_name in ['title', 'url', 'description', 'content']:
            element = item_element.find(
                self._item_paths[self.feed_type][element_name])
            if element is not None and element.text is not None:
                item[element_name] = plaintext(element.text)
            else:
                item[element_name] = None

        pubdate_element = item_element.find(
            self._item_paths[self.feed_type]['pubdate'])
        if pubdate_element is not None:
            item['pubdate'] = dateparser.parse(pubdate_element.text)
        else:
            item['pubdate'] = None

        enclosure_element = item_element.find(
            self._item_paths[self.feed_type]['enclosure'])
        if enclosure_element is not None:
            item['enclosure'] = enclosure_element.attrib
        else:
            item['enclosure'] = None

        return item
Ejemplo n.º 51
0
def process_ES():
  '''Processes the Spanish RSS feeds, locates named entities, and returns a list of dicts (one per feed), each with the site name and a feedlist of entries holding the title, URL, plain text and list of (entity, Wikipedia URI) tuples.'''
  ES_RSS_LIST = [
    (u'Menéame', 'http://meneame.feedsportal.com/rss'),
    (u'Naukas', 'http://feeds.feedburner.com/naukas'),
    (u'Yuri', 'http://www.lapizarradeyuri.com/feed/')
  ]
  items = []
  # fetch the feed
  for feed in ES_RSS_LIST:
    feedlist = []
    for result in reader.search(feed[1])[:10]:
      clean_text = plaintext(result.text).encode('utf-8')
      params = urllib.urlencode({
        'key': '4c4ded0a7c279c9f747a8f750e223363', # topic extraction
        'of': 'json',
        'lang': 'es',
        'txt': clean_text,
        'tt': 'a',
        'dm': '5'
      })
      response = json.loads(urllib2.urlopen(url, params).read())
      # parse the entities
      entities = []
      if response['status']['msg'] == 'OK':
        for e in response['entity_list']:
          if e.has_key('semld_list'):
            for uri in e['semld_list']:
              if 'es.wikipedia' in uri:
                entities.append((e['form'], uri))
                break

      feedlist.append(dict(title=result.title, url=result.url, text=clean_text, entities=entities))
    items.append(dict(site=feed[0], feedlist=feedlist))
  return items
Ejemplo n.º 52
0
    def htmlSearch(self, html, url):
        logger.debug(u"htmlSearch URL : %s" % url)
        logger.debug(u"html : %s" % html[:20])
               
        s = html.lower()
        s = plaintext(s)
        s = parsetree(s)
        
        # self.logSentences(s)

        # Search the parse tree for consecutive nouns (a POS pattern, not a regular expression)
        p = r'(NN)+'
        q = search(p, s)

        # self.logPOS(q)

        # Iterate over all the words in the POS
        logger.debug(u"  q.Length=%d" % len(q))
        logger.debug(u"  q[]=%s" % q)
        
        self.g, self.urlConcepts, self.wordConcepts = self.addNodes(self.g, q, url,
                                                                    self.urlConcepts,
                                                                    self.wordConcepts)

        return self.urlConcepts, self.wordConcepts
Ejemplo n.º 53
0
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    summary = dict(
        (v[0], v)
        for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary) - (k):])
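A hedged usage sketch of summarize(); the URL is only illustrative, and LSA, stopwords and ignore_characters come from the surrounding project:

# summarize an article fetched from a URL into 3 sentences
print(summarize(url='https://en.wikipedia.org/wiki/Latent_semantic_analysis', k=3))

# or summarize raw text passed as a query
print(summarize(query='First point. Second point. Third point. Fourth point.', k=2))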
Ejemplo n.º 54
0
    def get_candidates(self, html):
        dom = DOM(html)
        if not dom.body:
            return []

        # feature: text length
        # filter out long blocks of text
        plain_text = plaintext(dom.body.source, keep=TAGS)
        title_idx = self.get_title_index(dom, plain_text)

        candidates = []
        for tag in TAGS:
            elements = self.get_elements_with_short_text(dom, tag, plain_text, title_idx)
            for el in elements:
                # looking for username
                words = get_words(el)
                # generate bi- and tri-grams
                text = ' '.join(words)
                bigrams = ngrams(text, 2)
                trigrams = ngrams(text, 3)
                for t in (bigrams+trigrams):
                    s = ' '.join(t)
                    candidates.append(Candidate(el, dom, s, plain_text, title_idx))
        print 'Candidates found %s' % len(candidates)
        return candidates
Ejemplo n.º 55
0
 def test_plaintext(self):
     # Assert plaintext: 
     # - strip <script>, <style>, <form>, <!-- --> elements,
     # - strip tags,
     # - decode entities,
     # - collapse whitespace,
     html = """
         <html>
         <head>
             <title>tags &amp; things</title>
         </head>
         <body>
             <div id="content">       \n\n\n\
                 <!-- main content -->
                 <script type="text/javascript>"alert(0);</script>
                 <h1>title1</h1>
                 <h2>title2</h2>
                 <p>paragraph1</p>
                 <p>paragraph2 <a href="http://www.domain.com" onclick="alert(0);">link</a></p>
                 <ul>
                     <li>item1&nbsp;&nbsp;&nbsp;xxx</li>
                     <li>item2</li>
                 <ul>
             </div>
             <br />
             <br />
         </body>
         </html>
     """
     self.assertEqual(web.plaintext(html, keep={"a": "href"}),
         u"tags & things\n\ntitle1\n\ntitle2\n\nparagraph1\n\nparagraph2 " + \
         u"<a href=\"http://www.domain.com\">link</a>\n\n* item1 xxx\n* item2")
     print "pattern.web.plaintext()"
Ejemplo n.º 56
0
def summarize(query=None, k=4,url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    summary =[(sentences[i], norm(dot(diag(lsa1.S),lsa1.Vt[:,b]),2)) for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    summary = dict((v[0],v) for v in sorted(summary, key=lambda summary: summary[1])).values()
    return '.'.join([a for a, b in summary][len(summary)-(k):])
Ejemplo n.º 57
0
def summarize_evaluation(query=None, url=None, summary=None):
    j=[]
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    vectors =[(dot(lsa.S,lsa.U[0,:]),dot(lsa.S,lsa.U[i,:])) for i in range(len(lsa.U))]
    vectors2 =[(dot(lsa2.S,lsa2.U[0,:]),dot(lsa2.S,lsa2.U[i,:])) for i in range(len(lsa2.U))]
    angles = [arccos(dot(a,b)/(norm(a,2)*norm(b,2))) for a in vectors for b in vectors2]
    return str(abs(1 - float(angles[1])/float(pi/2)))
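A hedged usage sketch of summarize_evaluation(), which scores a candidate summary against its source article; the URL is illustrative, and a value close to 1.0 roughly indicates a summary very similar to the article:

article = 'https://en.wikipedia.org/wiki/Latent_semantic_analysis'   # example URL
candidate = summarize(url=article, k=3).split('.')                   # candidate summary as sentences
print(summarize_evaluation(url=article, summary=candidate))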
Ejemplo n.º 58
0
 def test_plaintext(self):
     # Assert plaintext: 
     # - strip <script>, <style>, <form>, <!-- --> elements,
     # - strip tags,
     # - decode entities,
     # - collapse whitespace,
     html = """
         <html>
         <head>
             <title>tags &amp; things</title>
         </head>
         <body>
             <div id="content">       \n\n\n\
                 <!-- main content -->
                 <script type="text/javascript>"alert(0);</script>
                 <h1>title1</h1>
                 <h2>title2</h2>
                 <p>paragraph1</p>
                 <p>paragraph2 <a href="http://www.domain.com" onclick="alert(0);">link</a></p>
                 <ul>
                     <li>item1&nbsp;&nbsp;&nbsp;xxx</li>
                     <li>item2</li>
                 <ul>
             </div>
             <br />
             <br />
         </body>
         </html>
     """
     self.assertEqual(web.plaintext(html, keep={"a": "href"}),
         u"tags & things\n\ntitle1\n\ntitle2\n\nparagraph1\n\nparagraph2 " + \
         u"<a href=\"http://www.domain.com\">link</a>\n\n* item1 xxx\n* item2")
     print "pattern.web.plaintext()"
Ejemplo n.º 59
0
def extract_tvseries(dom):
    '''
    Extract a list of highest rated TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Rating
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''

    shows = []

    # For each series, collect the title, rating, genre, actors and runtime
    # in a dict and append that dict to the shows list.
    for e in dom.by_tag("div.lister-item-content"):
        # actor names are listed in the <p> element that mentions "Stars"
        info = ""
        for a in e.by_tag("p"):
            if "Stars" in a.content:
                info = plaintext(a.content).encode("utf-8").replace("Stars:", "").strip()
        # title, rating, genre and runtime
        title = plaintext(e.by_tag("a")[0].content)
        rating = plaintext(e.by_tag("strong")[0].content)
        genre = plaintext(e.by_class("genre")[0].content)
        actors = info
        runtime = plaintext(e.by_class("runtime")[0].content).replace("min", "").strip()

        # collect the values for this series
        showinfo = {
            "title": title,
            "rating": rating,
            "genre": genre,
            "actors": actors,
            "runtime": runtime,
        }
        shows.append(showinfo)

    return shows
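A hedged usage sketch of extract_tvseries; the IMDb URL is an assumption and IMDb's markup changes over time, so the class names used above may need adjusting:

from pattern.web import URL, DOM

TOP_TV_URL = 'https://www.imdb.com/search/title/?title_type=tv_series&sort=user_rating,desc'  # assumed page
dom = DOM(URL(TOP_TV_URL).download(cached=True))
for show in extract_tvseries(dom)[:3]:
    print(show['title'] + ' | ' + show['rating'] + ' | ' + show['runtime'])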