def load_positivity(): ''' Loads positivity data from the Stanford IMDB dataset into memory as two lists: one with the review text, the other with binary labels (1 = positive, 0 = negative). This function concatenates the presorted training and testing data into one group. Output: quotes (text data), positivity (binary data) ''' quotes, positivity = [], [] for path in ['../Data/Training_Data/positivity/train/', '../Data/Training_Data/positivity/test/']: # load all positive quotes for filename in os.listdir(path + 'pos/'): if(re.search(u'.txt', filename) is not None): with open(path + 'pos/' + filename) as f: data="".join(line.rstrip() for line in f) quotes.append(plaintext(data)) positivity.append(1) # load all negative quotes for filename in os.listdir(path + 'neg/'): if(re.search(u'.txt', filename) is not None): with open(path + 'neg/' + filename) as f: data="".join(line.rstrip() for line in f) quotes.append(plaintext(data)) positivity.append(0) return quotes, np.array(positivity)
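# Hypothetical usage sketch for load_positivity() above, not part of the original source: it assumes the ../Data/Training_Data/positivity/{train,test}/{pos,neg}/ folders exist and that os, re, numpy (as np) and pattern.web.plaintext are imported at module level, as the function expects.
if __name__ == '__main__':
    quotes, positivity = load_positivity()
    print len(quotes), int(positivity.sum())  # total reviews loaded and how many are labelled positive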
def get_pattern_data(search_param): twitter = Twitter(language='en') for tweet in twitter.search(search_param, cached=True): print(plaintext(tweet.text).encode('ascii', 'ignore').decode('utf-8')) g = Graph() for i in range(10): for result in twitter.search(search_param, start=i + 1, count=50): s = result.text.lower() s = plaintext(s) s = parsetree(s) p = '{NP} (VP) ' + search_param + ' {NP}' for m in search(p, s): x = m.group(1).string # NP left y = m.group(2).string # NP right if x not in g: g.add_node(x) if y not in g: g.add_node(y) g.add_edge(g[x], g[y], stroke=(0, 0, 0, 0.75)) # R,G,B,A #if len(g)>0: # g = g.split()[0] # Largest subgraph. for n in g.sorted()[:40]: # Sort by Node.weight. n.fill = (0, 0.5, 1, 0.75 * n.weight) g.export('data', directed=False, weighted=0.6)
def construct_threads(self): for i in self.dump.by_tag("div.thread"): cur_thread = msg_classes.Thread() cur_thread.p1 = self.p1 thread_exists = False if plaintext(i.by_tag("span.profile fn")[0].content) == self.p1: cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[1].content) else: cur_thread.p2 = plaintext(i.by_tag("span.profile fn")[0].content) # TODO if p1 and p2 have the same name, error! # assert cur_thread.p1 != cur_thread.p2 for e in i.by_tag("div.message"): cur_thread.add_message( plaintext(e.by_tag("div.from")[0].content).encode("utf-8"), e.by_tag("abbr.time published")[0].attributes['title'].encode("utf-8"), plaintext(e.by_tag("div.msgbody")[0].content).encode("utf-8") ) cur_thread.construct_conversations() for t in self.threads: if t.p2 == cur_thread.p2: thread_exists = True t.combine(cur_thread) if not thread_exists: self.threads.append(cur_thread)
def get_patent(url): url = URL(url + "/fulltext") html = url.download() dom = DOM(html) title = plaintext(dom("h3 a")[0].content) body = plaintext(dom("#contents")[0].content) return [title, body]
def google_search(match,targetfile): engine = Google(license=None) for i in range(1,10): for result in engine.search(match, type=SEARCH, start=i): print plaintext(result.description) targetfile.write(plaintext(result.description)) targetfile.write('\n')
def searchQueriesWithPatterns(self, queries): # Faster, but still need to refine extraction patterns results = [] for q, w1, d in queries: print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" print q print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" for r in self._engine.search(q, count=50): # Each result is given a preliminary score based on the weight of the query # that retrieved it and the pattern that was matched to it for p, w2 in self.getPatterns(q): if d == 'L': m = re.search('(.*?)' + p + '.*\.', plaintext(r.txt), re.IGNORECASE) else: m = re.search(p + '(.*)', plaintext(r.txt), re.IGNORECASE) if m: print plaintext(r.txt) print "-------------------------------------------------" print p, "generated", m.group(1) print "=================================================" results.append((m.group(1), w1 + w2)) break return results
def get_array(sentences, raw_query, work_range, similarity_threshold): reply_unsorted = [] reply_sorted = [] sort_linker = {} sort_values = [] index = 0 for sentence in sentences[work_range['start']:work_range['end']]: similarity_value = symmetric_sentence_similarity(sentence, raw_query) if similarity_value > similarity_threshold: sort_values.append(similarity_value) try: # remove because it holds meaning no more. reply_unsorted.append(sentence.encode('utf-8')) # old sort_linker[similarity_value] = sentence.encode('utf-8') # old # `reply_unsorted.append(plaintext(sentence).encode('utf-8')) # old # sort_linker[similarity_value] = plaintext(sentence).encode('utf-8') # old # reply_unsorted.append(str(plaintext(sentence))) # oldest # sort_linker[similarity_value] = str(plaintext(sentence)) # oldest except (UnicodeDecodeError, UnicodeEncodeError): reply_unsorted.append(plaintext(sentence).encode('utf-8')) sort_linker[similarity_value] = plaintext(sentence).encode( 'utf-8') finally: index += 1 sort_values = sorted(sort_values, reverse=False) # alternatively sort_values.sort() reply_sorted = [sort_linker[index] for index in sort_values] return reply_sorted
def main(): table = Datasheet() tel = '' street = '' locality = '' title = '' for i in range(3): page = i+1 url = URL("http://torino.paginegialle.it/pgol/4-veterinari/3-torino/p-%s?mr=50" % page) print "collecting from %s" % url connection = url.open() doc = Document( connection.read() ) items = doc.by_class('item_sx') row = [] for j, item in enumerate(items): divs = item.by_class('address') try: title = item.by_class('item_head')[0].by_tag('a')[0].content except IndexError, e: print >> sys.stderr, "%s" % j, e pass for z, div in enumerate(divs): if div != None: try: street = div.by_class('street-address')[0].content locality = div.by_class('locality')[0].content tel = div.by_class('tel')[0].by_class('value')[0].content except IndexError, e: print >> sys.stderr, "%s" % z, e pass save = "%s, %s %s, %s \n" % ( plaintext(title), plaintext(street).replace(",", ""), plaintext(locality).replace('(TO)', ''), plaintext(tel).replace(",", "") ) print >> sys.stderr, save row.append(save)
def plainTextConverter(self, link, metodo="SinEtiquetas"): reload(sys) sys.setdefaultencoding('utf-8') url = URL(link) txtContent = "" try: if url.mimetype in MIMETYPE_PDF: document = open('temp.pdf', 'w') document.close() download = url.download() document = open('temp.pdf', 'a') document.write(download) document.close() #txtContent=os.system('pdf2txt.py temp.pdf') txtContent = commands.getoutput('pdf2txt.py temp.pdf') else: page = URL(url).download(user_agent='Mozilla/5') if metodo == "mantenerEtiquetas": txtContent = plaintext(page, keep={ 'title': [], 'h1': [], 'h2': [], 'strong': [] }) else: txtContent = plaintext(page, keep={}) except: pass return txtContent
def exctract_values(self, dom, myInfo): for a in dom.by_tag("a.denomination-links pj-lb pj-link" ): # First <a class="title"> in entry. myInfo.names.append(self.decode_if_unicode(plaintext(a.content))) #adresses for a in dom.by_tag("a.adresse pj-lb pj-link" ): # First <a class="title"> in entry. myInfo.adresses.append(self.decode_if_unicode(plaintext( a.content))) numbers = re.findall(r'\d+', self.decode_if_unicode(plaintext(a.content))) myInfo.districts.append(numbers[-1]) #telephones for a in dom.by_tag("ul.main-contact-container clearfix" ): # First <a class="title"> in entry. contact = [] for e in a.by_tag("div.tel-zone noTrad"): contact.append( self.decode_if_unicode(plaintext(e.content))[-14:]) ''' telephone_number=re.findall(r'\d+',self.decode_if_unicode(plaintext(a.content))) telephone_string="" for s in telephone_number: telephone_string=telephone_string+str(s) contact.append(telephone_string) ''' myInfo.contacts.append(contact)
def loadPage(numPage): #Load the content from the given page url = URL(URL_YAHOO + str(numPage)) dom = DOM(url.download(cached=True)) for row in dom(ROWS_PATH)[1:]: #pprint.pprint(plaintext(row(CELLS_PATH)[0].content)) TICKETS.append({"symbol": plaintext(row(CELLS_PATH)[0].content), "name": plaintext(row(CELLS_PATH)[1].content) }) pprint.pprint(str(numPage + 1) + "/" + str(NUM_PAGES))
def blogsData(blogs,time,db): d = feedparser.parse(blogs) dicS = {} dicS['rssLink'] = blogs dicS['titleBlog'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'") dicS['descriptionBlog'] = d['feed']['description'].encode('utf-8').replace("\xe2\x80\x99","'") dicS['updated'] = d['feed']['updated'].encode('utf-8') db.sources.update({"titleBlog":dicS['titleBlog']},dicS,upsert=True) for item in d['entries']: dic = {} dic['titlePost'] = item.title.encode('utf-8').replace("\xe2\x80\x99","'") if db.articles.find({"titlePost":dic['titlePost']}).count() == 0: dic['titleBlog'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'") dic['descriptionBlog'] = d['feed']['description'].encode('utf-8').replace("\xe2\x80\x99","'") dic['link'] = item.link.encode('utf-8') dic['date']=datetime.datetime.utcnow() if "author" in item: dic['author'] = item.author.encode('utf-8').replace("(","").replace(")","") #Detag content, define parsing rules for outfits and #set nltk process if "content" in item: text = plaintext(item.content[0]['value']) dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ") dom = DOM(item.content) imagesUrl = [] for e in dom('img'): imagesUrl.append(e.attributes.get('src','').encode('utf-8')) dic['images'] = imagesUrl dic['entities']=nlpModules.extract_entities_regex(dic['titlePost']+" "+text.encode('utf-8')) sentiment=nlpModules.sentimentAnalysis(text.encode('utf-8')) dic['sentimentScore']=sentiment[0] dic['sentimentCategory']=sentiment[1] elif item.summary: text = plaintext(item.summary) dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ") dom = DOM(item.description) imagesUrl = [] for e in dom('img'): imagesUrl.append(e.attributes.get('src','').encode('utf-8')) dic['images'] = imagesUrl dic['entities']=nlpModules.extract_entities_regex(dic['titlePost']+". "+text.encode('utf-8')) sentiment=nlpModules.sentimentAnalysis(text.encode('utf-8')) dic['sentimentScore']=sentiment[0] dic['sentimentCategory']=sentiment[1] if item.published: dic['published'] = item.published.encode('utf-8') if "tags" in item: tags = [] for tag in item.tags: tags.append(tag.term.encode('utf-8')) dic['tags'] = tags #db.articles.insert(dic) db.articles.update({"titlePost":dic['titlePost']},dic,upsert=True)
def extract_tvseries(dom): ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RANKING TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. series = [] # Loops over top 5 imdb series for index in dom.by_tag("tr.even detailed")[:5]: actors = [] genres = [] serie = [] # Extracts the required fields of the html for td in index.by_tag("td.number")[:1]: ranking = unicode(plaintext(td.content)) # Extract ranking for td in index.by_tag("td.title")[:1]: for a in td.by_tag("a")[:1]: title = unicode(plaintext(a.content)) # Extract title for span in td.by_tag("span.credit")[:1]: for a in span.by_tag("a"): actors.append(unicode(plaintext(a.content))) # Extract actors for span in td.by_tag("span.genre")[:1]: for a in span.by_tag("a"): genres.append(unicode(plaintext(a.content))) # Extract genres for span in td.by_tag("span.runtime")[:1]: runtime = unicode(plaintext(span.content)) # Extract runtime with minute runtime_split = split_string(runtime, ' ') # Split number from minute runtime_num = runtime_split[0] # append required fields to serie list serie.append(title) serie.append(ranking) serie.append(genres) serie.append(actors) serie.append(runtime_num) # appends serie to series series.append(serie) return series
def inflect(word, language="italian"): inflections = {} url = "http://en.wiktionary.org/wiki/" + word.replace(" ", "_") dom = DOM(URL(url).download(throttle=10, cached=True)) pos = "" # Search the header that marks the start for the given language: # <h2><span class="mw-headline" id="Italian">Italian</span></h2> e = dom("#" + language)[0].parent while e is not None: # e = e.next_sibling if e.type == "element": if e.tag == "hr": # Horizontal line = next language. break if e.tag == "h3": # <h3>Adjective [edit]</h3> pos = plaintext(e.content.lower()) pos = pos.replace("[edit]", "").strip()[:3].rstrip("ouer") + "-" # Parse inflections, using regular expressions. s = plaintext(e.content) # affetto m (f affetta, m plural affetti, f plural affette) if s.startswith(word): for gender, regexp, i in ( ("m" , r"(" + word + r") m", 1), ("f" , r"(" + word + r") f", 1), ("m" , r"(" + word + r") (mf|m and f)", 1), ("f" , r"(" + word + r") (mf|m and f)", 1), ("m" , r"masculine:? (\S*?)(,|\))", 1), ("f" , r"feminine:? (\S*?)(,|\))", 1), ("m" , r"(\(|, )m(asculine)? (\S*?)(,|\))", 3), ("f" , r"(\(|, )f(eminine)? (\S*?)(,|\))", 3), ("mp", r"(\(|, )m(asculine)? plural (\S*?)(,|\))", 3), ("fp", r"(\(|, )f(eminine)? plural (\S*?)(,|\))", 3), ( "p", r"(\(|, )plural (\S*?)(,|\))", 2), ( "p", r"m and f plural (\S*?)(,|\))", 1)): m = re.search(regexp, s, re.I) if m is not None: # {"adj-m": "affetto", "adj-fp": "affette"} inflections[pos + gender] = m.group(i) #print s e = e.next_sibling return inflections
def blogsData(blogs): d = feedparser.parse(blogs) db = get_db('dev-itoutfits') index_name = 'content' conn = get_db_es(index_name) for item in d['entries']: dic = {} dic['titlePost'] = item.title.encode('utf-8').replace("\xe2\x80\x99","'") num =db.content.find({"titlePost":dic['titlePost']}).count() if num == 0: #print dic['titlePost'], num dic['titlePostUrl'] = item.title.encode('utf-8').replace("\xe2\x80\x99","'").replace(" ", "") dic['titleBlogUrl'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'").replace(" ", "") dic['rssLink'] = blogs dic['titleBlog'] = d['feed']['title'].encode('utf-8').replace("\xe2\x80\x99","'") dic['updated'] = d['feed']['updated'].encode('utf-8') dic['descriptionBlog'] = d['feed']['description'].encode('utf-8').replace("\xe2\x80\x99","'") dic['link'] = item.link.encode('utf-8') dic['date']=datetime.datetime.utcnow() #Detag content, define parsing rules for outfits and #set nltk process if item.content[0]: text = plaintext(item.content[0]['value']) dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ") dom = DOM(item.content) imagesUrl = [] for e in dom('img'): imagesUrl.append(e.attributes.get('src','').encode('utf-8')) dic['images'] = imagesUrl elif item.description: text = plaintext(item.description) dic['content'] = text.encode('utf-8').replace("\xe2\x80\x99","'").replace("\n"," ").replace(":"," ") dom = DOM(item.description) imagesUrl = [] for e in dom('img'): imagesUrl.append(e.attributes.get('src','').encode('utf-8')) dic['images'] = imagesUrl if item.published: dic['published'] = item.published.encode('utf-8') if "tags" in item: tags = [] for tag in item.tags: tags.append(tag.term.encode('utf-8')) dic['tags'] = tags try: #print dic db.content.update({"titlePost":dic['titlePost']},dic,upsert=True) #type_name="article" #conn.index(json.dumps(dic), index_name, type_name) except ValueError: pass
def extract_tvseries(dom): ''' Extract a list of highest rated TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Rating - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RATED TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. tv_series =[] series_list = [] genre_list =[] runtime_list = [] rating_list= [] table = [] actor_row = [] actors_list = [] for e in dom.get_elements_by_tagname("div.lister-item-content")[:20]: # Top 20 entries for h3 in e('h3 a'): series_list.append(encode(plaintext(h3.content))) for p in e.get_elements_by_tagname("p.text-muted"): for span in e.get_elements_by_tagname("span.runtime"): # Runtime runtime_list.append(plaintext(span.content)[:2]) for span in e.get_elements_by_tagname("span.genre"): # Genre genre_list.append(plaintext(span.content)) for div in e.get_elements_by_tagname("div.ratings-bar"): for div in e.get_elements_by_tagname("div.inline-block ratings-imdb-rating"): rating_list.append(plaintext(div.content)) #rating for p in e('p a'): # for loop for scraping actors s = " " actor_row.append(plaintext(p.content)) # actor names are separated strings, this joins the string into one element for the table actors_list.append(s.join(actor_row)) actor_row [:] = [] for series, rating, genre, actors, runtime in zip(series_list, rating_list, genre_list, actors_list, runtime_list): table = (series, rating, genre, actors, runtime) tv_series.append(table) # creating the table with all information return tv_series
def extract_tvseries(dom): ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RANKING TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. tvseries = [] for table_row in dom.by_tag("tr")[1:51]: # The first table row is redundant, so start from index 1. # Default values in case something is missing. title = "-" rating = "-" actors = "-" genre = "-" runtime = "-" for table_cell in table_row.by_tag("td.title"): for a in table_cell.by_tag("a")[:1]: # Obtain the title. title = unicode(plaintext(a.content)) for rating_span in table_cell.by_tag("span.rating-rating"): # Obtain the rating. rating = unicode(plaintext(rating_span.content)) rating = rating.split("/")[0] for credit_span in table_cell.by_tag("span.credit"): # Obtain the actors/actresses. actors = unicode(plaintext(credit_span.content)) actors = actors.split(": ")[1] for genre_span in table_cell.by_tag("span.genre"): # Obtain the genre(s). genre = unicode(plaintext(genre_span.content)) genre = ", ".join(genre.split(" | ")) for runtime_span in table_cell.by_tag("span.runtime"): # Obtain the runtime. runtime = unicode(plaintext(runtime_span.content)) runtime = runtime.split(" ")[0] tvseries_item = [title, rating, genre, actors, runtime] tvseries.append(tvseries_item) return tvseries
def get_craigslist_postings(): postings = json.load(open('results.json', 'r')) # filter cross postings filtered = {} for boat_name, posts in postings.items(): founds = [] urls = [] for post in posts: _hash = post.split('/')[-2] if _hash not in founds: founds.append(_hash) urls.append(post) filtered[boat_name] = urls for boat_name, posts in filtered.items(): for post in posts: r = requests.get(post) w = web.Element(r.content) body = w.by_id('postingbody') content = body.content links = body.by_tag('a') if links: content = content.replace(links[0].content, '') c = w.by_class("print-qrcode-label") if c: content = content.replace(c[0].content, '') content = web.plaintext(content) formatted_attrs = {} attrs = w.by_class('attrgroup')[0].by_tag('span') for attr in attrs: values = web.plaintext(attr.content).split(': ') if len(values) == 2: key = values[0].replace(' ', '_').replace('/','') value = values[1] formatted_attrs[key] = value price = web.plaintext(w.by_class('price')[0].content.replace('$','')) post_data = { 'body': { 'source': content, 'link': post, 'attrs': formatted_attrs, 'boat': boat_name, 'price': price }, "index": "listings", 'doc_type': 'listing' } res = es.index(**post_data)
def extract_tvseries(dom): ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' tv_series = [] for e in dom.by_tag("td.title")[:250]: #amount of titles tv_serie = [] for a in e.by_tag("a")[:1]: #search for title title = plaintext(a.content) tv_serie.append(title) print title for value in e.by_class("value"): #search for ranking value = plaintext(value.content) tv_serie.append(value) for genre in e.by_class("genre"): #search for genre genre = plaintext(genre.content) genre = genre.replace(' | ',',') tv_serie.append(genre) for credit in e.by_class("credit"): #search for actors/actresses credit = plaintext(credit.content) credit = credit.replace('With: ','') credit = credit.replace(', ',',') tv_serie.append(credit) for runtime in e.by_class("runtime"): #search for runtime runtime = plaintext(runtime.content) runtime = runtime.replace(' mins.','') tv_serie.append(runtime) tv_series.append(tv_serie) # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RANKING TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. return tv_series # replace this line as well as appropriate
def extract_tvseries(dom): ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RANKING TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. komma = ", " # comma used to separate actors and genres imdb = [] # imdb as a list for data in dom.by_tag("td.title"): # look up each series' data in the table gegevens = [] title = data.by_tag("a")[0] # look for the first a tag, which holds the title title.content.encode('ascii', 'ignore') # prevents a unicode error gegevens.append(title.content) rating = data.by_tag("span.value")[0] # look for the rating gegevens.append(plaintext(rating.content)) # adds the rating to gegevens for genres in data.by_tag("span.genre"): soort = [] # creates the list soort for genre in genres.by_tag("a"): # look for genres soort.append(plaintext(genre.content)) # adds each genre to soort seq = komma.join(soort) # turns the list into a string gegevens.append(seq) # adds the genre string to gegevens for actors in data.by_tag("span.credit"): acteurs = [] # creates the list acteurs for actor in actors.by_tag("a"): # look for actors actor.content.encode('ascii', 'ignore') # prevents a unicode error acteurs.append(plaintext(actor.content))# adds each actor found to the list sq = komma.join(acteurs) # turns the list into a string gegevens.append(sq) # adds the actor string to gegevens runtime = data.by_tag("span.runtime")[0] # look for the runtime gegevens.append(runtime.content.partition(' ')[0]) # adds the runtime to gegevens imdb.append(gegevens) # adds the gegevens list to the imdb list return imdb # return the list
def extract_tvseries(dom): ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # Create list for series series = [] # Loop over the series and extract the needed information for e in dom.by_tag(".title"): serie = [] # Extract title of the serie serie.append(plaintext(e.by_tag('a')[0].content).encode('utf-8')) # Extract ranking of the serie serie.append(plaintext(e.by_tag(".value")[0].content).encode('utf-8')) # Extract genres of the serie genres = [] for a in e.by_tag(".genre")[:1]: for b in a.by_tag("a"): genres.append(plaintext(b.content).encode('utf-8')) serie.append(', '.join(genres)) # Extract actors of the serie actors = [] for a in e.by_tag("span.credit")[:1]: for b in a.by_tag("a"): actors.append(plaintext(b.content).encode('utf-8')) serie.append(', '.join(actors)) # If runtime is known: append runtime of serie # If runtime is not known: set runtime to zero if e.by_tag('.runtime'): serie.append( plaintext(e.by_tag('.runtime')[0].content).replace( ' mins.', "").encode('utf-8')) else: serie.append('0') series.append(serie) # Returns a list of series return series
def extract_tvseries(dom): ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RANKING TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. showlist = [] # Get top 50 results for e in dom.by_tag("td.title"): # get title for a in e.by_tag("a")[:1]: title = plaintext(a.content) print title print # get ranking for td in e.by_tag("span.value")[:1]: ranking = plaintext(td.content) print ranking print # get genre for span in e.by_tag("span.genre")[:1]: genre = plaintext(span.content) print genre print # get actors/actresses for span in e.by_tag("span.credit")[:1]: actors = plaintext(span.content) print actors print # get runtime (number) for span in e.by_tag("span.runtime")[:1]: runtime = plaintext(span.content) print runtime print # collect all the retrieved info for this entry showlist.append([title, ranking, genre, actors, runtime]) return showlist
def __init__(self, data, url="", contenidoBd=""): if url != "": urlContent = UrlToPlainText() self.contenidoConEtiquetas = urlContent.plainTextConverter( url, "mantenerEtiquetas") self.contenido = plaintext(self.contenidoConEtiquetas, keep={}) else: if (contenidoBd != ""): self.contenidoConEtiquetas = contenidoBd self.contenido = plaintext(self.contenidoConEtiquetas, keep={}) else: self.contenido = "" self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
def descargarContenido(self, url): """Method to download the content of a web document, whether an HTML page or a PDF.""" try: unaUrl = URL(url) if "pdf" in extension(unaUrl.page): return self.descargarPDF(unaUrl) else: return plaintext(unaUrl.download()) except Exception as e: try: return plaintext(self.urlLibDescarga(url)) except Exception as e: print "except " + str(e) print url
def extract_tvseries(dom): ''' Extract a list of highest rated TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Rating - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' series = [] # The parent of all fields super_parent = dom.by_tag("div.lister-item-content") for item in super_parent: # The genre(s). genre = plaintext(item.by_tag("span.genre")[0].content) print genre # The runtime runtime = plaintext(item.by_tag("span.runtime")[0].content) print runtime # The rating. rating = plaintext(item.by_tag("span.lister-item-index")[0].content) print rating # The title. title = plaintext( item.by_tag("a")[0].content.encode('ascii', 'ignore')) print title # The actors actors = plaintext( item.by_tag("p.")[2].content.encode('ascii', 'ignore')) actors = actors[6:] print actors print "_______________________________" series.append([title, rating, genre, actors, runtime]) # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RATED TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. return series # replace this line as well as appropriate
def extract_tvseries(dom): ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # Trying to make a replacer for some symbols #replaceLetters = ["î","û","ô","ê","â"] # Making the list in which we store all the data showData = [] for e in dom.by_tag( "td.title")[:50]: # This lets us select the individual shows # Here we store the data for every individual show, so we can add it to the overall list show = [] # We now get the data for individual shows, and we add every attribute to our list for titles in e.by_tag("a")[:1]: # Title of a series title = plaintext(titles.content) # for ch in replaceLetters: # if ch in title: # title = title.replace(ch,'') show.append(title) for ratings in e.by_class("value"): # Rating for a series rating = plaintext(ratings.content) show.append(rating) for genres in e.by_class("genre"): # Genre of a series genre = plaintext(genres.content) genre = genre.replace(" | ", ',') # Cleaning our output show.append(genre) for names in e.by_class("credit"): # Main actors of a series name = plaintext(names.content) name = name.replace("With: ", '') # Cleaning our output show.append(name) for runtimes in e.by_class("runtime"): # Runtime of a series runtime = plaintext(runtimes.content) runtime = runtime.replace("mins.", '') # Cleaning our output show.append(runtime) # Adding all the data of this individual show to the total list showData.append(show) # Returning our complete list of all the data for the shows return showData
def extract_tvseries(dom): # dom = DOM (url.download(cached=True)) ''' Extract a list of highest ranking TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Ranking - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # upcoming 2 for loops implemented from the 12-dom.py example in the pattern library # Create lists and strings to store the data tvseries = [] # In the IMDB HTML (tree), look for the td class 'title' which contains all the needed information for e in dom.by_tag("td.title")[:50]: Movie = [] # In the branch 'a' get the title of the movie for title in e.by_tag("a")[:1]: # Add the title name to the string 'Titles' title = plaintext(title.content).encode('utf-8') Movie.append(title) # In the branch 'ranking', get the ranking of the movie for ranking in e.by_tag("div.user_rating"): # Add the rankings to the string 'Rankings' Movie.append(str(plaintext(ranking.content))[11:-4]) # Create list with Genres, comma separated (string) for genre in e.by_tag("span.genre"): Movie.append(str(plaintext(genre.content))) # Replace separator | with , # genre = genre.replace('|', ',') # Create list with Actors, comma separated (string) for actor in e.by_tag("span.credit"): actor = plaintext(actor.content).encode('utf-8')[6:] Movie.append(actor) # Create list with Runtime (numeric) for time in e.by_tag("span.runtime"): Movie.append(int(str(plaintext(time.content))[:2])) tvseries.append(Movie) return tvseries
def get_title_index(self, dom, plain_text): title_el = dom('title') if title_el: # get title text title = plaintext(title_el[0].source) # usually the title in the <title> tag is a little bit different from the # actual article title - publishers tend to add the website name or author name either to the # end or to the beginning of the title, for example: # # Insurers: Despite deadline, Obamacare glitches persist - CNN.com # India's Dating Sites Skip Straight to the Wedding - P. Nash Jenkins - The Atlantic # # But it will be separated by : or - (or |), so split the title on those # separators and keep the longest part title_parts = re.split('\:|\-|\|', title) # find title in tagless text # part_idx = 1 if len(title_parts) > 2 else 0 part_idx = 0 part_len = len(title_parts[0]) for idx, part in enumerate(title_parts): if len(part) > part_len: part_idx = idx part_len = len(part) title_idx = [m.start() for m in re.finditer(title_parts[part_idx].strip(), plain_text)] return title_idx return []
def getTwits(self, keyWord): if len(keyWord) == 0: keyWord = u'"gündem"' self.lineEdit.setText(keyWord) self.alText = u'' try: tList = self.twitter.search(keyWord, start=self.prevId, count=10, cached=False) except: message = "Twitter search limit reached, please wait a little" QtGui.QMessageBox.information(self.dialog, "Information", message) return for tweet in tList: self.listWidget.addItem( QtGui.QListWidgetItem(cleanTweet(tweet.text))) self.twIds.append(tweet.id) self.listWidget.setCurrentRow(self.listWidget.count() - 1) tweet.text = self.filterRT(tweet.text) tweet.text = self.filterLink(tweet.text) self.alText = self.alText + plaintext(tweet.text) + u' ' self.prevId = tweet.id
def get_artist_docs(name): default_dir = basedir + name rap_docs = "" # get a list of all the files in default dir for f in os.listdir(default_dir): # go to that dir os.chdir(default_dir) # open the file fi = open(f, 'r') # print "reading " + f # slurp page = fi.read() # what does this do? dom = DOM(page) # we look at the page and get that the thing we want is in the .lyrics div. if dom and dom('.lyrics'): lyrics = dom('.lyrics')[0] else: continue p = plaintext(lyrics.content) rap_docs += p return rap_docs
def search_bbc(result, term, howmany, rurl, page): # convenient text only search if not rurl == None and "www.bbc.co.uk" in rurl: query = rurl # for pagination support else: query = "http://www.bbc.co.uk/search/news/" + term.lower() + "?text=on" dom = get_dom(query) if "BBC" not in result: result["BBC"] = [] for a in dom.by_class("title"): title = plaintext(a.content) link = a.attributes["href"] content = heuristic_scrape(link) score = sentiment(content) if len(result["BBC"]) < howmany: result["BBC"].append((title, "NEG" if score < 0 else "POS")) print "BBC", title, link, "NEG" if score < 0 else "POS" if len(result["BBC"]) % 20 == 0 and len(result["BBC"]) < howmany: # 20 articles per page print "flip" next_page = ( "http://www.bbc.co.uk/search/news/" + term + "?page=" + str(page + 1) + "&text=on&dir=fd&news=" + str(len(result["BBC"]) + 1) + "&news_av=1" ) return search_bbc(result, term, howmany, next_page, page + 1) else: break return result
def load_support(): ''' Loads support / oppose data from the political speeches dataset into memory as two lists: one with the speech text and the other with binary labels (1 = support, 0 = oppose). This function groups the presorted training and testing data into one set. Output: quotes (text data), support (binary data) ''' quotes, support = [], [] for path in ['../Data/Training_Data/support_oppose/data_stage_one/training_set/', '../Data/Training_Data/support_oppose/data_stage_one/test_set/']: # load all quotes for filename in os.listdir(path): if(re.search(u'.txt', filename) is not None): with open(path+filename) as f: data="".join(line.rstrip() for line in f) quotes.append(plaintext(data)) # determine if last letter is a Y or N for support binary data if filename.split('.txt')[0][-1] == 'Y': support.append(1) else: support.append(0) return quotes, np.array(support)
def obtain_names(div_element): ''' Scrapes a <div> element from a movie's web page. The <div> element must contain names. On a movie's web page, the names of directors, writers, and actors are each in a separate <div>, for example (for directors): ------------------------------------------------------------------------------- <div class="txt-block" itemprop="director" itemscope itemtype="http://schema.org/Person"> <h4 class="inline">Director:</h4> <a href="/name/nm0001104/?ref_=tt_ov_dr" itemprop='url'> <span class="itemprop" itemprop="name">Frank Darabont</span> </a> </div> ------------------------------------------------------------------------------- The snippet above is from the web page of the movie 'The Shawshank Redemption'. Args: div_element: pattern.web.Element instance representing a <div> containing names. Returns: A string with all the names in the <div>, semicolon separated if several. ''' names_list = [] for span in div_element.by_tag("span.itemprop"): name = unicode(plaintext(span.content)) names_list.append(name) return ";".join(names_list)
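# Hypothetical usage sketch for obtain_names() above, not part of the original source: it feeds the <div> snippet quoted in the docstring into pattern.web's Element parser (assumes pattern.web is installed; Python 2, like the surrounding snippets).
from pattern.web import Element
director_div = Element('<div class="txt-block" itemprop="director"><h4 class="inline">Director:</h4><a href="/name/nm0001104/?ref_=tt_ov_dr"><span class="itemprop" itemprop="name">Frank Darabont</span></a></div>')
print obtain_names(director_div)  # prints: Frank Darabont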
def research_on(self, what, where): url = URL( "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" + what + "&ou=" + where + "&proximite=0") dom = DOM(url.download(cached=True)) for a in dom.by_tag("div.main-title pj-on-autoload "): for e in a.by_tag("span.denombrement"): number_of_results = int( self.decode_if_unicode(plaintext(e.content))[:3]) number_of_page_results = number_of_results / 20 if (number_of_results % 20 > 0): number_of_page_results += 1 self.exctract_values(dom, self.myInfo) for i in range(2, number_of_page_results + 1): url = URL( "https://www.pagesjaunes.fr/pagesblanches/recherche?quoiqui=" + what + "&ou=" + where + "&proximite=0+" "&page=" + str(i)) dom = DOM(url.download(cached=True)) self.exctract_values(dom, self.myInfo) self.myInfo.sort_and_merge()
def process_EN(): '''Processes the English RSS feeds, locate entities and returns a list of tuples URI, DICT, where DICT contains entry's title, URL, plain text and list of entities.''' EN_RSS_LIST = [ (u'Lifehacker', 'http://feeds.gawker.com/lifehacker/vip'), (u'The Verge', 'http://www.theverge.com/rss/index.xml'), (u'Zen Habits', 'http://feeds.feedburner.com/zenhabits?format=xml') ] items = [] for feed in EN_RSS_LIST: feedlist = [] # fetch the feed for result in reader.search(feed[1])[:10]: clean_text = plaintext(result.text) response = alchemyapi.entities('text', result.text) # parse the entities entities = [] for entity in response['entities']: if entity.has_key('disambiguated'): dbpedia_uri = entity['disambiguated']['dbpedia'] else: dbpedia_uri = None entities.append((entity['text'], dbpedia_uri)) feedlist.append(dict(title=result.title, url=result.url, text=clean_text, entities=entities)) items.append(dict(site=feed[0], feedlist=feedlist)) return items
def get_artist_docs(name): default_dir = basedir + name rap_docs = "" # get a list of all the files in default dir for f in os.listdir(default_dir): print f # go to that dir os.chdir(default_dir) # open the file fi = open(f, 'r') # print "reading " + f # slurp page = fi.read() # what does this do? dom = DOM(page) # we look at the page and get that the thing we want is in the .lyrics div. if dom and dom('.lyrics'): lyrics = dom('.lyrics')[0] else: continue p = plaintext(lyrics.content) rap_docs += p return rap_docs
def gettweets(searchterms): tweetlist = [] from pattern.web import Twitter, plaintext twitter = Twitter(language='en') for tweet in twitter.search(searchterms, cached=False): tweetlist.append(plaintext(tweet.text)) return tweetlist
def get_around_words(el, text, count=10): if el is None: return [] parent = el.parent before = [] after = [] max_depth = 10 source = '' idx = -1 while parent is not None and max_depth: source = plaintext(parent.source, linebreaks=1).lower().replace('\n', ' ') idx = source.find(text) after, before = get_words_before_after(source, after, before, idx, text) if len(before) >= count and len(after) >= count: break parent = parent.parent max_depth -= 1 # substitute date and time with recognizable tokens norm_source = re.sub(TIME_RE, 'TIME ', source) norm_source = re.sub(POINT_IN_TIME_RE, 'TIME ', norm_source) norm_source = re.sub(DATE_RE, 'DATE ', norm_source) after, before = get_words_before_after(norm_source, after, before, idx, text) return before[-count:], after[:count]
def __init__(self, url, query): super(documento, self).__init__() self.url = url self.urlObjet = URL(self.url) self.html = self.urlObjet.download(user_agent='Mozilla/5.0') self.contenido = plaintext(self.html, keep=[], replace=blocks, linebreaks=2, indentation=False) self.elemento = Element(self.html)
def getWordList(url): word_list = [] #raw data #source_code = requests.get(url) #convert to text #plain_text = source_code.text #lxml format # soup = BeautifulSoup(plain_text,'lxml') htmlString = get(url).text webText = plaintext(htmlString) #find the words in paragraph tag #for text in webText.findAll('p'): #if text.text is None: # continue #content # content = text.text #lowercase and split into an array words = webText.lower().split() #for each word for word in words: #remove non-chars cleaned_word = clean_word(word) #if there is still something there if len(cleaned_word) > 0: #add it to our word list word_list.append(cleaned_word) return word_list
def get_elements_with_short_text(self, dom, tag_name, plain_text, title_idx): """ Get all potential candidates elements by tagName. Filter out elements with more than 9 words and with distance to title more than 300 characters """ elements = [] for el in dom.by_tag(tag_name): l = 0 el_plain_text = plaintext(el.source, keep=TAGS) title_dist = self.get_distance_to_title(tag_name, el_plain_text, plain_text, title_idx) is_valid = False for tdist in title_dist: if tdist < 300 and len(el_plain_text) < 300: is_valid = True break if not is_valid: continue for child in el.children: if issubclass(child.__class__, Text): l += len(filter(len, child.source.strip().split(' '))) if l > 9 or l == 0: break if l <= 9: elements.append(el) return elements
def search_cnn(result, term, howmany): RESULTS_URL = 'http://searchapp.cnn.com/cnn-search/query.jsp?query='+term+ \ '&ignore=article|mixed&start=1&npp='+str(howmany)+'|'+str(howmany)+'|'+str(howmany)+'&s=all&type=all'+ \ '&sortBy=date&primaryType=mixed&csiID=csi1' print "Getting CNN JSON blob" dom = get_dom(RESULTS_URL) print "Blob retrieved" #I do not understand this bizarre data structure for jscode in dom.by_id('jsCode'): search = json.loads(jscode.source()) result['CNN'] = [] results = search['results'] titles = [] for resultset in results: for article in resultset: title = plaintext(article['title']) link = article['url'] if 'http://' in link and not title in titles: #exludes video, dupes content = heuristic_scrape(link) score = sentiment(content) if len(result['CNN']) < howmany: result['CNN'].append((title, 'NEG' if score < 0 else 'POS')) titles.append(title) print 'CNN', title, link, 'NEG' if score < 0 else 'POS' else: print 'too many' break return result
def google_search(targetword, itemlist,targetpath): resultnum=0 engine = Google(license=None) file = codecs.open(targetpath,'a','utf-8') outtext= '' patt = ur'\W+' for item in itemlist: for i in range(1,5): for result in engine.search(item, type=SEARCH, start=i): url = URL(result.url) text = url.download(unicode=True) text = plaintext(text) text = correctPersianString(text) text = text.replace('\n',' ') lines = text.split('.') for line in lines: if targetword in line: match = re.findall(patt,line) output = ' '.join(match) for item in punclist: if item in line: line = line.replace(item,' ') print output file.write(output) file.write('\n') print str(resultnum)+" found in web" file.close()
def plain(text): ''' Strips markup and normalizes text to plain ASCII (non-ASCII characters are dropped). ''' return unicodedata.normalize('NFKD', plaintext(text.encode('utf-8'))).encode( 'ascii', 'ignore')
def obtain_data(url): ''' Scrape the Wikipedia page. Args: url: pattern.web.URL instance pointing to the Wikipedia page Returns: A list of lists, where each sublist represents a data point. Each sublist contains two elements: a string with the name of the country, and a string with the size of the population of that country. ''' # Create a DOM of the URL. html = url.download(cached=True) dom = DOM(html) data_points = [] for countries_table in dom.by_tag("table.wikitable sortable"): for table_row in countries_table.by_tag("tr")[1:]: # The first row is the header, so start at index 1. table_row_content = [] # Obtain the content of the row. for table_row_cell in table_row.by_tag("td"): table_row_cell_content = unicode(plaintext(table_row_cell.content)) table_row_content.append(table_row_cell_content) # Obtain the country name and the population size. country = table_row_content[1].split("[")[0].split(" (")[0] population = "".join(table_row_content[2].split(",")) data_point = [country, population] data_points.append(data_point) return data_points
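# Hypothetical usage sketch for obtain_data() above, not part of the original source: the Wikipedia URL is an assumption (any page whose first "wikitable sortable" has the country name in its second cell and the population in its third works the same way), and URL, DOM and plaintext from pattern.web are assumed to be imported.
def print_population_sample():
    wiki_url = URL("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population")
    for country, population in obtain_data(wiki_url)[:5]:
        print country, population  # e.g. a country name and its population as a digit string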
def _parse_item(self, item_element): item = {} for element_name in ['title', 'url', 'description', 'content']: element = item_element.find( self._item_paths[self.feed_type][element_name]) if element is not None and element.text is not None: item[element_name] = plaintext(element.text) else: item[element_name] = None pubdate_element = item_element.find( self._item_paths[self.feed_type]['pubdate']) if pubdate_element is not None: item['pubdate'] = dateparser.parse(pubdate_element.text) else: item['pubdate'] = None enclosure_element = item_element.find( self._item_paths[self.feed_type]['enclosure']) if enclosure_element is not None: item['enclosure'] = enclosure_element.attrib else: item['enclosure'] = None return item
def process_ES(): '''Processes the Spanish RSS feeds, locate entities and returns a list of tuples URI, DICT, where DICT contains entry's title, URL, plain text and list of entities.''' ES_RSS_LIST = [ (u'Menéame', 'http://meneame.feedsportal.com/rss'), (u'Naukas', 'http://feeds.feedburner.com/naukas'), (u'Yuri', 'http://www.lapizarradeyuri.com/feed/') ] items = [] # fetch the feed for feed in ES_RSS_LIST: feedlist = [] for result in reader.search(feed[1])[:10]: clean_text = plaintext(result.text).encode('utf-8') params = urllib.urlencode({ 'key': '4c4ded0a7c279c9f747a8f750e223363', # topic extraction 'of': 'json', 'lang': 'es', 'txt': clean_text, 'tt': 'a', 'dm': '5' }) response = json.loads(urllib2.urlopen(url, params).read()) # parse the entities entities = [] if response['status']['msg'] == 'OK': for e in response['entity_list']: if e.has_key('semld_list'): for uri in e['semld_list']: if 'es.wikipedia' in uri: entities.append((e['form'], uri)) break feedlist.append(dict(title=result.title, url=result.url, text=clean_text, entities=entities)) items.append(dict(site=feed[0], feedlist=feedlist)) return items
def htmlSearch(self, html, url): logger.debug(u"htmlSearch URL : %s" % url) logger.debug(u"html : %s" % html[:20]) s = html.lower() s = plaintext(s) s = parsetree(s) # self.logSentences(s) # Execute a Regular Expression Search p = r'(NN)+' q = search(p, s) # self.logPOS(q) # Iterate over all the words in the POS logger.debug(u" q.Length=%d" % len(q)) logger.debug(u" q[]=%s" % q) self.g, self.urlConcepts, self.wordConcepts = self.addNodes(self.g, q, url, self.urlConcepts, self.wordConcepts) return self.urlConcepts, self.wordConcepts
def summarize(query=None, k=4, url=None): j = [] if url: b = URL(url) a = Document(b.download(cached=True)) for b in a.get_elements_by_tagname("p"): j.append(plaintext(b.content).encode("utf-8")) j = [ word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word ] j = ' '.join(j) lsa1 = LSA(stopwords, ignore_characters) sentences = j.split('.') sentences = [ sentence for sentence in sentences if len(sentence) > 1 and sentence != '' ] for sentence in sentences: lsa1.parse(sentence) else: lsa1 = LSA(stopwords, ignore_characters) sentences = query.split('.') for sentence in sentences: lsa1.parse(sentence) lsa1.build() lsa1.calc() summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2)) for i in range(len(sentences)) for b in range(len(lsa1.Vt))] sorted(summary, key=itemgetter(1)) summary = dict( (v[0], v) for v in sorted(summary, key=lambda summary: summary[1])).values() return '.'.join([a for a, b in summary][len(summary) - (k):])
def get_candidates(self, html): dom = DOM(html) if not dom.body: return [] # feature: text length # filter out long blocks of text plain_text = plaintext(dom.body.source, keep=TAGS) title_idx = self.get_title_index(dom, plain_text) candidates = [] for tag in TAGS: elements = self.get_elements_with_short_text(dom, tag, plain_text, title_idx) for el in elements: # looking for username words = get_words(el) # generate bi- and tri-grams text = ' '.join(words) bigrams = ngrams(text, 2) trigrams = ngrams(text, 3) for t in (bigrams+trigrams): s = ' '.join(t) candidates.append(Candidate(el, dom, s, plain_text, title_idx)) print 'Candidates found %s' % len(candidates) return candidates
def test_plaintext(self): # Assert plaintext: # - strip <script>, <style>, <form>, <!-- --> elements, # - strip tags, # - decode entities, # - collapse whitespace, html = """ <html> <head> <title>tags & things</title> </head> <body> <div id="content"> \n\n\n\ <!-- main content --> <script type="text/javascript>"alert(0);</script> <h1>title1</h1> <h2>title2</h2> <p>paragraph1</p> <p>paragraph2 <a href="http://www.domain.com" onclick="alert(0);">link</a></p> <ul> <li>item1 xxx</li> <li>item2</li> <ul> </div> <br /> <br /> </body> </html> """ self.assertEqual(web.plaintext(html, keep={"a": "href"}), u"tags & things\n\ntitle1\n\ntitle2\n\nparagraph1\n\nparagraph2 " + \ u"<a href=\"http://www.domain.com\">link</a>\n\n* item1 xxx\n* item2") print "pattern.web.plaintext()"
def summarize_evaluation(query=None, url=None, summary=None): j=[] if url: b = URL(url) a = Document(b.download(cached=True)) for b in a.get_elements_by_tagname("p"): j.append(plaintext(b.content).encode("utf-8")) j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word] j = ' '.join(j) lsa = LSA(stopwords, ignore_characters) sentences = j.split('.') sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != ''] for sentence in sentences: lsa.parse(sentence) else: lsa = LSA(stopwords, ignore_characters) for sentence in query: lsa.parse(sentence) lsa.build() lsa.calc() lsa2 = LSA(stopwords, ignore_characters) for sentence in summary: lsa2.parse(sentence) lsa2.build() lsa2.calc() vectors =[(dot(lsa.S,lsa.U[0,:]),dot(lsa.S,lsa.U[i,:])) for i in range(len(lsa.U))] vectors2 =[(dot(lsa2.S,lsa2.U[0,:]),dot(lsa2.S,lsa2.U[i,:])) for i in range(len(lsa2.U))] angles = [arccos(dot(a,b)/(norm(a,2)*norm(b,2))) for a in vectors for b in vectors2] return str(abs(1 - float(angles[1])/float(pi/2)))
def extract_tvseries(dom): ''' Extract a list of highest rated TV series from DOM (of IMDB page). Each TV series entry should contain the following fields: - TV Title - Rating - Genres (comma separated if more than one) - Actors/actresses (comma separated if more than one) - Runtime (only a number!) ''' # ADD YOUR CODE HERE TO EXTRACT THE ABOVE INFORMATION ABOUT THE # HIGHEST RATED TV-SERIES # NOTE: FOR THIS EXERCISE YOU ARE ALLOWED (BUT NOT REQUIRED) TO IGNORE # UNICODE CHARACTERS AND SIMPLY LEAVE THEM OUT OF THE OUTPUT. shows = [] showinfo = {} # gets the title, rating, genre, actors and runtime, puts those in a dict # and puts the dict in an array for e in dom.by_tag("div.lister-item-content"): # gets the actor names for a in e.by_tag("p"): if "Stars" in a.content: info = plaintext(a.content).encode("utf-8").replace("Stars:", "").strip() # gets the title, rating, genre and runtime title = plaintext((e.by_tag("a")[:1])[0].content) rating = plaintext((e.by_tag("strong"))[0].content) genre = plaintext((e.by_class("genre"))[0].content) actors = info runtime = plaintext((e.by_class("runtime"))[0].content).replace("min", "").strip() # creates a dict with the different values showinfo = { "title": title, "rating": rating, "genre": genre, "actors": actors, "runtime": runtime, } # appends the info into the shows array shows.append(showinfo) return shows