Example #1
def scrape_links_and_wordlistify(links, lower=False, verbose=1):
    import nltk
    import requests
    import string
    raw = ''
    wordlist = {}
    for site in links:
        try:
            if verbose == 1:
                print '[+] fetching data from: ', site
            if site.find('http://pastebin.com/') == 0:
                raw = requests.get(site.replace('http://pastebin.com/', 'http://pastebin.com/raw.php?i=')).content
            else:
                raw = requests.get(site).content
            if lower == False:
                l = string.translate(nltk.clean_html(raw), string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
            else:
                l = string.lower(nltk.clean_html(raw))
                l = string.translate(l, string.maketrans(string.punctuation, ' ' * 32)).split()
                freq_an(l, wordlist)
        except:
            if verbose == 1:
                print '[-] Skipping url: ', site
    return wordlist
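Note: nltk.clean_html() was removed in NLTK 3.x (it now just raises an error pointing to BeautifulSoup), so the snippets on this page only run against NLTK 2 on Python 2. As a point of reference, here is a rough Python 3 sketch of the same fetch-and-wordlistify flow with BeautifulSoup (bs4) standing in for clean_html(); the freq_an() helper is the same external function the original example relies on, and the function name below is just for illustration.

import string
import requests
from bs4 import BeautifulSoup

def scrape_links_and_wordlistify3(links, lower=False, verbose=1):
    # Rough Python 3 / bs4 equivalent of Example #1 above.
    wordlist = {}
    # Map every punctuation character to a space, as string.maketrans did above.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    for site in links:
        try:
            if verbose == 1:
                print('[+] fetching data from:', site)
            if site.startswith('http://pastebin.com/'):
                site = site.replace('http://pastebin.com/', 'http://pastebin.com/raw.php?i=')
            text = BeautifulSoup(requests.get(site).content, 'html.parser').get_text()
            if lower:
                text = text.lower()
            freq_an(text.translate(punct_to_space).split(), wordlist)  # freq_an as in the original
        except Exception:
            if verbose == 1:
                print('[-] Skipping url:', site)
    return wordlist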
Example #2
 def extract_content(self,raw):
   logging.info('Processor.extract_content')
   
   soup = BeautifulSoup(raw)
   cable_table = soup.find("table", { "class" : "cable" })
   cable_id = cable_table.findAll('tr')[1].findAll('td')[0]\
     .contents[1].contents[0]
   if db.cables.find_one({'_id':cable_id}):
     self.counts['files_not_processed'] = self.counts['files_not_processed'] + 1
     logging.info('Processor.extract_content["CABLE ALREADY EXISTS"]')
     self.print_counts()
     return
     
   cable = Cable(raw)
   cable['_id'] = cable_id
   cable['reference_id'] = cable_id
   cable['date_time'] = cable_table.findAll('tr')[1].findAll('td')[1]\
     .contents[1].contents[0]
   cable['classification'] = cable_table.findAll('tr')[1].findAll('td')[2]\
     .contents[1].contents[0]
   cable['origin'] = cable_table.findAll('tr')[1].findAll('td')[3]\
     .contents[1].contents[0]
   cable['header'] = nltk.clean_html(str(soup.findAll(['pre'])[0]))
   cable['body'] = nltk.clean_html(str(soup.findAll(['pre'])[1]))
   
   db.cables.insert(cable.get())
   
   self.counts['files_processed'] = self.counts['files_processed'] + 1
   
   self.print_counts()
   
   if (self.counts['files_processed'] + self.counts['files_not_processed'])\
     == self.counts['files_to_process']:
     self.dump_json()
Example #3
    def process_feed(self, entries):
        abbr = self.abbr
        feed_entries = db.feed_entries
        third = itemgetter(2)

        # Find matching entities in the feed.
        for entry, matches in self.scan_feed(entries):                    
            matches = self.extract_entities(matches)

            ids = map(third, matches)
            strings = [m.group() for m, _, _ in matches]
            assert len(ids) == len(strings)

            # Add references and save in mongo.
            
            entry['state'] = abbr # list probably wiser
            entry['entity_ids'] = ids or None
            entry['entity_strings'] = strings or None
            entry['save_time'] = datetime.datetime.utcnow()
            entry['_id'] = new_feed_id(entry)
            entry['_type'] = 'feedentry'

            entry['summary'] = nltk.clean_html(entry['summary'])
            try:
                entry['summary_detail']['value'] = nltk.clean_html(
                    entry['summary_detail']['value'])
            except KeyError:
                pass
            
            feed_entries.save(entry)
            msg = 'Found %d related entities in %r'
            self.logger.info(msg % (len(ids), entry['title']))
Example #4
def scrapeBlog(url, depth): # obs hackkkkkkkkk
    allText = ""
    pages = getPages(url)
    pages = pages[(depth+1):] # take the rest
    posts = []
    timestamps = []
    
    for url in pages:
        response = getContent(url)
        repls = ('januari', 'january'), ('februari', 'february'), ('mars', 'march'), ('maj', 'may'), ('juni', 'june'), ('juli', 'july'), ('augusti', 'august'), ('oktober', 'october')
        response = reduce(lambda a, kv: a.replace(*kv), repls, response.lower())
        
        soup = BeautifulSoup(response)
        
        
        try:
            poststext = soup.select(".blogposttext") # get posts text
            poststext = [nltk.clean_html(unicode(post)) for post in poststext]
            postsdatetime = soup.select(".blogpostheaderdate")
            
            postsdatetime = [nltk.clean_html(unicode(post)) for post in postsdatetime]
            postsdatetime = [parse(post, fuzzy=True) for post in postsdatetime]
            
            posts.extend(poststext[0:len(postsdatetime)])
            timestamps.extend(postsdatetime)
        except:
            pass
        #allText = allText + "\n\n" + getAllText(url)
    
    return posts, timestamps
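The reduce() call in Example #4 simply applies each (Swedish, English) month-name substitution in turn so that dateutil's fuzzy parse() can read the post dates. A minimal stand-alone sketch of that pattern, on a made-up input string:

from functools import reduce  # built in on Python 2, needs the import on Python 3

repls = ('januari', 'january'), ('maj', 'may')
s = '3 Maj 2013'
print(reduce(lambda a, kv: a.replace(*kv), repls, s.lower()))  # -> '3 may 2013'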
Example #5
    def parse(self, fname):
        try:
            with open(fname, "r") as f:
                log.info("Process %s" % fname)
                soup = BeautifulSoup(f.read())
                tbl = soup.find("table", { "class" : "cable" })
                docid = tbl.findAll('tr')[1].\
                        findAll('td')[0].contents[1].contents[0]

                if docid in self.docids:
                    return True

                doc = {
                        "_id": docid,
                        "refererence_id": docid,
                        "date_time": tbl.findAll('tr')[1].\
                                findAll('td')[1].contents[1].contents[0],
                        "classification": tbl.findAll('tr')[1].\
                                findAll('td')[2].contents[1].contents[0],
                        "origin": tbl.findAll('tr')[1].\
                                findAll('td')[3].contents[1].contents[0],
                        "header":nltk.clean_html(str(soup.findAll(['pre'])[0])),
                        "body": nltk.clean_html(str(soup.findAll(['pre'])[1]))
                }
                
                return doc

        except OSError:
            log.error("Can't open '%s'" % fname)
            self.processed -= 1
Example #6
def getKeyList(testID):
    myDataQ = getData(testID,1)
    myDataA = getData(testID,0)

    userKeyQ = getUserAnnotate(myDataQ)
    userKeyA = getUserAnnotate(myDataA)

    myCodeListQ = getCodeList(myDataQ)
    myCodeListA = getCodeList(myDataA)
    myHtml = getHTML(testID)
    
    t1 = []
    packQ = []
    funcQ = []
    for item in myCodeListQ:
        try:
            p,f = cparPack(nltk.clean_html(item))
            packQ += p 
            funcQ += f
        except SyntaxError:
            pass
        t1 += preProCode(item)
    fQ,aQ,vQ,cQ = cparFuncs(t1) 
    packQ,funcQ = cparPack(t1)
    fQ = list(set(fQ))
    aQ = list(set(aQ))
    vQ = list(set(vQ))
    cQ = list(set(cQ))

    combQ = []
    for cItem in cQ:
        for fItem in fQ:
            combQ.append(cItem+"."+fItem) 

    t2 = []
    packA = []
    funcA = []
    for item in myCodeListA:
        try:
            p,f = cparPack(nltk.clean_html(item))
            packA += p 
            funcA += f
        except SyntaxError:
            pass
        t2 += preProCode(item)
    fA,aA,vA,cA = cparFuncs(t2) 
    fA = list(set(fA))
    aA = list(set(aA))
    vA = list(set(vA))
    cA = list(set(cA))

    combA = []
    for cItem in cA:
        for fItem in fA:
            combA.append(cItem+"."+fItem) 

    keyList = \
    list(set(fQ+fA+aQ+aA+vQ+vA+cQ+cA+combQ+combA+packQ+packA+funcQ+funcA+userKeyQ+userKeyA))

    return keyList
Example #7
  def parse_file(self, filepath):
    """
    Parses a corpus file and initialize the object.
    
    @param  filepath: The path of the corpus file to parse.
    @type   filepath: C{string}
    """

    html_file = codecs.open(filepath, "r", "utf-8")
    raw_html = html_file.read()
    body = raw_html.split("<body>",1)[1]
    raw_content = nltk.clean_html(body.split("</h1>", 1)[1])

    self.set_title(nltk.clean_html(body.split("</h1>", 1)[0]).strip() + ".")
    
    content = ""
    for p in raw_content.split("\n"):
      p = p.strip()

      if p != "":
        if content != "":
          content += " "
        content += p
    content = content.split("-", 1)[1].replace(u"\u202F", " ").strip()

    self.set_content(content)

    html_file.close()
Example #8
def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html,from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    ptags = soup.find_all("p")
    text = nltk.clean_html("{0}".format(ptags[2]))
    return title,text
Example #9
def getarticle(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html,from_encoding="utf-8")
    titletag = soup.find("h2")
    title = nltk.clean_html("{0}".format(titletag))
    storytag = soup.findAll('div',{'class':None})[1]
    text = nltk.clean_html("{0}".format(storytag))
    return title,text
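Examples #8 and #9 render each BeautifulSoup tag back to markup with str.format() and then strip it again with nltk.clean_html(). For comparison, a sketch of the more direct route through the tag's own get_text() method (assuming bs4; the function name and the choice of ptags[2], mirroring Example #8, are just for illustration):

def getarticle_direct(url):
    # Let BeautifulSoup extract the text instead of round-tripping each tag
    # through nltk.clean_html().
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, from_encoding="utf-8")
    title = soup.find("h2").get_text()
    text = soup.find_all("p")[2].get_text()
    return title, text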
Example #10
    def extrait(self, rss):
	d = feedparser.parse(rss)
	h = random.randint(0, len(d['entries']) -1)
	print h
	print str(len(d['entries']))
	titre = nltk.clean_html(d['items'][h].title)
	descriptionb = nltk.clean_html(d['items'][h].description)
	description = re.sub("&#(\d+);", lambda m: chr(int(m.group(1))), descriptionb)
	return titre+". \n\n"+description
Example #11
def preprocess_hotel_review(file_contents, file_contents_test):
    """
    Hotel review preprocess and truthfulness of the hotel review
    :param file_contents:
    :param file_contents_test:
    """
    raw = clean_html(file_contents)
    raw = re.sub(r'IsTruthFul,IsPositive,review', "", raw)
    sentence_list = tokenize.line_tokenize(raw)
    print sentence_list
    truth_sentences = []
    false_sentences = []
    for sentence in sentence_list:
        sent_arr = re.split(r',', sentence)
        try:
            is_truthful = int(sent_arr[0])
        except ValueError:
            print "is_truthful is not an integer"
            continue

        if is_truthful == 1:
            truth_sentences.append(sent_arr[2])
        elif is_truthful == 0:
            false_sentences.append(sent_arr[2])

    truth_uni_prob_dict, truth_bi_prob_dict = process_prob(" ".join(truth_sentences))
    false_uni_prob_dict, false_bi_prob_dict = process_prob(" ".join(false_sentences))

    raw_test = clean_html(file_contents_test)
    raw_test = re.sub(r'IsTruthFul,review', "", raw_test)
    sentence_list_test = tokenize.line_tokenize(raw_test)
    test_list = []
    test_truth_false_list = []
    truth_count = false_count = i = 0
    for sentence in sentence_list_test:
        sent_arr = re.split(r',', sentence)
        truth_uni_perplex, truth_bi_perplex = perplexity(sent_arr[1], truth_uni_prob_dict, truth_bi_prob_dict)
        false_uni_perplex, false_bi_perplex = perplexity(sent_arr[1], false_uni_prob_dict, false_bi_prob_dict)
        test_list.append((sent_arr[1], truth_bi_perplex, false_bi_perplex))
        truth_or_false = 1 if truth_bi_perplex < false_bi_perplex else 0
        #truth_or_false = 1 if truth_uni_perplex < false_uni_perplex else 0
        if truth_or_false:
            truth_count += 1
        else:
            false_count += 1
        test_truth_false_list.append([i, truth_or_false])
        i += 1

    import csv

    with open("kaggle_sharp.csv", "wb") as f:
        writer = csv.writer(f)
        writer.writerows([['Id', 'Label']])
        writer.writerows(test_truth_false_list)
    print test_list
    print test_truth_false_list
    print truth_count
    print false_count
Example #12
	def __init__(self,directory):
		#get list of all tags that can be simplified into synonym tags
		stf = open(directory+"tags_synonym.csv", 'r') #converting each tag to its hypernym
		rdr= csv.reader(stf)
		for r in rdr:  
			#r[0]=tag  r[1]=tag it should be replaced with
			self.synonym_tags[r[0]]=r[1]
		stf.close()

		tf=open(directory+"tags.csv", 'r') #assign a weight for each tag
		rdr=csv.reader(tf)
		for r in rdr:
			tmp=r[0].split(';') #tmp[0]=tag      tmp[1]=frequency
			self.tags[tmp[0]]=float(1/float(tmp[1]))
		tf.close()

		for tmp in self.tags:
			t=tmp.split('-')
			if len(t)>1:
				t2=tmp.replace('-',' ')
				#print t2
				if t[0] not in self.complex_tags:
					self.complex_tags[t[0]]=[]

				self.complex_tags[t[0]].append(t2)
				#self.complex_tags_replacements[t[0]]=tmp
				self.complex_tags_replacements[t2]=tmp

		qf=open(directory+"Questions&Answers&Tags.csv",'r')
		rdr=csv.reader(qf)
		for r in rdr: #r[0]:question title r[1]=question title r[2]: best answer r[3]: tags
			if r[0][len(r[0])-1] not in ['!','?','.']:
				r[0]=r[0]+'.'
			r[1]=nltk.clean_html(r[1])
			r[2]=nltk.clean_html(r[2])
			r[0]=r[0]+' '+r[1]
			self.questions.append(r[0])
			self.answers.append(r[1])
			n=len(self.questions)-1
			r[3]=r[3].replace('<','')
			r[3]=r[3].replace('>',' ')
			tmplist=r[3].split(' ')
			for t in tmplist:
				if t in self.synonym_tags:
					r[3]=r[3].replace(t,self.synonym_tags[t])

			tmplist=r[3].split(' ')
			tmplist.pop()
			self.tagsInQuestions[n]=tmplist
			for t in tmplist:
				if t not in self.questionsForTags:
					self.questionsForTags[t]=[]
				self.questionsForTags[t].append(n)

		qf.close()
Example #13
def index():
  steps = Step.query.order_by(Step.num_de_paso)
  for step in steps:
    if step.tipo_de_tramite:
      step.tipo_de_tramite = clean_html(step.tipo_de_tramite)
    if step.requisitos:
      step.requisitos = clean_html(step.requisitos)
    if step.consideraciones:
      step.consideraciones = clean_html(step.consideraciones)
    if step.preguntas_frecuentes:
      step.preguntas_frecuentes = clean_html(step.preguntas_frecuentes)
  return render_template('index.html', steps=steps)
Example #14
def autos_us():
    html = open('autos-us.html').read()
    soup = BeautifulSoup(html)
    first = soup.find('li').contents[0]
    second = first.parent.next_sibling.next_sibling.contents[0]
    third = second.parent.next_sibling.next_sibling.contents[0]
    majors = [first, second, third]
    minors = soup.select('ul li ul li')
    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    minor_tokens = [re.sub(r'\s\([\S\s]+\)|\[\s\S\s\]|\n\s[A-Za-z]+', r'', token) for token in minor_tokens]
    tokens = list(set(major_tokens + minor_tokens))
    return tokens
Example #15
def gasPrices(origin, destination):
	one_way_cost = ''
	from_address = origin
	to_address = destination
	new_from_address = from_address.replace(" ", "+")
	new_to_address = to_address.replace(" ", "+")
	url = "http://www.travelmath.com/cost-of-driving/from/" + new_from_address + "/to/" + new_to_address
	html = urllib.urlopen(url)
	for line in html:
		if "costofdriving" and "$" in line:
			one_way_cost = nltk.clean_html(line.split("one-way")[0].replace("$", ""))
			round_trip_cost = nltk.clean_html(line.split("one-way")[1].replace("round trip", "").replace("$", "")).replace('/ ', "")
			break
	return one_way_cost
Example #16
def invent_ext(htmlString):
    start = htmlString.find("Inventors:")
    end = htmlString.find("Assignee:")
    end2 = htmlString.find("Appl. No.:")
    if start == -1:
        extract = "No Inventors Listed"
    else:
        if end == -1:
            extract = htmlString[start+11:end2]
            extract = nltk.clean_html(extract)
        else:
            extract = htmlString[start+11:end]
            extract = nltk.clean_html(extract)
    
    return extract
Example #17
def webUrl(fullUrl):
    #urllib2 works best with a specific url format
    validUrl = re.compile(
        r'^(?:http)s?://|' # http:// or https://
        r'^(?:http)s?://www.'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    if validUrl.match(fullUrl):
        finalList = []
        urlInput = quote(fullUrl, safe="%/:=&?~#+!$,;'@()*[]")
        urlInput = urlInput.strip('%0A')
        try:
            u = urlopen(urlInput)
            html = u.read()
            raw = nltk.clean_html(html)
            tokens = nltk.word_tokenize(raw)
            if args.minLength or args.maxLength:
                for token in tokens:
                    if not(len(token.translate(None,charBlacklist)) < minl or len(token) > maxl):
                        wordList.append(str(token).translate(None,charBlacklist))
            else:
                for token in tokens:
                    wordList.append(str(token).translate(None,charBlacklist))
            print "Scraping URL - {0}".format(fullUrl)
        except Exception as e:
            print 'There was an error connecting to or parsing {0}'.format(fullUrl)
            print 'Error: %s' % e
    else:
        print 'INVALID URL - {0}. Format must be http(s)://www.smeegesec.com.'.format(fullUrl)
Example #18
def Create_index_from_url( url, depth ):
    if depth > MAX_DEPTH:
        return []
    url_queue = Queue()
    url_queue.put( url )
    checked = []

    IndexGen = Index_Generator()
    while not url_queue.empty() :

        current_url = url_queue.get()

        checked.append( current_url )

        try:
            html = Get_page( current_url )
        except:
            print "Exception"
            continue
        if depth > 0:
            for link in Link_generator( html ):
                #print link
                if link not in checked:
                    url_queue.put( link )
            depth = depth - 1

        html = nltk.clean_html( html )
        IndexGen.gen_url_index( current_url, html )
        result_index = {}
        result_index = IndexGen.get_index_dict()
        for key in result_index:
            result_index[key].sort()

    return result_index
Example #19
def counts_pages_words(url):
	source = requests.get(url)
	clean = nltk.clean_html(source.text)
	tokens = nltk.word_tokenize(clean) #can make class nltk.text.Text object out of tokens
	tokens = [word for word in tokens if word.lower() not in sw]
	freqdist = nltk.FreqDist(tokens)
	return { "title": source.url.title, "freq_dist": freqdist.items() }
Example #20
    def retrieve_editorial(self, a_url):

        editorial =[]
        # Open URL object
        print a_url, " < url"

        try:

            contents = self.url_read(a_url)

            para_ct = 0
            for para in re.finditer(r'<p>(.*?)</p>', contents, re.DOTALL):
                try:
                    para = para.groups()[0]
                    if dbg: print "para ", len(para)
                    para_ct += len(para)
                    cleaned = nltk.clean_html(para)
                    self.toks = cleaned.split()
                    # self.toks  = nltk.word_tokenize(cleaned)
                    self.toks = [it.lower() for it in self.toks]
                    self.remove_punctuation()
                    if dbg: print(self.toks)
                    editorial.extend(self.toks)
                except Exception, e:
                    print para
                    print e



            print para_ct, 'symbols'
Example #21
File: eval.py Project: nbir/544nlp
def test3():
	import nltk
	from nltk.corpus import conll2000
	from urllib import urlopen

	fname = 'data/dummy/webpages/Abby_Watkins/raw/002/index.html'
	doc = urlopen(fname).read()
	raw = nltk.clean_html(doc)

	decoded = raw.decode('utf-8', errors='ignore')
	raw = decoded.encode('utf-8')
	print raw

	sentences = nltk.sent_tokenize(raw)
	sentences = [s.replace('\n', '').replace('\r', '').strip() for s in sentences]
	sentences = [nltk.word_tokenize(s) for s in sentences]
	sentences = [nltk.pos_tag(s) for s in sentences]
	#porter = nltk.PorterStemmer()
	#sentences = [[(porter.stem(w[0]), w[1]) for w in s] for s in sentences]
	#sentences = [[w[0] for w in s] for s in sentences]
	#sentences = [['%s_%s' % w for w in s] for s in sentences]


	lexicon = []
	#for s in sentences:
		#print len(s)
		#for w in s:
		#	print w[0]
		#print ' '.join(w[0] for w in s)
		#print nltk.ne_chunk(s, binary=True)

		#lexicon.extend(s)
	fdist = nltk.FreqDist(lexicon)
Example #22
def get_xmen_text(soup):
    
    #en_stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    #nes = [nltk.ne_chunk(pos, binary=True) for pos in poss]
    #for pos in poss: print pos
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print poss_filter
    nes = [nltk.ne_chunk(pos, binary=True) for pos in poss_filter]
    
    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees (lambda s: s.node == node)]
    
    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people
              for subsublist in sublist
              for subsubsublist in subsublist
              for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person!='Magneto')
    fd.plot(50)
Example #23
def fetchWebPAge(url):
	try:
		req = urllib2.Request(url)
		response = urllib2.urlopen(req,timeout=1)
		content = response.read()
		urlDownloaded.append(url)
		print url
		soup = BeautifulSoup(content)
		raw = nltk.clean_html(content)
		tokenized = nltk.word_tokenize(raw)
		priority = get_priority(url,content,tokenized)

		br = mechanize.Browser()

		br.open(url)

		for link in br.links():
			new_url = urlparse.urljoin(link.base_url,link.url)
			base = urlparse.urlparse(new_url).hostname
			path = urlparse.urlparse(new_url).path
			finalUrl = "http://"+base+path
			if not pat.search(finalUrl):
				if finalUrl not in urlVisited:
					urlQueue.put(finalUrl)
					urlVisited.append(finalUrl)

	except socket.timeout, e:
	    raise MyException("[TIMEOUT ERROR]:: %r" % e)
Example #24
def obtenerNoticias():
    # Return value
    todas_noticias = []

    # Process each feed source
    for fuente in fuentes:
        resultado = Noticias()
        todas_noticias.append(resultado)
        # Download the feed
        noticias = feedparser.parse(fuente)
        titulo_fuente = noticias['feed']['title']

        # Iterate over the news items
        for noticia in noticias['entries']:
            noticia_titulo =  noticia['title'].encode('UTF-8', 'replace')
            noticia_resumen = nltk.clean_html(noticia['summary']).encode('UTF-8', 'replace')
            noticia_enlace = noticia['link'].encode('UTF-8', 'replace')

            resultado.insertar(noticia_titulo, noticia_resumen, noticia_enlace)

    resultado = Noticias()

    longitud = 0

    for fuente in todas_noticias:
      longitud = max(longitud, fuente.longitud())

    for i in range(longitud):
      for fuente in todas_noticias:
        if (i >= fuente.longitud()):
          continue

        resultado.insertar(fuente.obtener(i)[0], fuente.obtener(i)[1], fuente.obtener(i)[2])

    return resultado
Example #25
def format_text_for_NER(raw_text, social_web_platform=None):
    """ Prepares the given text for named entity extraction. Minimal 
    processing performed in order to remove line breaks, links, etc
    rather than more substantial formatting like porting or stemming that
    would interfere with a NER toolkit's ability to recognize entities. """
    
    ''' remove line breaks '''
    cleaned_text = raw_text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ') 
    
    ''' remove html '''
    cleaned_text = nltk.clean_html(cleaned_text)
    
    ''' remove links (www.* or http*) '''
    cleaned_text = re.sub('((www\.[\s]+)|(https?://[^\s]+))','', cleaned_text)
    
    ''' replace double quotes with single quotes to avoid a Wikipedia Miner error '''
    cleaned_text = cleaned_text.replace("\"", "\'")

    ''' remove non-printable characters '''
    cleaned_text = filter(lambda x: x in string.printable, cleaned_text) 
    
    ''' clean any social web platform specific text '''
    if social_web_platform != None:
        cleaned_text = social_web_platform.clean_text(cleaned_text)
    
    ''' remove misc. remnant strings we don't care about '''
    words_manually_filter = []
    cleaned_text = ' '.join([word for word in cleaned_text.split() 
                             if not word in words_manually_filter])
    
    return cleaned_text
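An illustrative call on made-up text, showing the pipeline end to end (exact whitespace may differ, since clean_html() collapses it):

raw = 'Visit <a href="http://example.com/x">our page</a>\nsee "details" at http://example.com/y'
print(format_text_for_NER(raw))
# -> roughly: Visit our page see 'details' at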
Example #26
 def parse_detail(self,response):
       item = TbsItem()
       headers = response.headers
       self.set_items_value(item,'character',self.get_page_character(response.body))
       self.set_items_value(item,'crawl_stats',self.default_crawl_stats)
       self.set_items_value(item,'searchkeywords',self.keyword)
       self.set_items_value(item,'spiderid',self.name)
       self.set_items_value(item,'refer',response.meta['refer'])
       self.set_items_value(item,'url_hash_no_fragment', self.get_url_hash_no_fragment(response.url))
       self.set_items_value(item,'url', self.parseurl(response.url))
       self.set_items_value(item,'root_domain',urlparse(response.url).hostname)
       self.set_items_value(item,'Expires',self.to_GMT_timestamp(headers['Expires']) if 'Expires' in headers.keys() else self.to_GMT_timestamp(None))
       self.set_items_value(item,'LastModified',self.to_GMT_timestamp(headers['Last-Modified']) if 'Last-Modified' in headers.keys() else self.to_GMT_timestamp(None))
       try:
            hxs = HtmlXPathSelector(response)
            self.set_items_value(item,'title',','.join(hxs.select('//title/text()').extract()))
            self.set_items_value(item,'desc',','.join(hxs.select('//meta[@name="description"]/@content').extract()))
            self.set_items_value(item,'keyword',','.join(hxs.select('//meta[@name="keywords"]/@content').extract()))
       except:
            self.set_items_value(item,'title',' ')
            self.set_items_value(item,'desc',' ')
            self.set_items_value(item,'keyword',' ')
       self.set_items_value(item,'body',response.body)
       self.set_items_value(item,'stripedbody',nltk.clean_html(self.strip_body(response.body)))
       return item
Example #27
def create_feedset(feed_seq):
   ''' Call on Google Reader with subscription request
   and create a set of (title, link) pairs: a Feed Set '''
   import nltk
#   import pickle
#   f = open('/home/crc/tmp/apollo.pkl', 'rb')
#   feedset = pickle.load(f)
#   f.close()

   pat = re.compile('http://.*$')
   feedset = []
   for eachfeed in feed_seq:
      feed_str = eachfeed.id()
      result = pat.search(feed_str)
      if result is not None:
         logging.info("Refreshing %d from '%s'...", 
                      eachfeed.unread_count(), 
                      result.group())
      eachfeed.refresh()
      logging.info("Parsing...")
      pipe_feed = eachfeed.parse()

      for entry in pipe_feed.entries:
         title = nltk.clean_html(entry.title)
         # actually want 'id' here in order to Edit
         feedset.append((title, entry.id))

   logging.info("Done")
   return feedset
Example #28
def scrapePage(url):
    # Extract page text from a web URL (ignoring navigation links, ads, etc.).
    try:
        print "URL: "+ url
        #url=url.replace('(','%28')
        #url=url.replace(')','%29')
        #print "New URL:"+url
        result = alchemyObj.URLGetText(url)
        
        soup = BeautifulSoup(result)
        raw = soup('text')
        raw = [text.text for text in raw]
        
        rawstr = ' '.join(raw)
        
    except Exception:
        try:
            print "\n\nscraping using regex"
            webpage = urllib2.urlopen(url).read()
    #webpage = str(webpage)
            para= re.compile('<p>(.*)</p>') #collect data in p tags and store in para object
            raw = re.findall(para , webpage)
            rawstr = ' '.join(raw) 
            clean_raw = nltk.clean_html(rawstr)
            rawstr=clean_raw
        except Exception:
            rawstr = "Web page could not be scraped..."


    print rawstr
    return rawstr
def fcount(url):
	import urllib2
	import nltk
	import re
	rcount=0
	try:
		rpage=urllib2.urlopen(url).read()
	except:
		return rcount
	
	tbegin=rpage.find("user-rating")
	tend=rpage.find("review-list")
	temp=rpage[tbegin:tend]

	tbegin=temp.find(" Write a Review")
	tend=temp.find("review-list")
	temp=temp[tbegin:tend]

	tbegin=temp.find("of")
	tend=temp.find("review-list")
	temp=temp[tbegin:tend]

	tbegin=temp.find("of")
	tend=temp.find("review-list")
	temp=temp[tbegin:tend]

	tbegin=temp.find("<strong")
	tend=temp.find("</strong>")
	temp=temp[tbegin:tend]

	temp=nltk.clean_html(temp)
	rcount=temp
	rcount=re.sub(r'[^0-9]','',rcount)
	return int(rcount)
 def extractchunk(tweettuple):
     sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(w))) for (a,w) in tweettuple]
     cid = [str(a) for (a,w) in tweettuple]
     tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
     pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
     ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
     return dict(zip(cid, ne_chunks))
tokens = nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[80:110]

text = nltk.Text(tokens)
text.collocations()

# Online articles

url = "http://www.bbc.co.uk/news/science-environment-21471908"
#Getting text out of HTML is a sufficiently common task that NLTK provides a helper function nltk.clean_html(), which takes an HTML string and returns raw text.
html = urlopen(url).read()
html[:60]
raw = nltk.clean_html(html)
tokens = nltk.word_tokenize(raw)
tokens[:15]

#Processing RSS Feeds: import feedparser

# Reading local files

f = open('C:\Data\Files\UK_natl_2010_en_Lab.txt')
raw = f.read()
print raw[:100]

# User input
s = raw_input("Enter some text: ")

#Regular expressions applications. Find and count all vowels.
            #else:
                #print "skipped twitter"
                
            #searchset.append('next') # for testing a new way out.

#print " List of Seach Result Pages = %s" % searchset
#exit()

row_data1 = []

# Creating content array of all the pages returned from Google 
for testurls in searchset:
	filename1=myopener.open(testurls).read()
	readable_data1= Document(filename1).summary()
	# Removing the HTML tags from the web page for processing
	tempval = nltk.clean_html(readable_data1)
	row_data1.append(tempval)
	#print datetime.datetime.now() - t0 

#print row_data1
result = []
temp_result=[_getAnswer("",row_data,node) for row_data in row_data1]

#print "Result Set = %s "% result

word_freq = {}

# Count the frequency of occurrences of results from all pages
for word in result:
    word_freq[word] = word_freq.get(word, 0) + 1
Example #33
from __future__ import division
import nltk, re, pprint

# 3.1 Accessing e-books from the web
from urllib.request import urlopen
url = "http://www.gutenberg.org/files/2554/2554.txt"
raw = urlopen(url).read()
print(len(raw))
# Read a web file through a specified proxy
proxies = {'http': 'http://www.someproxy.com:3128'}
raw = urlopen(url, proxies=proxies).read()
# Tokenization (NLTK's built-in regex tokenizer)
tokens = nltk.word_tokenize(raw)  # produces the familiar structure: a list of words and punctuation
text = nltk.Text(tokens)
# Find the start and end positions of the text.
raw.find("PART I")
raw.rfind("End of Project Gutenberg's Crime")

# 3.2 Processing HTML
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
raw = nltk.clean_html(html)  # returns the raw text
tokens = nltk.word_tokenize(raw)
# Find the start and end to get the text we need
# 2 Processing search engine results

# 3.3 Stemmers (built-in / custom regex)

# 3.4 Lemmatization
Example #34
def create_features(X, user_data=None):
    res = []

    for date, comment, user in X:
        feat = {}
        has_hate_word = has_drug_word = has_cult_word = has_occult_word = has_porn_word = 0
        has_fwenzel_word = 0
        has_swastika = swastika in comment

        comment = comment.lower()

        comment = parse_text(comment)

        comment = nltk.clean_html(comment)

        sents = sent_tokenize(comment)
        doc = []
        for sent in sents:
            # Tokenize each sentence.
            doc += wordtokenizer.tokenize(sent)

        def repl_filter(x):
            return x.lower() not in ["nl", "nl2", "nbsp", "nbsp2", "dummyhtml"]

        # Remove stopwords and replacement tokens.
        doc = filter(repl_filter, doc)

        for i, word in enumerate(doc):
            if doc[i] in bad_words:
                doc[i] = '_badword_'

            doc[i] = ps.stem(doc[i])

            doc[i] = wnl.lemmatize(doc[i])

            if doc[i] in bad_words:
                doc[i] = '_badword_'

            if doc[i] in hate_words:
                has_hate_word = 1
            if doc[i] in drug_words:
                has_drug_word = 1
            if doc[i] in cult_words:
                has_cult_word = 1
            if doc[i] in occult_words:
                has_occult_word = 1
            if doc[i] in porn_words:
                has_porn_word = 1
            if doc[i] in fwenzel_words:
                has_fwenzel_word = 1

        bigram_finder = BigramCollocationFinder.from_words(doc)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n=5)

        bigram = dict([(ngram, True)
                       for ngram in itertools.chain(doc, bigrams)])

        feat.update(bigram)

        text_vocab = set(w for w in doc if w.isalpha())
        unusual = text_vocab.difference(english_vocab)
        unusual_ratio = len(unusual) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        unusual2 = unusual.difference(set("_badword_"))
        unusual_ratio2 = len(unusual2) / len(text_vocab) if len(
            text_vocab) != 0 else -1.0

        if user_data is not None:
            user_info = user_data[user]

        has_bad_word = True
        for word in bad_words:
            if word in comment.lower():
                break
        else:
            has_bad_word = False

        def n_none(x):
            return int(x) if x is not None else 0

        def c_none(x):
            return x if x is not None else "__None__"

        readability = ReadabilityTool(comment)

        read_feat = {}
        for f, val in readability.analyzedVars.items():
            if f != 'words':
                read_feat["_" + f] = val
        for test, val in readability.tests_given_lang['eng'].items():
            read_feat["__" + test] = val(readability.text)

        feat['_always_present'] = True
        feat['_word_num'] = len(doc)
        feat['_sent_num'] = len(sents)
        feat['_word_var'] = len(set(doc)) / len(doc) if len(doc) != 0 else -1.0
        feat['_sent_var'] = len(set(sents)) / len(sents)
        feat['_unusual_ratio'] = unusual_ratio
        feat['_unusual_ratio2'] = unusual_ratio2
        if user_data is not None:
            feat['_username'] = user
            feat['_user_subcount'] = int(user_info['SubscriberCount'])
            feat['_user_friends'] = int(user_info['FriendsAdded'])
            feat['_user_favs'] = int(user_info['VideosFavourited'])
            feat['_user_videorates'] = int(user_info['VideosRated'])
            feat['_user_videouploads'] = int(user_info['VideosUploaded'])
            feat['_user_videocomments'] = int(user_info['VideosCommented'])
            feat['_user_videoshares'] = int(user_info['VideosShared'])
            feat['_user_usersubs'] = int(user_info['UserSubscriptionsAdded'])
            feat['_user_gender'] = c_none(user_info['Gender'])
            feat['_user_age'] = n_none(user_info['Age'])
            feat['_user_closed'] = user_info['UserAccountClosed']
            feat['_user_suspended'] = user_info['UserAccountSuspended']
            feat['_user_has_gender'] = 1 if user_info[
                'Gender'] is not None else 0
            feat['_user_has_school'] = 1 if user_info[
                'School'] is not None else 0
            feat[
                '_user_has_books'] = 1 if user_info['Books'] is not None else 0
            feat['_user_has_movies'] = 1 if user_info[
                'Movies'] is not None else 0
            feat[
                '_user_has_music'] = 1 if user_info['Music'] is not None else 0
            feat['_user_has_location'] = 1 if user_info[
                'Location'] is not None else 0
            feat['_user_has_hometown'] = 1 if user_info[
                'Hometown'] is not None else 0
    #        feat['_user_last'] = user_info['LastWebAccess']

    # Dictionary features
        feat['_has_bad_word'] = has_bad_word
        #        feat['_has_hate_word'] = has_hate_word
        #        feat['_has_drug_word'] = has_drug_word
        feat['_has_cult_word'] = has_cult_word
        feat['_has_swastika'] = has_swastika
        #        feat['_has_occult_word'] = has_occult_word
        #        feat['_has_has_fwenzel_word'] = has_fwenzel_word
        feat['_has_porn_word'] = has_porn_word
        feat['_has_swastika'] = has_swastika
        feat.update(read_feat)

        #        print feat
        res.append(feat)
    return res
Example #35
import nltk
from nltk import word_tokenize
from urllib import request

# change the path to where the nltk data is being stored
nltk.data.path.append('/Users/zhi/Documents/Programming/PROJECTS_Python/data')

# html
html = urlopen(url).read()  # download web page
raw = nltk.clean_html(html)  # strip remaining html
raw = raw[750:23506]  # trim to desired content

# ascii
tokens = nltk.wordpunct_tokenize(raw)  # tokenize the text
tokens = tokens[20:1834]  # select tokens of interest
text = nltk.Text(tokens)  # create nltk text

# vocab
words = [w.lower() for w in text]  # normalize the words
vocab = sorted(set(words))  # build the vocabulary
Example #36
url = 'http://www.cnn.com/2014/07/19/world/europe/ukraine-malaysia-airlines-crash/'

## fetch html
import requests
r=  requests.get(url)
html = r.content

##nltk: fetch text by cleaning html
import nltk
text = nltk.clean_html(html)

##fetch text based on density :useful text
import usefulText as u
text = u.extract_text(html)


## unicode 
text = text.decode('utf-8','ignore')

## segment into sentences
import sys
sys.path.append('../version0.0/')
import segment_sentence as ss


def isProper(sentence):
	if len(sentence) <=5:
		return False

	if '|' in sentence:
		return False
Example #37
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import json
import pandas as pd
import re
import nltk
import jieba

data = json.loads(open('../data/cookbook.json', 'r').read())
df = pd.DataFrame(data)
steps = []
for _s in df['steps']:
    step = []
    for s in _s:
        s = re.sub(r'\r\n', '\n', s)
        s = re.sub(r'\t', '    ', s)
        s = nltk.clean_html(re.sub(r' +', ' ', s)) + '\n'
        step.append(s)
    steps.append(''.join(step))
all_steps = ''.join(steps)

seg_list = jieba.cut(all_steps)
fdist = nltk.FreqDist(seg_list)
for m in fdist:
    print '%s : %s' % (m, fdist[m])
Example #38
    train_header = train_file.next()

    test_file = csv.reader(open("Test.csv", "rb"))
    test_header = test_file.next()

    result_file = open("Result.csv", "w")
    result_file.write('"Id","Tags"\n')

    traindata = []
    testdata  = []
    docs      = []
    
    print "Train Start"
    i = 0
    for data in train_file:
        tokens = re.split(r"\W+", nltk.clean_html(data[2]))
        #tokens = nltk.word_tokenize(nltk.clean_html(data[2]))
        docs.append(tokens)
        i += 1
        if i > 100000:
            break

    print "Make collection start"
    # Make the collection for calculating TF-IDF
    collection = nltk.TextCollection(docs)
    
    print "Testing data start"

    for data in test_file:
        title_tokens = nltk.word_tokenize(data[1])
        tokens = re.split(r"\W+", nltk.clean_html(data[2]))
Example #39
def compute_score(queryphrase="", keywords=[], answers=[], urls=[], scorevalue=0, rangevalue=0, left=False, right=False):

    urls = getGoogleLinks(queryphrase, 3)
    if keywords == []:
        keywords = getKeywords(queryphrase)
    print(keywords)
    keyword = True
    
    combinedtokens = []
    for url in urls:
        html = urlopen(url).read()
        raw = nltk.clean_html(html)
        combinedtokens += nltk.word_tokenize(raw)
    combinedtokens = [t for t in combinedtokens if len(t) > 2 and t.lower() not in ignored_words]
    querytokens = nltk.word_tokenize(queryphrase)

    # only supports two keywords
    if keyword == True:
        instances = {}
        tokenrange = findrange(len(combinedtokens))
        for word in keywords:
            for i in tokenrange:
                if combinedtokens[i] == word:
                    if word not in instances.keys():
                        instances[word] = [i]
                    else:
                        instances[word].append(i)
        combinedinstances = []
        # right now only two keywords are supported
        if len(keywords) != 1 and len(keywords) != 2:
            print "error, number of keywords must be one or two"
            return 4
        if len(keywords) == 2:
            for instanceone in instances[keywords[0]]:
                for instancetwo in instances[keywords[1]]:
                    if (instancetwo - instanceone) < 20 and (instanceone, instancetwo) not in combinedinstances:
                        combinedinstances.append((instanceone, instancetwo))
                    elif (instanceone - instancetwo) < 20 and (instancetwo, instanceone) not in combinedinstances:
                        combinedinstances.append((instancetwo, instanceone))
        # print(combinedinstances)
        relevanttokens = []
        if len(keywords) == 1:
            for instance in instances[keywords[0]]:
                relevanttokens += combinedtokens[instance - rangevalue : instance + rangevalue]
        else:
            for leftinstance, rightinstance in combinedinstances:
                relevanttokens += combinedtokens[leftinstance - rangevalue : rightinstance + rangevalue]
        # print(relevanttokens)
        relevanttokenrange = findrange(len(relevanttokens))
        scores = {}
        for answer in answers:
            answertokens = nltk.word_tokenize(answer)
            length = len(answertokens)
            for i in relevanttokenrange:
                if relevanttokens[i : (i + length)] == answertokens:
                    if answer not in scores.keys():
                        scores[answer] = scorevalue
                    else:
                        scores[answer] += scorevalue
        print scores
        return scores
Example #40
def element_14_snippet_getter(url, keywords=GLOBAL_KEYWORDS):

    """
    Given a URL and keywords, returns a list of text snippets including those keywords.

    Inputs:
    1. url:         str: the desired URL
    2. keywords:    list of str's: defaults to global_keywords defined in main(); the desired keywords

    Output:
    1. snippets:    list of unicode's: the desired text snippets
    """

    # Pulls clean text from the URL, devoid of HTML.
    try:
        html = urlopen(url).read()
    except UnicodeError:
        print "UnicodeError thrown. Skipped offending URL (probably not in English and thus unanalyzable)."
        return []
    soup = BS(html)
    raw = nltk.clean_html(html)

    # We don't want propaganda written by Freescale employees.
    if ("FreescaleTools_and_Software" in raw) or ("GregC" in raw) or ("MAb" in raw):
        return []

    # Finds all HTML subtrees that tell us the date of writing of each post.
    post_data = soup.find_all(class_="j-post-author")

    # Finds all HTML subtrees comprising the posts themselves.
    posts = soup.find_all(class_="jive-rendered-content")

    snippets = []
    tokenizer = ST()
    dates = []
    # Assembles the dates of writing of each post.
    for post_datum in post_data:
        date_posted = date_getter(post_datum)
        dates.append(date_posted)
    # For each post in the page, grabs the text and metadata.
    for i in range(len(posts)):
        text = posts[i].get_text()
        # Splits the text into its individual sentences so that we can pick the ones we like.
        intermediate_snippets = tokenizer.tokenize(text)
        # Grabs text containing keywords, as well as sentences preceding and following those with keywords.
        for j in range(len(intermediate_snippets)):
            for word in keywords:
                snippet = intermediate_snippets[j]
                if (word in snippet.lower()) and ("http" not in snippet.lower()):
                    offset = 1
                    for k in range(j, len(intermediate_snippets)):
                        if word not in intermediate_snippets[k].lower():
                            break
                        else:
                            offset += 1
                    if j == 0:
                        subsnippet = intermediate_snippets[j:offset]
                    else:
                        subsnippet = intermediate_snippets[j-1:j+offset]
                    # Puts the individual sentences back together.
                    tokens = []
                    for sentence in subsnippet:
                        stripped = sentence.lstrip().rstrip().encode("UTF-8")
                        tokens.append(stripped)
                    for token in tokens:
                        try:
                            if len(token) < 1500:
                                snippets.append([token, url, dates[i][0], dates[i][1], dates[i][2]])
                        except IndexError:
                            continue

    return snippets
Example #41
def removeHTML(data):
    """Clean up html"""
    return [nltk.clean_html(dat) for dat in data]
Example #42
def clean_and_tag_all():
    """
    Create new CSV containing tagged versions of all sentences
    """
    # set filepath to input
    basepath = os.path.dirname(__file__)
    file_in = os.path.abspath(
        os.path.join(basepath, '..', 'reuters/csv/single_records.csv'))
    file_out = os.path.abspath(
        os.path.join(basepath, '..', 'reuters/csv/sentences_POS.csv'))

    sentence_splitter = set_up_tokenizer()
    chunker = chunking.set_up_chunker()
    stemmer = nltk.SnowballStemmer('english')

    with open(file_in, 'rb') as csv_in:
        with open(file_out, 'wb') as csv_out:
            csv_reader = csv.DictReader(
                csv_in, ['SOURCE_ID', 'DRUGS', 'COMPANIES', 'SENTENCE'],
                delimiter=',')
            csv_writer = csv.DictWriter(csv_out, [
                'SOURCE_ID', 'SENT_NUM', 'SENTENCE', 'NO_PUNCT', 'DRUGS',
                'COMPANIES', 'POS_TAGS', 'CHUNKS'
            ],
                                        delimiter=',')
            csv_writer.writeheader()
            #csv_reader.next()

            for row in csv_reader:
                # display progress bar
                sys.stdout.write('.')
                sys.stdout.flush()

                # clean up html tags
                # named SENTENCE in the reader so it works nicely when writing row
                plaintext = nltk.clean_html(row['SENTENCE'])
                # this in particular seems to be screwing up some of the sentence splitting
                plaintext = plaintext.replace('Inc .', 'Inc.')
                # split into sentences
                sentences = sentence_splitter.tokenize(plaintext)

                if len(sentences) > 0:
                    for i, s in enumerate(sentences):

                        # TODO integrate stanford NER recognition output into this

                        # clean up sentence
                        s, no_punct = remove_punctuation(s)

                        # CHUNKING - need to include punctuation for this to be anywhere near accurate
                        tokens = nltk.pos_tag(nltk.word_tokenize(s))
                        chunks = chunker.parse(tokens)

                        # POS TAGS - don't want to include punctuation
                        tokens = nltk.word_tokenize(no_punct)
                        # put the hyphens back after tokenisation
                        # underscores mean that the tokens are better recognised when tagging
                        no_punct = no_punct.replace('_', '-')
                        s = s.replace('_', '-')
                        tags = nltk.pos_tag(tokens)

                        # STEMMING - add stemmed version of word to end of each tagged token
                        tags = [(token, tag, stemmer.stem(token.lower()))
                                for (token, tag) in tags]

                        # TODO parse tree info, chunking, something to do with stemming?
                        # ignore any rogue bits of punctuation etc
                        if len(tags) > 1:
                            # write row to file for each sentence
                            new_fields = {
                                'SENT_NUM': i,
                                'SENTENCE': s,
                                'NO_PUNCT': no_punct,
                                'POS_TAGS': tags,
                                'CHUNKS': chunks
                            }
                            row.update(new_fields)
                            csv_writer.writerow(row)

    print 'Written to sentences_POS.csv'
Example #43
from collections import Counter as C
import urllib2
import nltk

urls = []
with open('manisourcehtml1.html', 'r+') as f:
    urls = map(
        lambda x: "http://en.wikipedia.org" + x[13:x.index('" title')],
        filter(lambda x: x.startswith('<td><a href="/wiki/') and '(' not in x,
               f.readlines()))
print urls

import pdb
pdb.set_trace()
c = C()
with open('manibackup1.txt', 'w+') as f:
    for url in urls:
        raw = nltk.clean_html(urllib2.urlopen(url).read())
        if '^' in raw:
            raw = raw[:raw.index('^')]
        raws = raw.split()
        print url
        c.update(C(filter(lambda x:all(map(str.isalpha,x)) and len(x)>3,map(lambda x:str.lower(x),\
        raws))))
    f.write(str(c))
Example #44
		bad.append(pl)
		continue
		
	j=j+1
	if j%10==0: pass#print j
	
	tl = entry.get('tag_list')
	grams = set([x.strip() for x in tl.replace('-',' ').split(',')]) if tl else set()
	
	txt = entry.get('overview')
	# clean tags and text
	#    1. strip eol, apostrophes, numbers, HTML
	#    2. all other punctuation to spaces
	#    3. Break into sentences
	if txt:
		txt2 = nltk.clean_html(txt.replace("\n"," ").encode('ascii','ignore').replace('\\/','/').replace("'",""))
		txt3 = ptn5.sub(" ",ptn4.sub(".",ptn3.sub(" ",ptn2.sub("",txt2))))
		sents = ptn6.split(txt3)
	
		# tokenize sentences
		for sent in sents:
			sent1 = ptn.sub("",sent.lower().replace("."," "))
			sent2 = sent1.split()
			grams.update(set(nltk.bigrams(sent2)))
			grams.update(set(nltk.trigrams(sent2)))
	
#	gramcnt = {}
#	for gram in grams: gramcnt[gram]=gramcnt.get(gram,0)+1
	
	# save (pl,{gram:x,gram:y,gram:z,...})
	cograms.append((pl,list(grams)))
Example #45
                  help='output file')
(options, args) = parser.parse_args()

SAMPLE_URLS = ['http://www.henryklahola.nazory.cz/Vira.htm',
               'http://www.henryklahola.nazory.cz/Snatek.htm',] \
    if not options.sample else options.sample.split(' ')
WORDS = 500 if not options.words else int(options.words)
NGRAM = 3 if not options.bigrams else 2

samples = []
if options.sample:
    for url in SAMPLE_URLS:
        sample = unicode(
            BeautifulSoup(urlopen(url),
                          convertEntities=BeautifulSoup.HTML_ENTITIES))
        samples.append(nltk.clean_html(sample))
elif options.input:
    samples = [open(options.input).read().decode('utf8')]

tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenized = tokenizer.tokenize(' '.join(samples))
warnings.simplefilter("ignore")
model = nltk.NgramModel(NGRAM, tokenized)

starts = model.generate(100)[-2:]
generated = model.generate(WORDS, starts)
out = ' '.join(generated).encode('utf8').replace(' , ',
                                                 ', ').replace(' . ', '. ')
out = '%s%s...' % (out[0].upper(), out[1:])

if options.output:
Example #46
def get_first(url, count):
    raw = nltk.clean_html(urllib.urlopen(url).read())
    return (raw[:count],raw[count:])
Example #47
def get_text(html):
    soup = BeautifulSoup(html)
    text = soup.find('div', id='article_body')
    #print text
    text = nltk.clean_html(str(text))
    return text
Example #48
def clean_up(text):
    return nltk.clean_html(xml.sax.saxutils.unescape(text))
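Example #48 chains the two calls because feed and XML payloads often arrive with the markup itself entity-encoded: unescape() first turns &lt;p&gt; back into a real <p> tag, which clean_html() can then strip. A small illustrative call on a made-up input:

# Hypothetical entity-encoded snippet, e.g. as it might arrive in an XML feed.
encoded = "&lt;p&gt;Ben &amp; Jerry&lt;/p&gt;"
# unescape() gives "<p>Ben & Jerry</p>"; clean_html() then strips the tags,
# leaving roughly "Ben & Jerry".
print(clean_up(encoded))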
Example #49
from urllib import urlopen

import csv 

from nltk.corpus import stopwords


url = "http://hbr.org/2013/04/now-is-our-time/ar/1" 
#keeping this along with other urls


proxies = {'http': 'http://*****:*****@10.1.9.23:8080'} 

raw = urlopen(url, proxies=proxies).read()

cleanraw = nltk.clean_html(raw)


#raw.txt contains the raw text
f = open('raw.txt', 'w')   
f.write(cleanraw) 
f.close()


#tokenize cleanraw
tok_clean = nltk.word_tokenize(cleanraw)


#removing all the smartquotes

clean = []
Example #50
import requests
import json
import nltk
url="http://*****:*****@localhost:7474/db/data/cypher"
url1="http://*****:*****@localhost:7474/db/data/node/"
payload={}
for i in resp:
	elastic=url1+str(i[0])
	payload['link']=uri+str(i[0])
	payload['title']=i[1]
	payload['content']=nltk.clean_html(i[2])
	#print url1
	print elastic
	print payload
	print requests.put(elastic,data=json.dumps(payload)).json()
Example #51
def clean_and_tag():
    """ Create new CSV containing all relevant sentences """

    # set filepath to input
    basepath = os.path.dirname(__file__)
    file_in = 'data/reuters/press_releases/PR_drug_company_500.csv'
    file_in = os.path.abspath(os.path.join(basepath, '..', '..', file_in))
    file_out = os.path.abspath(
        os.path.join(basepath, '..', 'reuters/sentences_POS.csv'))

    # set up sentence splitter with custom parameters
    punkt_params = punkt.PunktParameters()
    # sentences are not split ending on the given parameters, using {} creates a set literal
    punkt_params.abbrev_types = {
        'inc', 'inc ', '.tm', 'tm', 'no', 'i.v', 'drs', 'u.s'
    }
    # the tokenizer has to be unpickled so better do it once here than every time it is used
    sentence_splitter = punkt.PunktSentenceTokenizer(punkt_params)

    with open(file_in, 'rb') as csv_in:
        with open(file_out, 'wb') as csv_out:
            # TO DO use dictionary reader to avoid using magic numbers for columns
            csv_reader = csv.reader(csv_in, delimiter=',')
            csv_writer = csv.writer(csv_out, delimiter=',')

            # write column headers on first row
            row = csv_reader.next()
            row.append('POS TAGS')
            csv_writer.writerow(row)

            for row in csv_reader:
                # use stdout to avoid spaces and newlines
                sys.stdout.write('.')
                # need to flush the buffer to display immediately
                sys.stdout.flush()

                # clean up html tags
                plaintext = nltk.clean_html(row[1])
                drug = row[3]
                company = row[5]
                src = row[0]

                # only consider texts containing both the drug and company
                if drug in plaintext and company in plaintext:
                    sentences = sentence_splitter.tokenize(plaintext)

                    # filter for only sentences mentioning drug, company or both
                    # TO DO coreference resolution to find more relevant sentences
                    sentences = [
                        s for s in sentences if drug in s or company in s
                    ]

                    if len(sentences) > 0:
                        for s in sentences:
                            # remove punctuation, still want to add original sentence to CSV though
                            no_punct = re.findall(r'[\w\$\xc2()-]+', s)
                            no_punct = ' '.join(no_punct)
                            tokens = nltk.word_tokenize(no_punct)
                            tags = nltk.pos_tag(tokens)

                            # TO DO parse tree info, something to do with stemming?
                            # write row to file for each sentence
                            row.append(tags)
                            csv_writer.writerow(
                                [src, s, row[2], drug, row[4], company, tags])
Example #52
def parse_page_text(url):
    response = requests.get(url, headers={'User-agent': USER_AGENT})
    html = response.text
    readable_html = readability.readability.Document(html)
    try:
        article_only = readable_html.summary()
    except:
        return []
    raw = nltk.clean_html(article_only)
    #soup = bs4.BeautifulSoup(html)
    #raw = nltk.clean_html(str(soup))
    sents = nltk.sent_tokenize(raw)
    sents = [nltk.wordpunct_tokenize(sent) for sent in sents]
    #sents = [nltk.tokenize.WhitespaceTokenizer().tokenize(sent) for sent in sents]
    tagged_sents = [nltk.pos_tag(sent) for sent in sents]

    # get interesting collocations
    #words = nltk.wordpunct_tokenize(raw)
    words = nltk.tokenize.WhitespaceTokenizer().tokenize(raw)
    words = [word.lower() for word in words]
    punctuation = re.compile(r'[-.?!,":;()]')
    good_words = [punctuation.sub("", word) for word in words]
    bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(
        good_words)
    trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(
        good_words)
    bigram_finder.apply_freq_filter(2)
    trigram_finder.apply_freq_filter(1)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    collocations = []
    collocations.extend(bigram_finder.nbest(bigram_measures.pmi, 10))
    collocations.extend(trigram_finder.nbest(trigram_measures.pmi, 10))
    print "\nCOLLOCATIONS :", collocations

    # get named entities
    ne_chunks = [nltk.ne_chunk(sent, binary=True) for sent in tagged_sents]
    nes = [sub_leaves(ne_chunk, 'NE') for ne_chunk in ne_chunks]
    entities = []
    for ne in nes:
        if len(ne) == 0: continue
        ne_string = ''
        for pairs in ne:
            for pair in pairs:
                ne_string = ' '.join((ne_string, pair[0]))
        entities.append(ne_string[1:])
    print "\nNES :", entities

    # get noun phrases
    nps = []
    grammar = r"""
        NP: {<PP\$>? <JJ>* <NN.*>+} # NP
        P: {<IN>}           # Preposition
        V: {<V.*>}          # Verb
        PP: {<P> <NP>}      # PP -> P NP
        VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*
    """
    cp = nltk.RegexpParser(grammar)
    for sent in tagged_sents:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.node == 'NP':
                try:
                    subtree = str(subtree).split()[1:]
                except UnicodeEncodeError:
                    continue  # HACK HACK HACK
                subtree = ' '.join([item.split('/')[0] for item in subtree])
                nps.append(subtree)
    print "\nNPS :", nps
    return nps
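
The noun-phrase loop above recovers words by splitting str(subtree), which is brittle. A hedged sketch of the more direct route through subtree.leaves(); subtree.node is the NLTK 2.x spelling, renamed to subtree.label() in NLTK 3, and the grammar and sentence here are simplified stand-ins:

import nltk  # requires the standard punkt and tagger NLTK data packages

grammar = r"NP: {<DT>? <JJ>* <NN.*>+}"
cp = nltk.RegexpParser(grammar)
tagged = nltk.pos_tag(nltk.word_tokenize("The new drug reduced blood pressure in early trials."))
tree = cp.parse(tagged)
for subtree in tree.subtrees():
    if subtree.label() == 'NP':          # use subtree.node on NLTK 2.x
        print(' '.join(word for word, tag in subtree.leaves()))
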
Example #53
0
def localFile(fileInput):
    if os.path.isfile(fileInput):
        print "Scraping Local File - {0}".format(fileInput)
        mimetypes.init()
        file_type, file_encoding = mimetypes.guess_type(fileInput)
        print file_type
        if file_type == 'application/pdf':
            getPDFContent(fileInput)
        elif file_type == 'text/html':
            raw = nltk.clean_html(open(fileInput).read())
            tokens = nltk.word_tokenize(raw)
            if args.minLength or args.maxLength:
                for token in tokens:
                    if not (len(token.translate(None, charBlacklist)) < minl
                            or len(token) > maxl):
                        wordList.append(
                            str(token).translate(None, charBlacklist))
            else:
                for token in tokens:
                    wordList.append(str(token).translate(None, charBlacklist))
        elif file_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            document = docx.opendocx(fileInput)
            sentances = docx.getdocumenttext(document)
            sentances = map(lambda s: s.encode("ascii", "ignore"), sentances)
            if args.minLength or args.maxLength:
                for sentance in sentances:
                    for word in set(sentance.split()):
                        if not (len(str(word).translate(None, charBlacklist)) <
                                minl or len(str(word)) > maxl):
                            wordList.append(
                                str(word).translate(None, charBlacklist))
            else:
                for sentance in sentances:
                    for word in set(sentance.split()):
                        wordList.append(
                            str(word).translate(None, charBlacklist))
        elif file_type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation' or file_type == 'application/x-mspowerpoint.12':
            try:
                prs = pptx.Presentation(fileInput)
                text_runs = list()

                for slide in prs.slides:
                    for shape in slide.shapes:
                        if not shape.has_textframe:
                            continue
                        for paragraph in shape.textframe.paragraphs:
                            for run in paragraph.runs:
                                text_runs.append(run.text)

                if args.minLength or args.maxLength:
                    for sentance in text_runs:
                        for word in set(sentance.split()):
                            if not (len(
                                    str((word.translate(None, charBlacklist))))
                                    < minl or len(str(word)) > maxl):
                                wordList.append(
                                    str(word).translate(None, charBlacklist))
                else:
                    for sentance in text_runs:
                        for word in set(sentance.split()):
                            wordList.append(
                                str(word).translate(None, charBlacklist))
            except Exception as e:
                print 'Error opening file: {0}'.format(fileInput)
                pass
        else:  #'text/plain' or unknown format
            try:
                words = set(open(fileInput).read().split())

                if args.minLength or args.maxLength:
                    for word in words:
                        if not (len(str(
                            (word.translate(None, charBlacklist)))) < minl
                                or len(str(word)) > maxl):
                            wordList.append(
                                str(word).translate(None, charBlacklist))
                else:
                    for word in words:
                        wordList.append(
                            str(word).translate(None, charBlacklist))
            except:
                print 'Error opening file: {0}'.format(fileInput)
                pass
    else:
        print 'Error opening file: {0}'.format(fileInput)
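
Worth noting for anyone running Example #53 today: str.translate(None, charBlacklist) is Python 2 byte-string behaviour and fails on Python 3 text. A rough Python 3 sketch of the same blacklist-and-length filter, with the script's globals replaced by illustrative values:

# Illustrative stand-ins for the script's globals (charBlacklist, minl, maxl).
CHAR_BLACKLIST = '()[]{}<>!?.,;:"\''
MINL, MAXL = 3, 20

def clean_token(token):
    """Strip blacklisted characters; return None if the result fails the length bounds."""
    cleaned = token.translate(str.maketrans('', '', CHAR_BLACKLIST))
    return cleaned if MINL <= len(cleaned) <= MAXL else None

tokens = "Hello, world! (beta) a".split()
word_list = [w for w in (clean_token(t) for t in tokens) if w]
print(word_list)   # ['Hello', 'world', 'beta']
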
Example #54
0
 def classification_format(self, raw, subject=None):
     msg = nltk.clean_html(raw)
     fs = self.extract_features(msg, subject)
     return fs
Example #55
0
#    taglist_reduced.append(tagitem)

# In[306]:

taglist_reduced[:10]

# In[300]:

corpora = []

for i in filenames:
    doc = open('/Users/brandomr/Sites/docs/' + i)
    text = doc.read()
    #grabs the document as variable text

    text = nltk.clean_html(text)
    #strips html formatting

    text = text.replace('&#xa0;', '\xA0')
    text = text.decode('utf-8', 'ignore')
    #gets rid of non-break space html and converts to unicode

    corpora.append(text)
    #adds to corpora

# In[301]:


#tokenizes and chunks for entity extraction
def extract_entities(text):
    entities = []
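
Example #55 breaks off before extract_entities is fully shown. Purely as an illustration of the common nltk.ne_chunk pattern, and not a reconstruction of the author's function, a minimal sketch:

import nltk  # requires punkt, the tagger and the maxent_ne_chunker/words data packages

def extract_entities_sketch(text):
    entities = []
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for chunk in nltk.ne_chunk(tagged, binary=True):
            # named-entity chunks are subtrees labelled 'NE'; plain tokens are (word, tag) tuples
            if hasattr(chunk, 'label') and chunk.label() == 'NE':
                entities.append(' '.join(word for word, tag in chunk.leaves()))
    return entities

print(extract_entities_sketch("Barack Obama visited Berlin in 2013."))
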
Example #56
0
def gett():
    url = "http://www.50states.com/facts/alabama.htm"
    html = urlopen(url).read()
    raw = nltk.clean_html(html)
    print(raw)
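
These examples all lean on nltk.clean_html, which current NLTK releases no longer provide as a working function (NLTK 3 turned it into a stub that points users at an HTML parser instead). A minimal sketch of the same fetch-and-strip step with requests and BeautifulSoup, reusing the URL from Example #56:

import requests
from bs4 import BeautifulSoup

url = "http://www.50states.com/facts/alabama.htm"   # same page as the example above
html = requests.get(url).text
raw = BeautifulSoup(html, "html.parser").get_text(separator="\n")
print(raw)
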
Example #57
0
def cleanHtml(html):
    return BeautifulStoneSoup(
        clean_html(html),
        convertEntities=BeautifulStoneSoup.HTML_ENTITIES).contents[0]
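
Example #57 pairs clean_html with BeautifulSoup 3's BeautifulStoneSoup to decode entities. Under bs4 the convertEntities argument is gone and entities are decoded to Unicode automatically, so a rough modern equivalent collapses to a single call:

from bs4 import BeautifulSoup

def clean_html_sketch(html):
    # bs4 strips tags and decodes HTML entities in one step
    return BeautifulSoup(html, "html.parser").get_text()

print(clean_html_sketch("<p>Caf&eacute; &amp; bar</p>"))
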
Example #58
0
# 16/08/12 -    code tested
#

# check for nltk
try:
    import nltk
except ImportError:
    print "No nltk module, exiting!"
    exit()

from urllib import urlopen
import re

url_home = "http://var2.astro.cz/ETD/"
html_home = urlopen(url_home).read()
raw = nltk.clean_html(html_home).split('\n')

star_name, planet = [], []

# find latest list of planets from ETD homepage
for i in range(0, len(raw)):
    if "Known transiters" in raw[i]:
        loc = i + 1

for i in range(loc, len(raw)):
    if len(raw[i].split()) == 2:
        star_name.append(raw[i].split()[0])
        planet.append(raw[i].split()[1])

    if len(raw[i].split()) == 3:
        name = str(raw[i].split()[0]) + "%20" + str(raw[i].split()[1])
Example #59
0
def html2text(str):
    return clean_html(str)
Example #60
0
def claim_ext(htmlString):
    a = htmlString.find("Claims")
    b = htmlString.find("Description")
    elem = nltk.clean_html(htmlString[a:b])
    return elem
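
One caveat about Example #60: str.find returns -1 when a marker is absent, so a page without a "Claims" or "Description" heading silently produces the wrong slice. A hedged sketch of the slicing step with that case handled (the deprecated clean_html call is left out here):

def claim_section(html_string):
    """Return the raw HTML between the "Claims" and "Description" markers,
    or None if either marker is missing (str.find returns -1 in that case)."""
    start = html_string.find("Claims")
    end = html_string.find("Description", start + 1)
    if start == -1 or end == -1:
        return None
    return html_string[start:end]
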