def get(self, id):
    name = self.extract('<h1>', '</h1>')
    end = '<div class="box">'
    html = self.extract('<div class="content box">', end)
    html = html[:html.find(end)]
    print '=' * 50
    print html2txt(name)
    print '=' * 50
    print html2txt(html)
def get(self, path):
    data = json.loads(self.html)
    if path == '/zaker/apps.php':
        data = data['data']['datas']
        for i in data:
            print_i(i)
    else:
        if data['msg'] != 'ok':
            return
        data = data['data']
        if 'articles' in data:
            for txt in data['articles']:
                if 'full_url' in txt:
                    url = txt['full_url']
                    #spider.put(url)
                    #print url
        else:
            print html2txt(data['content'])
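# For reference, the JSON shapes the branches above expect, reconstructed
# solely from the field accesses in get() (hypothetical samples, not taken
# from any API documentation):
#   /zaker/apps.php -> {"data": {"datas": [<items passed to print_i>]}}
#   article list    -> {"msg": "ok", "data": {"articles": [{"full_url": "..."}]}}
#   single article  -> {"msg": "ok", "data": {"content": "<p>...</p>"}}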
def clean_html(html): """Strip HTML tags""" h = html2txt() h.feed(html) h.close() return h.output()
def main(fromnum, tonum, out):
    for number in range(fromnum, tonum + 1):
        print number
        url = "http://sm.krx.co.kr/common/dictionary/kse_voca_body.jsp?dic_seq_no=%s" % number
        html = fetchurl(url)
        if html != "":
            try:
                h2t = html2txt.html2txt(url)
                h2t.feed(html)
                process(h2t.data, out)
            except:
                print "fail for number : %d" % number
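# fetchurl is referenced above and below but never defined in these snippets.
# The callers treat an empty string as failure, so a minimal Python 2 sketch
# consistent with that contract (hypothetical, not the original helper):
import urllib2

def fetchurl(url, timeout=30):
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except Exception:
        return ""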
def main(fromnum, tonum, out):
    for number in range(fromnum, tonum + 1):
        print number
        url = "http://sm.krx.co.kr//common/dictionary/kse_voca_left.jsp?language=Korean&dic_typ=A001&code=%02d" % number
        html = fetchurl(url)
        if html != "":
            try:
                h2t = html2txt.html2txt(url)
                h2t.feed(html)
                idx(h2t.anchors, out)
            except:
                print "fail for number : %d" % number
def get(self, page):
    for html in self.extract_all(
            '<h2><a target="_blank" href="http://blog.jobbole.com',
            '<!-- .entry-content -->'):
        id = html[:html.find('"')]
        title = extract('/', '<', html).split(">", 1)[-1]
        link_html = extract('<div class="entry-content">', '</p>', html)
        link_html = extract('<p', None, link_html)
        txt = html2txt(link_html)
        if "http://" in txt:
            print "http://blog.jobbole.com%s" % id
            print title
            print txt
            print ""
def main(fromnum, tonum, out):
    for number in range(fromnum, tonum + 1):
        url = "http://hometopia.com/proverb/prov-e1%c.html" % number
        print url
        html = fetchurl(url)
        if html != "":
            try:
                part = get_prov_list(html)
                h2t = html2txt.html2txt(url)
                h2t.feed(part)
                process(h2t.data, out)
            except:
                print "failed"
                raise
def getdict(paras, out):
    url = "http://www.kita.net/dictionary/view_dic.jsp?mode=%s&keyword=%s&seq_no=%s" % (
        paras[0], paras[1], paras[2])
    html = fetchurl(url)
    if html == "":
        print "fetch fail:", url
        return
    try:
        h2t = html2txt.html2txt(url)
        h2t.feed(html)
    except:
        return
    #print h2t.data
    process(h2t.data, out, url)
    #sys.exit(0)
def cleanHTML(html):
    h = html2txt()
    h.feed(html)
    h.close()
    return h.output()
def api_recog(self):
    print 'start API recognition'
    txt_file = self.post_id + '.txt'
    conll_file = self.post_id + '.conll'
    data_file = self.post_id + '.data'
    label_file = self.post_id + '.label'
    texttoconll.main(os.path.join(POST_DIR, txt_file),
                     os.path.join(POST_DIR, conll_file))
    extract_feature_cmd = ("python " + os.path.join(STATIC_ROOT, 'enner.py')
                           + " bc-ce < " + os.path.join(POST_DIR, conll_file)
                           + " > " + os.path.join(POST_DIR, data_file))
    subprocess.call(extract_feature_cmd, shell=True)
    crfsuite_cmd = ("crfsuite tag -m " + os.path.join(STATIC_ROOT, 'model_all')
                    + " " + os.path.join(POST_DIR, data_file)
                    + " > " + os.path.join(POST_DIR, label_file))
    subprocess.call(crfsuite_cmd, shell=True)
    entities = []
    with open(os.path.join(POST_DIR, conll_file)) as fconll:
        flabel = open(os.path.join(POST_DIR, label_file))
        labels = [line.strip() for line in flabel.readlines()]
        lines = fconll.readlines()
        # print len(labels), len(lines)
        for idx, line in enumerate(lines):
            if idx > len(lines) - 2:
                break
            if line.strip() == '':
                w = t = ''
            else:
                w, t = line.strip().split('\t')
            if lines[idx + 1].strip() == '':
                w2 = t2 = ''
            else:
                w2, t2 = lines[idx + 1].strip().split('\t')
            if labels[idx] == 'B-API':
                entities.append((w, w2))
        flabel.close()
    self.data['entityList'] = []
    self.data['entityIndex'] = []
    pre_entity = None
    idx = -1
    for api in entities:
        # print api
        for n in range(idx, len(self.data['full_text'])):
            text = html2txt(self.data['full_text'][n]).lower()
            temp = mytokenizer.tokenize_str(text)
            arr = temp.split(' ')
            # print temp, arr
            if n == idx and pre_entity == api[0] and pre_entity in arr:
                idx2 = arr.index(api[0])
                arr = arr[idx2 + 1:]
            if api[0] in arr:
                idx2 = arr.index(api[0])
                if idx2 < len(arr) - 1 and arr[idx2 + 1] == api[1]:
                    self.data['entityList'].append(api[0])
                    self.data['entityIndex'].append(n)
                    idx = n
                    break
        pre_entity = api[0]
    print 'identified APIs: ', ' '.join(self.data['entityList'])
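# The alignment loop in api_recog assumes each non-blank line of the .conll
# file holds exactly two tab-separated fields (a token and its tag), with
# blank lines separating sentences, and that the crfsuite label file carries
# one label per line in the same order. Hypothetical illustration:
#
#   .conll:                    .label:
#   getInstance<TAB>NN         B-API
#   (<TAB>(                    O
#   )<TAB>)                    O
#   <blank line>               <blank line>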
def get_link_text(self):
    return '\n'.join(
        [html2txt(t) for t in self.data['link_text'] if html2txt(t) != ""])
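# In this codebase html2txt is called as a plain function mapping an HTML
# fragment to text, unlike the parser class used in the earlier snippets.
# A minimal stand-in (hypothetical, not the original implementation;
# BeautifulSoup is already in use by crawler_post below):
from bs4 import BeautifulSoup

def html2txt(html):
    # Strip tags and return the concatenated text content.
    return BeautifulSoup(html, "lxml").get_text()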
def crawler_post(self):
    print 'start to crawl post', self.post_id
    self.data['hrefs'] = []
    url = 'http://stackoverflow.com/questions/%s' % self.post_id
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    e = soup.find("div", {"id": "question-header"})
    title_a = e.find("a", {"class": "question-hyperlink"})
    self.data['title'] = title_a.text
    self.data['full_text'] = []
    self.data['link_text'] = []
    # title
    self.data['full_text'].append(title_a.text)
    self.data['link_text'].append(title_a.text)
    # get the text from the question
    question_div = soup.find("div", {"class": "question"})
    post_div = question_div.find("div", {"class": "post-text"})
    class_parsed_list = []
    for e in post_div:
        self.data['full_text'].append(str(e.encode('utf-8')))
        if e.name != 'pre':
            self.data['link_text'].append(str(e.encode('utf-8')))
        else:
            code = html2txt(str(e.encode('utf-8'))).lower()
            m = re.search(r'class (\S+):', code)
            if m:
                cls = m.group(1)
                idx = cls.find('(')
                if idx >= 0:
                    cls = cls[0:idx].strip()
                class_parsed_list.append(cls)
    for e in post_div.find_all("a"):
        self.data['hrefs'].append(e['href'])
    # comment text
    comment_div = question_div.find("div", {"class": "comments"})
    if comment_div is not None:
        comments = comment_div.find_all("span", {"class": "comment-copy"})
        for e in comments:
            self.data['full_text'].append(str(e.encode('utf-8')))
            self.data['link_text'].append(str(e.encode('utf-8')))
            for link in e.find_all("a"):
                self.data['hrefs'].append(link['href'])
    # get the text from answers
    answer_div = soup.find_all('div', {"class": "answer"})
    for answer in answer_div:
        answer_posts = answer.find_all("div", "post-text")
        for post in answer_posts:
            for e in post:
                self.data['full_text'].append(str(e.encode('utf-8')))
                if e.name != 'pre':
                    self.data['link_text'].append(str(e.encode('utf-8')))
                else:
                    code = html2txt(str(e.encode('utf-8'))).lower()
                    m = re.search(r'class (\S+):', code)
                    if m:
                        cls = m.group(1)
                        idx = cls.find('(')
                        if idx >= 0:
                            cls = cls[0:idx].strip()
                        class_parsed_list.append(cls)
            for e in post.find_all("a"):
                self.data['hrefs'].append(e['href'])
        comment_div = answer.find("div", {"class": "comments"})
        if comment_div is not None:
            comments = comment_div.find_all("span", {"class": "comment-copy"})
            for e in comments:
                self.data['full_text'].append(str(e.encode('utf-8')))
                self.data['link_text'].append(str(e.encode('utf-8')))
                for link in e.find_all("a"):
                    self.data['hrefs'].append(link['href'])
    self.data['tags'] = []
    for e in soup.find("div", {"class": "post-taglist"}):
        tag = html2txt(str(e.encode('utf-8')))
        if tag.strip() != "":
            self.data['tags'].append(tag)
    self.data['class_parsed_list'] = class_parsed_list
    print os.path.join(POST_DIR, self.post_id + '.txt')
    with open(os.path.join(POST_DIR, self.post_id + '.txt'), 'w') as outfile:
        outfile.write(mytokenizer.tokenize_str(self.get_link_text()))
def OriginalHTMLprocess(listsplit):
    OriginalHTMLdb = OriginalPage()
    ilog = infologger()
    purei = Purecontent("c")
    pat = re.compile("<([^>]|\n)*>")
    # strip entity references and control whitespace; the entity names were
    # garbled in the source, "&nbsp;|&copy;" is an assumed reconstruction
    space = re.compile("&nbsp;|&copy;|\r|\t")
    stmk = stopmarks()
    md5urllist = {}
    for i in listsplit:
        md5url = md5hex(i)
        md5urllist[md5url] = [i]
        word = ""
        st = time.time()
        purei.url_md5 = md5url
        if purei.checkexist():
            OriginalHTMLdb.url = i
            parser = html2txt()
            try:
                parser.feed(OriginalHTMLdb.queryoriginalct())
                charset = parser.charset  # charset detector
                parser.close()
            except:
                charset = ""
            Originaltext = langconvert(OriginalHTMLdb.queryoriginalct(), charset)
            Originaltext = Originaltext.decode("utf-8")
            ilog.sentence_split_info(time.time() - st)
            try:
                # If this page is in normal HTML format
                parser = ""
                parser = html2txt()
                parser.feed(Originaltext)
                word = word + parser.text
                if len(word) == 0:
                    word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
                contenttitle = clearspace(parser.title)
                parser.close()
                # print contenttitle, i, charset
                purei.title = contenttitle.encode("utf-8")
            except:
                try:
                    parser = html2txt()
                    parser.feed(Originaltext)
                    contenttitle = clearspace(parser.title)
                    parser.close()
                except:
                    contenttitle = ""
                purei.title = contenttitle.encode("utf-8")
                word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
            context = ""
            ilog.sentence_split_info(time.time() - st)
            n = 0
            for xw in word:
                if ord(xw) >= 32 or ord(xw) in [9, 10, 13]:
                    context = context + xw
                n += 1
                if n > 40000000:  # may exceed 65535 lines in a document
                    break
            ilog.sentence_split_info(purei.title + str(len(context)) + i + charset)
            context = context + chr(32)
            contline = []
            contline.append("")
            i = 0  # line of contline list
            # for x in xrange(len(context)):
            x = 0  # word number
            msl = 260
            while x < len(context):
                ordx = ord(context[x])
                contline[i] = contline[i] + context[x]
                sentencecount = len(clearspace((contline[i])))
                # sentencecount = len(contline[i])
                if (sentencecount > msl and stmk.atypestopmarks(ordx)
                        or sentencecount > msl and context[x:x + 2] == ". "
                        or sentencecount > msl + 20 and stmk.btypestopmarks(ordx)
                        or sentencecount > msl + 20 and ordx == 10
                        and ord(context[x + 1:x + 2]) < 65):
                    nextword = context[x + 1:x + 2]
                    if nextword:
                        if punctuationmarks(ord(nextword)):
                            # in some cases, Chinese text uses two consecutive marks
                            x += 1
                            contline[i] = contline[i] + context[x]
                    contline.append("")
                    i += 1
                    if msl <= 16640 and i % 2:
                        msl = msl + msl  # double it until this value is big enough
                x += 1
                if sentencecount < msl:
                    contline[i] = contline[i] + context[x:x + msl]
                    x = x + msl
            contcleanline = []
            i = 0
            ilog.sentence_split_info(time.time() - st)
            for x in contline:
                cont = clearspace(x)
                if len(cont) > 1:
                    if cont[0] == chr(32) and cont[-1] == chr(32):
                        cont = cont[1:-1]
                    elif cont[-1] == chr(32):
                        cont = cont[:-1]
                    elif cont[0] == chr(32):
                        cont = cont[1:]
                    if len(cont) < 65025 and cont != chr(32):
                        contcleanline.append(cont.encode("utf-8"))
                i = i + 1
            ilog.sentence_split_info(time.time() - st)
            purei.purecotentinline = contcleanline
            purei.content = clearspace(context).encode("utf-8")
            purei.insertPurecontent()
            stderr.write(".")
    OriginalHTMLdb.close()
    purei.close()
    return md5urllist
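# Sketch of how OriginalHTMLprocess might be driven (hypothetical URLs; the
# function returns a dict mapping each URL's md5 hex digest to [url]):
# md5map = OriginalHTMLprocess(["http://example.com/a.html",
#                               "http://example.com/b.html"])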