Example No. 1
    def get(self, id):
        name = self.extract('<h1>', '</h1>')

        end = '<div class="box">'
        html = self.extract('<div class="content box">', end)
        html = html[:html.find(end)]
        print '='*50
        print html2txt(name)
        print '='*50
        print html2txt(html)
Example No. 2
 def get(self, path):
     data = json.loads(self.html)
     if path == '/zaker/apps.php':
         data = data['data']['datas']
         for i in data:
             print_i(i)
     else:
         if data['msg'] != 'ok':return
         data = data['data']
         if 'articles' in data:
             for txt in data['articles']:
                 if 'full_url' in txt:
                     url = txt['full_url']
                     #spider.put( url)
                     #print url 
         else:
             print html2txt(data['content'])
Example No. 3
File: yn.py Project: mpranj/mcandre
def clean_html(html):
  """Strip HTML tags"""

  h = html2txt()
  h.feed(html)
  h.close()

  return h.output()
Example No. 4
def clean_html(html):
    '''Strip HTML tags'''

    h = html2txt()
    h.feed(html)
    h.close()

    return h.output()
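
None of these snippets define the html2txt class they rely on; Examples 3, 4, and 12 only show its feed / close / output interface. A minimal sketch of what such a class could look like, assuming Python 2's HTMLParser (the implementation below is illustrative, not the original):

from HTMLParser import HTMLParser

class html2txt(HTMLParser):
    """Hypothetical stand-in: collect text nodes and return them joined."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.pieces = []

    def handle_data(self, data):
        # keep only the text that appears between tags
        self.pieces.append(data)

    def output(self):
        return ''.join(self.pieces)

With such a class, clean_html('<p>Strip <b>these</b> tags</p>') would return 'Strip these tags'.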
Example No. 5
def main(fromnum, tonum, out):
	for number in range(fromnum, tonum+1):
		print number
		url = "http://sm.krx.co.kr/common/dictionary/kse_voca_body.jsp?dic_seq_no=%s" % number
		html = fetchurl(url)
		if html != "":
			try:
				h2t = html2txt.html2txt(url)
				h2t.feed(html)
				process(h2t.data, out)
			except:
				print "fail for number : %d" % number
Example No. 6
 def get(self,page):
     for html in self.extract_all('<h2><a  target="_blank" href="http://blog.jobbole.com', '<!-- .entry-content -->'):
         id = html[:html.find('"')] 
         title = extract('/','<',html).split(">",1)[-1]
         link_html = extract('<div class="entry-content">','</p>', html)
         link_html = extract('<p', None, link_html)
         txt = html2txt(link_html)
         if "http://" in txt:
             print "http://blog.jobbole.com%s"%id
             print title
             print txt  
             print ""
Example No. 7
def main(fromnum, tonum, out):
	for number in range(fromnum, tonum+1):
		print number
		url = "http://sm.krx.co.kr//common/dictionary/kse_voca_left.jsp?language=Korean&dic_typ=A001&code=%02d" % number
		html = fetchurl(url)
		if html != "":
			try:
				h2t = html2txt.html2txt(url)
				h2t.feed(html)
				idx(h2t.anchors, out)
			except:
				print "fail for number : %d" % number
Example No. 8
 def get(self, page):
     for html in self.extract_all(
             '<h2><a  target="_blank" href="http://blog.jobbole.com',
             '<!-- .entry-content -->'):
         id = html[:html.find('"')]
         title = extract('/', '<', html).split(">", 1)[-1]
         link_html = extract('<div class="entry-content">', '</p>', html)
         link_html = extract('<p', None, link_html)
         txt = html2txt(link_html)
         if "http://" in txt:
             print "http://blog.jobbole.com%s" % id
             print title
             print txt
             print ""
Example No. 9
def main(fromnum, tonum, out):
	for number in range(fromnum, tonum+1):
		url = "http://hometopia.com/proverb/prov-e1%c.html" % number
		print url
		html = fetchurl(url)
		if html != "":
			try:
				part = get_prov_list(html)

				h2t = html2txt.html2txt(url)
				h2t.feed(part)
				process( h2t.data, out)
			except:
				print "failed "
				raise
Example No. 10
def main(fromnum, tonum, out):
    for number in range(fromnum, tonum + 1):
        url = "http://hometopia.com/proverb/prov-e1%c.html" % number
        print url
        html = fetchurl(url)
        if html != "":
            try:
                part = get_prov_list(html)

                h2t = html2txt.html2txt(url)
                h2t.feed(part)
                process(h2t.data, out)
            except:
                print "failed "
                raise
Example No. 11
def getdict(paras, out):
	url = "http://www.kita.net/dictionary/view_dic.jsp?mode=%s&keyword=%s&seq_no=%s" % (paras[0], paras[1], paras[2])

	html = fetchurl(url)
	if html == "": 
		print "fetch fail:", url
		return

	try:
		h2t = html2txt.html2txt(url)
		h2t.feed(html)
	except:
		return

	#print h2t.data
	process(h2t.data, out, url)
	#sys.exit(0)
	pass
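
Examples 5, 7, 9, 10, and 11 assume yet another interface: a class html2txt inside a module of the same name, constructed with the source URL and exposing .data (accumulated text) and .anchors (collected hrefs) after feed(). Only the call sites are shown above; a hypothetical reconstruction under those assumptions:

from HTMLParser import HTMLParser

class html2txt(HTMLParser):
    # hypothetical reconstruction of the interface implied by the call sites
    def __init__(self, url):
        HTMLParser.__init__(self)
        self.url = url
        self.data = ''      # text accumulated from all text nodes
        self.anchors = []   # href values of every <a> tag seen

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.anchors.append(value)

    def handle_data(self, data):
        self.data += data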
Example No. 12
def cleanHTML(html):
    h = html2txt()
    h.feed(html)
    h.close()

    return h.output()
Example No. 13
    def api_recog(self):
        # Pipeline: post text -> CoNLL tokens -> enner.py features -> CRFsuite labels -> tokens tagged B-API.
        print 'starting API recognition'
        txt_file = self.post_id + '.txt'
        conll_file = self.post_id + '.conll'
        data_file = self.post_id + '.data'
        label_file = self.post_id + '.label'

        texttoconll.main(os.path.join(POST_DIR, txt_file),
                         os.path.join(POST_DIR, conll_file))

        extract_feature_cmd = "python " + os.path.join(
            STATIC_ROOT, 'enner.py') + " bc-ce < " + os.path.join(
                POST_DIR, conll_file) + " > " + os.path.join(
                    POST_DIR, data_file)
        subprocess.call(extract_feature_cmd, shell=True)

        crfsuite_cmd = "crfsuite tag -m " + os.path.join(
            STATIC_ROOT, 'model_all') + " " + os.path.join(
                POST_DIR, data_file) + " > " + os.path.join(
                    POST_DIR, label_file)
        subprocess.call(crfsuite_cmd, shell=True)

        entities = []
        with open(os.path.join(POST_DIR, conll_file)) as fconll:
            flabel = open(os.path.join(POST_DIR, label_file))
            labels = [line.strip() for line in flabel.readlines()]

            lines = fconll.readlines()
            # print len(labels), len(lines)
            for idx, line in enumerate(lines):
                if idx > len(lines) - 2:
                    break

                if line.strip() == '':
                    w = t = ''
                else:
                    w, t = line.strip().split('\t')

                if lines[idx + 1].strip() == '':
                    w2 = t2 = ''
                else:
                    w2, t2 = lines[idx + 1].strip().split('\t')

                if labels[idx] == 'B-API':
                    entities.append((w, w2))
            flabel.close()

        self.data['entityList'] = []
        self.data['entityIndex'] = []
        pre_entity = None
        idx = -1
        for api in entities:
            # print api
            for n in range(idx, len(self.data['full_text'])):
                text = html2txt(self.data['full_text'][n]).lower()
                temp = mytokenizer.tokenize_str(text)
                arr = temp.split(' ')

                # print temp, arr
                if n == idx and pre_entity == api[0] and pre_entity in arr:
                    idx2 = arr.index(api[0])
                    arr = arr[idx2 + 1:]

                if api[0] in arr:
                    idx2 = arr.index(api[0])

                    if idx2 < len(arr) - 1 and arr[idx2 + 1] == api[1]:
                        self.data['entityList'].append(api[0])
                        self.data['entityIndex'].append(n)

                        idx = n
                        break

            pre_entity = api[0]

        print 'identified APIs: ', ' '.join(self.data['entityList'])
Example No. 14
 def get_link_text(self):
     return '\n'.join(
         [html2txt(t) for t in self.data['link_text'] if html2txt(t) != ""])
Example No. 15
    def crawler_post(self):
        # Scrape the Stack Overflow question page: title, question and answer text, comments, links and tags.
        print 'starting to crawl post', self.post_id
        self.data['hrefs'] = []

        url = 'http://stackoverflow.com/questions/%s' % self.post_id
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        e = soup.find("div", {"id": "question-header"})
        title_a = e.find("a", {"class": "question-hyperlink"})
        self.data['title'] = title_a.text

        self.data['full_text'] = []
        self.data['link_text'] = []

        # title
        self.data['full_text'].append(title_a.text)
        self.data['link_text'].append(title_a.text)

        # get the text from question
        question_div = soup.find("div", {"class": "question"})
        post_div = question_div.find("div", {"class": "post-text"})

        class_parsed_list = []
        for e in post_div:
            self.data['full_text'].append(str(e.encode('utf-8')))
            if e.name != 'pre':
                self.data['link_text'].append(str(e.encode('utf-8')))
            else:
                code = html2txt(str(e.encode('utf-8'))).lower()
                m = re.search(r'class (\S+):', code)
                if m:
                    cls = m.group(1)
                    idx = cls.find('(')
                    if idx >= 0:
                        cls = cls[0:idx].strip()
                    class_parsed_list.append(cls)

        for e in post_div.find_all("a"):
            self.data['hrefs'].append(e['href'])

        # comment text
        comment_div = question_div.find("div", {"class": "comments"})
        if comment_div is not None:
            comments = comment_div.find_all("span", {"class": "comment-copy"})
            for e in comments:
                self.data['full_text'].append(str(e.encode('utf-8')))
                self.data['link_text'].append(str(e.encode('utf-8')))

                for link in e.find_all("a"):
                    self.data['hrefs'].append(link['href'])

        # get the text from answers
        answer_div = soup.find_all('div', {"class": "answer"})
        for answer in answer_div:
            answer_posts = answer.find_all("div", "post-text")
            for post in answer_posts:
                for e in post:
                    self.data['full_text'].append(str(e.encode('utf-8')))
                    if e.name != 'pre':
                        self.data['link_text'].append(str(e.encode('utf-8')))
                    else:
                        code = html2txt(str(e.encode('utf-8'))).lower()
                        m = re.search(r'class (\S+):', code)
                        if m:
                            cls = m.group(1)
                            idx = cls.find('(')
                            if idx >= 0:
                                cls = cls[0:idx].strip()
                            class_parsed_list.append(cls)

                for e in post.find_all("a"):
                    self.data['hrefs'].append(e['href'])

                comment_div = answer.find("div", {"class": "comments"})
                if comment_div is not None:
                    comments = comment_div.find_all("span",
                                                    {"class": "comment-copy"})
                    for e in comments:
                        self.data['full_text'].append(str(e.encode('utf-8')))
                        self.data['link_text'].append(str(e.encode('utf-8')))

                        for link in e.find_all("a"):
                            self.data['hrefs'].append(link['href'])

        self.data['tags'] = []
        for e in soup.find("div", {"class": "post-taglist"}):
            tag = html2txt(str(e.encode('utf-8')))
            if tag.strip() != "":
                self.data['tags'].append(tag)

        self.data['class_parsed_list'] = class_parsed_list

        print os.path.join(POST_DIR, self.post_id + '.txt')
        with open(os.path.join(POST_DIR, self.post_id + '.txt'),
                  'w') as outfile:
            outfile.write(mytokenizer.tokenize_str(self.get_link_text()))
Example No. 16
def OriginalHTMLprocess(listsplit):
    OriginalHTMLdb = OriginalPage()
    ilog = infologger()
    purei = Purecontent("c")
    pat = re.compile(r"<([^>]|\n)*>")
    space = re.compile(r"&nbsp;|&copy;|\r|\t")
    stmk = stopmarks()
    md5urllist = {}
    for i in listsplit:
        md5url = md5hex(i)
        md5urllist[md5url] = [i]
        word = ""
        st = time.time()
        purei.url_md5 = md5url
        if purei.checkexist():
            OriginalHTMLdb.url = i
            parser = html2txt()
            try:
                parser.feed(OriginalHTMLdb.queryoriginalct())
                charset = parser.charset  # charset detector
                parser.close()
            except:
                charset = ""
            Originaltext = langconvert(OriginalHTMLdb.queryoriginalct(), charset)
            Originaltext = Originaltext.decode("utf-8")
            ilog.sentence_split_info(time.time() - st)
            try:  # if this page is in normal HTML format
                parser = ""
                parser = html2txt()
                parser.feed(Originaltext)
                word = word + parser.text
                if len(word) == 0:
                    word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))
                contenttitle = clearspace(parser.title)
                parser.close()
                # print contenttitle,i,charset
                purei.title = contenttitle.encode("utf-8")
            except:
                try:
                    parser = html2txt()
                    parser.feed(Originaltext)
                    contenttitle = clearspace(parser.title)
                    parser.close()
                except:
                    contenttitle = ""
                purei.title = contenttitle.encode("utf-8")
                word = word + space.sub(chr(32), pat.sub(chr(32), Originaltext))

            context = ""
            ilog.sentence_split_info(time.time() - st)
            n = 0
            for xw in word:
                if ord(xw) >= 32 or ord(xw) in [9, 10, 13]:
                    context = context + xw
                n += 1
                if n > 40000000:  # a document may run to more than 65535 lines; cap the scan here.
                    break
            ilog.sentence_split_info(purei.title + str(len(context)) + i + charset)
            context = context + chr(32)
            contline = []
            contline.append("")
            i = 0  # index of the current line in the contline list
            # for x in xrange(len(context)):
            x = 0  # current character position in context
            msl = 260
            while x < len(context):
                ordx = ord(context[x])
                contline[i] = contline[i] + context[x]
                sentencecount = len(clearspace((contline[i])))
                # sentencecount=len(contline[i])
                if (
                    sentencecount > msl
                    and stmk.atypestopmarks(ordx)
                    or sentencecount > msl
                    and context[x : x + 2] == ". "
                    or sentencecount > msl + 20
                    and stmk.btypestopmarks(ordx)
                    or sentencecount > msl + 20
                    and ordx == 10
                    and ord(context[x + 1 : x + 2]) < 65
                ):
                    nextword = context[x + 1 : x + 2]
                    if nextword:
                        if punctuationmarks(ord(nextword)):
                            # in some cases, Chinese text uses two consecutive punctuation marks
                            x += 1
                            contline[i] = contline[i] + context[x]
                    contline.append("")
                    i += 1
                    if msl <= 16640 and i % 2:
                        msl = msl + msl  # double it until this value is bigger than 4000
                x += 1
                if sentencecount < msl:
                    contline[i] = contline[i] + context[x : x + msl]
                    x = x + msl

            contcleanline = []
            i = 0
            ilog.sentence_split_info(time.time() - st)
            for x in contline:
                cont = clearspace(x)
                if len(cont) > 1:
                    if cont[0] == chr(32) and cont[-1] == chr(32):
                        cont = cont[1:-1]
                    elif cont[-1] == chr(32):
                        cont = cont[:-1]
                    elif cont[0] == chr(32):
                        cont = cont[1:]
                if len(cont) < 65025 and cont != chr(32):
                    contcleanline.append(cont.encode("utf-8"))
                    i = i + 1
            ilog.sentence_split_info(time.time() - st)
            purei.purecotentinline = contcleanline
            purei.content = clearspace(context).encode("utf-8")
            purei.insertPurecontent()
            stderr.write(".")
    OriginalHTMLdb.close()
    purei.close()
    return md5urllist