Example #1
def mmseg_test():
    string = "最主要 的更  动是:张无忌最后没有选定自己的配偶。自己的自己"
    print(seg_txt(string))
    output = ""
    for i in seg_txt(string):
        output += i + " "
    print(output)
Example #2
def get_chinese_similarity(s1, s2):
    """
    Get the similarity of two Chinese words.
    """
    hash1 = simhash([ smart_unicode(x) for x in seg_txt(smart_str(s1)) ])
    hash2 = simhash([ smart_unicode(x) for x in seg_txt(smart_str(s2)) ])
    return hash1.similarity(hash2)
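A brief call-site sketch, hedged: it assumes the module's own imports are available (a simhash class exposing similarity() plus Django's smart_str/smart_unicode), and the returned scale depends on that simhash implementation.

# Illustrative only; two paraphrases of the same sentence should score high.
s1 = '张无忌最后没有选定自己的配偶'
s2 = '张无忌最终没有选择自己的伴侣'
print(get_chinese_similarity(s1, s2))  # a float; higher means more similar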
Example #3
  def Classify(self, text):
    text = re.sub(r'\d', r' ', text)
    text_words = [w for w in mmseg.seg_txt(text)]
    #print text_words
    category = -1
    max_weight = 0.0
    best_unknown_weight = 0.0

    for cat_id, word_weights in self.cat_word_weight_.items():
      #print '---------------------------'
      weight = 0.0
      unknown_weight = 0.0
      for word in text_words:
        if word in word_weights:
          w = word_weights[word] * (3 ** ((len(word) - 1) / 3))
          weight += w
          unknown_weight -= len(word) * 0.6
          #print word, w
        else:
          if word not in self.stop_words_:
            unknown_weight += len(word) * 1.0
            #print word, 'unknown'
          pass

      if weight > max_weight and unknown_weight < 0.0:
        max_weight = weight
        category = cat_id
        best_unknown_weight = unknown_weight
 
    # print text, ':', category, self.cat_id_name_map_[category], max_weight, best_unknown_weight
    return self.cat_id_name_map_[category]
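A small worked check of the length boost 3 ** ((len(word) - 1) / 3) used above: with the pure-Python mmseg, tokens are UTF-8 byte strings, so len() counts bytes (3 per Chinese character); // below matches Python 2's integer division.

# 1-char word -> 1x, 2-char -> 3x, 3-char -> 9x, 4-char -> 27x
for nchars in (1, 2, 3, 4):
    nbytes = nchars * 3
    print('%d chars -> boost %d' % (nchars, 3 ** ((nbytes - 1) // 3)))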
Example #4
 def put(self,title,item_id):
     """
     title --> segment --> sadd(phrase,item_id) -> zadd (phrase->prefix, suffix, 0)
                       --> pinyin --> sadd(phrase,item_id) -> zadd (phrase->prefix, suffix, 0)
     """
     if not title or not item_id:
         return
     
     for phrase in mmseg.seg_txt(title.encode('utf8')):
         if not phrase:
             continue
         phrase = phrase.decode('utf8')
         self._add_phrase(chinese_key(phrase),item_id)
         for (key,suffix,score) in self._gen_suffix(phrase):
             self._add_suffix(key,chinese_key(suffix),score)
         
         if not self.pinyin:
             continue
         
         phrase = self.pinyin.translate(phrase)
         if not phrase:
             continue
         
         for sub_phrase in self._gen_pinyin_phrase(phrase):
             self._add_phrase(sub_phrase,item_id)
             for (key,suffix,score) in self._gen_suffix(re.sub('\\s+','',sub_phrase)):
                 self._add_suffix(key,suffix,score)
Example #5
def train(filename, parser):
    fname = basename(filename)
    cache_path = join(CACHE_PATH, fname)
    if exists(cache_path):
        return

    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue

        tag_id_set = set(tag_id_list)
        if not tag_id_set:
            continue

        for tid in tuple(tag_id_set):
            tag_id_set.update(PTAG.get(tid, ()))

        word2count = defaultdict(int)
        word_list = list(seg_txt(utf8_ftoj(str(txt))))
        for i in word_list:
            word2count[i] += 1

        for k, v in word2count.iteritems():
            if k not in word2tag_count:
                word2tag_count[k] = {}
            t = word2tag_count[k]
            for id in tag_id_set:
                if id not in t:
                    t[id] = 0
                t[id] += (1+log(float(v)))

    tofromfile.tofile(cache_path, word2tag_count)
Example #6
	def remove_stop_words(self,text):
		tokens=mmseg.seg_txt(text)
		left_words=[]
		for t in tokens:
			if t not in self.stopwords:
				left_words.append(t)
		return "".join(left_words)
Example #7
def train(filename, parser):
    fname = basename(filename)
    cache_path = join(CACHE_PATH, fname)
    if exists(cache_path):
        return

    word2tag_count = {}
    for tag_id_list, txt in parser(filename):
        if not txt.strip():
            continue

        tag_id_set = set(tag_id_list)
        if not tag_id_set:
            continue

        for tid in tuple(tag_id_set):
            tag_id_set.update(PTAG.get(tid, ()))

        word2count = defaultdict(int)
        word_list = list(seg_txt(utf8_ftoj(str(txt))))
        for i in word_list:
            word2count[i] += 1

        for k, v in word2count.iteritems():
            if k not in word2tag_count:
                word2tag_count[k] = {}
            t = word2tag_count[k]
            for id in tag_id_set:
                if id not in t:
                    t[id] = 0
                t[id] += (1 + log(float(v)))

    tofromfile.tofile(cache_path, word2tag_count)
Example #8
 def suggest(self,phrase,start=1,limit=10,namespace='',expires=600):
     temp = re.split('\s+',phrase.strip())
     phrase = [item for item in mmseg.seg_txt(phrase.encode('utf8'))]
     phrase.extend(temp)
     phrase = map(chinese_key,phrase)
     start = (start-1)*limit
     result_key = 'ac-suggest:' + '|'.join(phrase)
     results = self.r.zrevrange(result_key,start,start+limit-1)
     if results:
         return results
     
     prefix = self.suffix_key_prefix + self.namespace
     prefix_len = len(prefix)
     phrase_keys = []
     for sub_phrase in phrase:
         key =  prefix + sub_phrase
         results = self._suggest(key, limit)
         # strip the prefix off the keys that indicated they matched a lookup
         cleaned_keys = map(lambda x: x[prefix_len:], results)
         cleaned_keys = map(lambda x: self.phrase_key_prefix + self.namespace+x, cleaned_keys)
         phrase_keys.extend(cleaned_keys)
     
     if not phrase_keys:
         return []
      # intersect the per-phrase key sets (Redis ZINTERSTORE)
     num = self.r.zinterstore(result_key,list(set(phrase_keys)))
     self.r.expire(result_key,expires)
     #results
     results = self.r.zrevrange(result_key,start,start+limit-1)
     return results
Example #9
def split_words(text):
    """Segment text into a list of words."""
    
    words = []
    for i in seg_txt(text):
        words.append(i)

    return words
Example #10
def GetTermsFrequency(text):
    ret = {}
    for w in mmseg.seg_txt(text):
        w = w.strip()
        if len(w) > 0:
            ret.setdefault(w, 0)
            ret[w] += 1
    return ret
Example #11
def GetTermsFrequency(text):
  ret = {}
  for w in mmseg.seg_txt(text):
    w = w.strip()
    if len(w) > 0:
      ret.setdefault(w, 0)
      ret[w] += 1
  return ret
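A minimal usage sketch for GetTermsFrequency as defined above (assuming a UTF-8 source file and the pure-Python mmseg, whose seg_txt accepts a UTF-8 byte string):

# Illustrative only; exact token boundaries depend on the mmseg dictionary.
freq = GetTermsFrequency('今天天气不错,今天心情也不错')
for term, count in sorted(freq.items(), key=lambda kv: kv[1], reverse=True):
    print('%s\t%d' % (term, count))  # e.g. 今天 should appear twice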
Example #12
def store_movie(movie):
    phrase = movie["title"]
    seg_phrase = " ".join(mmseg.seg_txt(phrase))
    _pinyin_phrase = pinyin.get_pinyin(phrase)
    py_phrase = "".join([p[0] for p in _pinyin_phrase]).encode("utf-8")
    pinyin_phrase = "".join(_pinyin_phrase).encode("utf-8")
    phrase = "%s %s %s %s" % (phrase, seg_phrase, pinyin_phrase, py_phrase)
    engine.store_json(movie["id"], phrase, movie)
Example #13
 def tf_idf(self, txt):
     tf = defaultdict(int)
     for i in seg_txt(str(txt.lower())):
         tf[i] += 1
     result = []
     for k, v in tf.iteritems():
         if k in self._idf:
             result.append((k, v*self._idf[k]))
     return result
Example #14
 def tf_idf(self, txt):
     tf = defaultdict(int)
     for i in seg_txt(str(txt.lower())):
         tf[i] += 1
     result = []
     for k, v in tf.iteritems():
         if k in self._idf:
             result.append((k, v * self._idf[k]))
     return result
Example #15
 def generate_segmented_content_file(self):
     my_file = file('ordered_segmented_content_file.txt', 'w')
     with open('ordered_content_file.txt') as f:
         for line in f:
             print "Segmenting line {0}...".format(self.count)
             for segment in seg_txt(line):
                 my_file.write(segment + ' ')
             my_file.write('\n')
             self.count += 1
     my_file.close()
Example #16
 def parse(self, words):
     words = SearchIndex.__to_unicode(words)
     _seg_words = [word for word in seg_txt(words)]
     seg_words = filter(None, _seg_words)
     results = []
     for word in seg_words:
         word_utf8 = SearchIndex.__to_unicode(word)
         decode_word = unidecode(word_utf8)
         key = self.cache_key_prefix + slugify(decode_word)
         results.append(key)
     return results
Example #17
  def get_terms(self):
    values = []
    for field in self._fields:
      values.append(self._data[field].encode('utf8'))

    text = ' '.join(values)

    terms = []
    for term in seg_txt(text):
      terms.append(term.decode('utf8'))

    return terms
Example #18
File: xaql.py Project: yamingd/play
def gen_terms(cont):
    if cont is None:
        return []
    cont = cont.strip()
    if len(cont)==0:
        return []
    if len(cont)<TERM_MIN_LENGTH:
        return []
    terms = [item for item in seg_txt(cont) if len(item)>TERM_MIN_LENGTH]
    if len(cont)<10:
        terms.append(cont)
    terms = list(set(terms))
    return terms
Example #19
def main():
    count = 0
    start = time.time()
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = mmseg.seg_txt(text.encode('utf-8'))
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
        o.write((' '.join(wds)).decode('utf-8'))
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start
    print count
    print '---------------------------------------------------------------'
Example #20
def generate_user_dict(w_uid):
	user = Account.objects.get(w_uid=w_uid)
	wbs = user.watchweibo.all()
	wordset = Set()
	
	print 'Generating user dict with %d weibo to deal with' % len(wbs)
	
	for wb in wbs:
		for word in seg_txt(wb.text.encode('utf-8','ignore')):
			if len(word)>3:
				wordset.add(word.lower().strip())

	with open("../data/user_dict/%s.dic" % w_uid, "w") as dic_file:
		for word in wordset:
			dic_file.write("%s\n" % word)
Example #21
 def input_raw(self, sentence, is_spam):
     """
     Train on one message.
     :param sentence: the sentence to train on
     :param is_spam: whether the message is spam
     :return:
     """
     sms = mmseg.seg_txt(sentence)
     sms = list(sms)
     for flag, word in enumerate(sms):
         offset = 0 if is_spam else 1
         if word not in self.sms_value:
             self.sms_value[word] = [1 - offset, offset]
         else:
             self.sms_value[word][offset] += 1
         self.sms_count[offset] += 1
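For reference, a self-contained sketch of the same bookkeeping outside the class (train_raw and the module-level names are hypothetical; they mirror the sms_value/sms_count attributes the method updates, and assume the pure-Python mmseg):

# -*- coding: utf-8 -*-
from mmseg import seg_txt

sms_value = {}      # word -> [spam_count, ham_count]
sms_count = [0, 0]  # total segmented words seen in [spam, ham]

def train_raw(sentence, is_spam):
    offset = 0 if is_spam else 1
    for word in seg_txt(sentence):
        sms_value.setdefault(word, [0, 0])[offset] += 1
        sms_count[offset] += 1

train_raw('恭喜您中奖,请点击链接领取', True)   # a spam-like sample
train_raw('今晚一起吃饭吗', False)             # a ham-like sample
print(sms_count)  # [n_spam_words, n_ham_words]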
Example #22
def segment(string):
#    alphas = ''
#    unicode = ''
#    last_is_alpha = False
#    for char in string:
#        if char.isalpha():
#            if not last_is_alpha:
#                alphas += ' '
#            alphas += char
#        else:
#            if last_is_alpha:
#                unicode += ' '
#            unicode += char
#    print "ALPHAS", alphas
#    print "UNICODE", unicode[:20]
#    return alphas + u' '.join([ txt.decode('utf8') for txt in seg_txt( unicode.encode('utf8')) ])
    return u' '.join([ txt.decode('utf8') for txt in seg_txt( string.encode('utf8')) ])
Example #23
    def count_occurance(self, text=''):
        if not isinstance(text, basestring):
            raise Exception("input must be instance of String")

        separated_by_non_alphanumerics = text.replace('/',' ').replace('\\',' ').replace('>',' ').replace('<',' ').lower()
        #print separated_by_non_alphanumerics
        without_one_or_two_words = self.__class__.one_or_two_words_re.sub('', separated_by_non_alphanumerics)
        without_dots = without_one_or_two_words.replace(".", "")
        text_chunks = self.stopwords.to_re().sub('', without_dots).split()
        
        frequencies = {}
        for word in text_chunks:
            seg = mmseg.seg_txt(word)
            for s in seg:
                frequencies[s] = frequencies.get(s, 0) + 1

        return frequencies
Example #24
    def create_action(self):
        cache_key = "WEIBO:HOT:%s" %self.user.sns_id
        cache.delete(cache_key)

        tmp_cache_key = "TEMP:WEIBO:HISTORY:%s:::" %self.user.sns_id

        weibo_history = self.user.weibo_history

        for text in weibo_history:
            terms = seg_txt(text.encode('utf-8'))
            for term in terms:
                index_key = '%s%s' %(BASIC_TAG_PREFIX, term)
                if cache.exists(index_key):
                    key = tmp_cache_key + term.decode('utf-8')
                    cache.incr(name=key, amount=1)

        keys = cache.keys(pattern="%s*" %tmp_cache_key)

        for key in keys:
            name = key.split(":::")[1]
            value = float(cache.get(key))
            cache.zadd(cache_key, value, name)
            cache.delete(key)

            tag = BasicTag.get_by_name(name=name)
            if not tag:
                continue

            relations = tag.friends
            score = tag.score

            for f in relations:
                items = f.split(':::')
                obj_name = items[0]
                obj_value = float(items[1])
                result = obj_value/50*value
                cache.zadd(cache_key, result, obj_name)

        results = cache.zrevrange(name=cache_key, start=0, num=30, withscores=True)
        tags = [result[0].decode('utf-8') +'__' + str(result[1]) for result in results]

        self.user.update(set__tags=tags)
Example #25
    def remove(self,title,item_id):
        if not title or not item_id:
            return
        
        for phrase in mmseg.seg_txt(title.encode('utf8')):
            if not phrase:
                continue
            
            phrase = phrase.decode('utf8')
            self._rem_phrase(chinese_key(phrase),item_id)

            if not self.pinyin:
                continue
            
            phrase = self.pinyin.translate(phrase)
            if not phrase:
                continue
            
            for sub_phrase in self._gen_pinyin_phrase(phrase):
                self._rem_phrase(sub_phrase,item_id)
Example #26
def get(url, headers, body):
  query = headers.get('QUERY')
  if query == None or query.strip() == '':
    return 400, 'Bad Request', 'query field is not found.', None
  params = dict((n,v) for n, v in (i.split('=', 1) for i in query.split('&')))
  if 'query' not in params:
    return 400, 'Bad Request', 'query field is not found.', None
  text = params['query']
  search_query = helpers.decode_urlencoding(text)
  # helpers.log_search_query(search_query)
  global logger
  logger.debug('incoming query: %s', text)

  terms = seg_txt(search_query)

  logger.debug('terms from query: %s', terms)

  database = xapian.Database('../indexes/')
  enquire = xapian.Enquire(database)

  l = []
  for term in terms:
    l.append(term)

  q = xapian.Query(xapian.Query.OP_OR, l)

  enquire.set_query(q)
  matches = enquire.get_mset(0, 100)

  print '%i results found.' % matches.get_matches_estimated()
  print 'Result - %i:' % matches.size()

  r = []
  for m in matches:
    # print '%i: %i%% docid=%i [%s]' % (m.rank + 1, m.percent, m.docid,\
    # m.document.get_data())
    r.append(m.document.get_data())

  print json.dumps(r)

  return 200, 'OK', json.dumps(r), None 
Example #27
    def predict(self, sentence):
        def is_zero(value):
            return value if value > 0 else 0.01

        sms = mmseg.seg_txt(sentence)
        sms = set(sms)
        sms_prob_ham = sms_prob_spam = 1
        for flag, word in enumerate(sms):
            word_prob_spam = word_prob_ham = 0
            if word in self.sms_value:
                value = self.sms_value[word]
                word_prob_spam = float(value[0]) / self.sms_count[0]  # P(word | spam)
                word_prob_ham = float(value[1]) / self.sms_count[1]   # P(word | ham)
            word_prob_spam = is_zero(word_prob_spam)
            word_prob_ham = is_zero(word_prob_ham)
            # combine: word_prob_ham acts as the complement of word_prob_spam
            prob_is_spam = word_prob_spam / (word_prob_spam + word_prob_ham)
            sms_prob_spam *= prob_is_spam
            sms_prob_ham *= (1 - prob_is_spam)
        return sms_prob_spam / (sms_prob_spam + sms_prob_ham)
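A toy numeric check of the combination rule in predict (illustrative numbers only):

# Two words: one spam-leaning (0.8), one ham-leaning (0.3).
probs = [0.8, 0.3]
spam = ham = 1.0
for p in probs:
    spam *= p        # product of per-word spam probabilities
    ham *= (1 - p)   # product of the complements
print(spam / (spam + ham))  # 0.24 / (0.24 + 0.14) ~= 0.63, leaning spam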
Example #28
	def normalize_syn_words(self,text):
		tokens=mmseg.seg_txt(text)
		word_list=[x for x in tokens]
		wlist_len=len(word_list)
		for i in xrange(wlist_len):
			if word_list[i] == "":
				continue
			curr_len=0
			j = i
			while j < wlist_len:
				curr_len+=len(word_list[j])
				if curr_len > self.max_len_to_replace:
					break
				j+=1
			while j > i:
				wrf="".join(word_list[i:j])
				if wrf in self.replace_dict:
					wrt=self.replace_dict[wrf]
					word_list[i] = wrt
					for k in xrange(i+1,j):
						word_list[k] = ""
					break
				j-=1
		return "".join(word_list)
Example #29
  def Classify(self, text):
    text = re.sub(r'\d', r' ', text)
    text_words = [w for w in mmseg.seg_txt(text)]
    #print text_words
    category = -1
    max_weight = 0.0
    best_unknown_weight = 0.0

    for cat_id, word_weights in self.cat_word_weight_.items():
      #print '---------------------------'
      weight = 0.0
      unknown_weight = 0.0
      for word in text_words:
        if len(word.strip()) == 0:
          continue
        if word in word_weights:
          w = word_weights[word] * (3 ** ((len(word) - 1) / 3))
          weight += w
          unknown_weight -= len(word) * 0.87
          #print word, w
        else:
          if word not in self.stop_words_:
            if not word[0].isalpha(): # skip unrecognized English (alphabetic) words
              unknown_weight += len(word) * 1.0
            #print word, 'unknown'
          else:
            #print word, 'stop word'
            pass
      #print 'unknown_weight', unknown_weight
      if weight > max_weight and unknown_weight < 0.0:
        max_weight = weight
        category = cat_id
        best_unknown_weight = unknown_weight
 
    # print text, ':', category, self.cat_id_name_map_[category], max_weight, best_unknown_weight
    return self.cat_id_name_map_[category]
Example #30
def txt2word(txt):
    return seg_txt(utf8_ftoj(str(txt.lower())))
Example #31
#encoding=utf-8
import mmseg
#from pymmseg import mmseg
#mmseg.dict_load_defaults()

f = open('MMSEGoutput.txt', 'w')
input = open('testinput.txt')
while True:
    text = input.readline()
    if len(text) == 0:  # EOF
        break
    for i in mmseg.seg_txt(text):
        print >> f, i, ' ',
    #f.write(testseg)
    print >> f
f.flush()
f.close()
input.close()
#f=open('1.txt','w')
#for i in mmseg.seg_txt(text):
#print >>f,i
#algor = mmseg.Algorithm(text)
#for tok in algor:
#print >>f,'%s [%d..%d]' % (tok.text, tok.start, tok.end)
#print '%s' % tok.text
Example #32
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
main.py
Author: WooParadog
Email:  [email protected]

Created on
2011-11-13
'''

import mmseg
import mmseg.search

f = open('text')
dic = {}
for word in mmseg.seg_txt(f.read()):
    if word in dic:
        dic[word] += 1
    else:
        dic[word] = 1
f.close()

import operator
word = sorted(dic.iteritems(), key=operator.itemgetter(1), reverse=True)
print word
f = open('word', 'w')
f.writelines([str(k) + ":" + str(v) + "\n" for (k, v) in word])
f.close()
Example #33
    n = 0
    for name in names:
        f = os.path.join(dir, name)
        
        print '\nFile: ', f, '...'
        nout = name + '.txt'
        if os.path.exists(nout):
            print '-- SKIPPED'
            continue
        
        fout = open(nout, 'w')

        subject, text = read_eml(f)
#        words = fc(subject)
        words = seg_txt(subject)
        fout.write('{}\n\n'.format(' '.join(words)))
        
        lines = text.splitlines()
        for line in lines:
            #text = '感谢您关注语言云,您的语言云账号已经激活。这封邮件包含您调用语言云服务时使用的token,以及一些其他帮助您快速使用语言云的信息。'
            line = line.strip()
            # print '[', line, ']'
            if line != '':
                #words = fc(line)
                words = seg_txt(line)
                fout.write(' '.join(words) + '\n')
                for w in words:
                    print w
                    
        fout.close()
Example #34
 def append(self, txt):
     for i in set(seg_txt(str(txt.lower()))):
         self._idf[i] += 1
     self._count += 1
Example #35
def generate_feature(wb, dict):
	fea = [0]*len(dict)
	# weibo text
	word_count = 0
	for wd in seg_txt(wb.text.encode('utf-8','ignore')):
		wd = wd.lower().strip()
		if len(wd)>3 and wd in dict:
			fea[dict[wd]] += 1
			word_count += 1
	print 'found %d words in this weibo' % word_count

	# add user features
	owner = wb.owner
	fea.append(int(owner.w_province))
	fea.append(int(owner.w_city))
	if owner.w_url:
		fea.append(1)
	else:
		fea.append(0)
	fea.append(len(owner.w_description))
	if 'm' in owner.w_gender:
		fea.append(1)
	else:
		fea.append(0)

	fea.append(int(owner.w_followers_count))
	fea.append(int(owner.w_friends_count))
	fea.append(int(owner.w_statuses_count))
	fea.append(int(owner.w_favourites_count))
	fea.append(int(owner.w_bi_followers_count))
	fea.append((datetime.now()-owner.w_created_at).days/100)
	if owner.w_verified:
		fea.append(1)
	else:
		fea.append(0)


	# add weibo features
	fea.append(int(wb.reposts_count))
	fea.append(int(wb.comments_count))
	fea.append(int(wb.attitudes_count))
	if re.search("#.*?#", wb.text):
		fea.append(1)
	else:
		fea.append(0)

	fea.append(len(wb.text))
	own_text = re.search("(.*?)//@", wb.text)
	if own_text:
		fea.append(len(own_text.group(1)))
	else:
		fea.append(len(wb.text))
	# TODO: categorize the weibo source field
	fea.append(len(wb.source))

	if wb.retweeted_status:
		fea.append(0)
	else:
		fea.append(1)

	if wb.thumbnail_pic:
		fea.append(1)
	else:
		fea.append(0)
	fea.append(wb.created_at.hour)
	fea.append(wb.created_at.weekday())
	# TODO: apply a time-decay formula to repost/comment counts

	return fea
Example #36
 def txt_tag_generator(self):
     word2id = self.word2id
     for k, v in self._txt_tag_generator():
         words = [i for i in list(seg_txt(str(k).lower())) if not i.isdigit()]
         yield word2id.id_list_by_word_list(words) , v
Example #37
def tf_idf_seg_txt(txt):
    txt = txt.replace('。', ' ').replace(',', ' ')
    word_list = list(seg_txt(txt))
    return tf_idf(word_list)
Example #38
 def tokenize(self, stream):
     import mmseg
     for chunk in self._imp_tokenizer.tokenize(stream):
         r = mmseg.seg_txt(chunk.encode('utf8', 'ignore'))
         for word in r:
             yield word.decode('utf8', 'ignore')
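A hedged, self-contained sketch of the same idea with the wrapped tokenizer replaced by a plain iterable of unicode chunks (tokenize_chunks is a hypothetical name; assumes the pure-Python mmseg):

# -*- coding: utf-8 -*-
import mmseg

def tokenize_chunks(chunks):
    # chunks: any iterable of unicode strings; yields unicode words
    for chunk in chunks:
        for word in mmseg.seg_txt(chunk.encode('utf8', 'ignore')):
            yield word.decode('utf8', 'ignore')

print(list(tokenize_chunks([u'今天天气不错', u'hello world'])))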
Example #39
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
main.py
Author: WooParadog
Email:  [email protected]

Created on
2011-11-13
'''

import mmseg
import mmseg.search

f = open('text')
dic = {}
for word in mmseg.seg_txt(f.read()):
    if word in dic:
        dic[word] += 1
    else:
        dic[word] = 1
f.close()

import operator
word = sorted(dic.iteritems(), key=operator.itemgetter(1),reverse=True)
print word
f = open('word','w')
f.writelines([str(k)+":"+str(v)+"\n" for (k,v) in word])
f.close()
Example #40
 def txt_tag_generator(self):
     word2id = self.word2id
     for k, v in self._txt_tag_generator():
         words = [i for i in list(seg_txt(str(k).lower())) if not i.isdigit()]
         yield word2id.id_list_by_word_list(words), v
Example #41
	def separatewords(self,text):
		print  [s.lower() for s in seg_txt(text.encode('utf-8')) if s!='']
		return [s.lower() for s in seg_txt(text.encode('utf-8')) if s!='']

	def isindexed(self,url):
		u=self.con.execute \
		  ("select rowid from urllist where url='%s'" % url).fetchone()
		if u!=None:
			v=self.con.execute(
					'select * from wordlocation where urlid=%d' % u[0]).fetchone()
			if v!=None:
				print "indexed :",url
				return True
		return False

	def addlinkref(self,urlFrom,urlTo,linkText):
		fromid=self.getentryid('urllist','url',urlFrom)
		toid=self.getentryid('urllist','url',urlTo)

		cur=self.con.execute(
				"select rowid from link where fromid='%s' and toid='%s'" % (fromid,toid))
		res=cur.fetchone()
		if res==None:
			cur=self.con.execute(
				"insert into link (fromid,toid) values ('%s','%s')" %(fromid,toid))
			linkid=cur.lastrowid
		else:
			linkid=res[0]

		words=self.separatewords(linkText)
		for word in words:
			wordid=self.getentryid('wordlist','word',word)
			cur=self.con.execute("insert into linkwords (wordid,linkid) values ('%s','%s')" %(linkid,wordid))

	def crawl(self,pages,depth=2):
		for i in range(depth):
			newpages=set()
			for page in pages:
				try:
					c=urllib2.urlopen(page)
				except:
					print "Could not open %s" % page
					continue
				soup=BeautifulSoup(c.read())

				if not self.isindexed(page):
					self.addtoindex(page,soup)
				else:
					continue

				links=soup('a')
				for link in links:
					if('href' in dict(link.attrs)):
						url=urljoin(page,link['href'])
						if url.find("'")!=-1: continue
						url=url.split('#')[0]
						if url[0:4]=='http' and not self.isindexed(url):
							newpages.add(url)
						linkText=self.gettextonly(link)
						self.addlinkref(page,url,linkText)

				self.dbcommit()
			pages=newpages

	def createindextables(self):
		self.con.execute('create table IF NOT EXISTS urllist(url)')
		self.con.execute('create table IF NOT EXISTS wordlist(word)')
		self.con.execute('create table IF NOT EXISTS wordlocation(urlid integer,wordid integer,location)')
		self.con.execute('create table IF NOT EXISTS link(fromid integer,toid integer)')
		self.con.execute('create table IF NOT EXISTS linkwords(wordid integer,linkid integer)')
		self.con.execute('create index IF NOT EXISTS wordidx on wordlist(word)')
		self.con.execute('create index IF NOT EXISTS urlidx on urllist(url)')
		self.con.execute('create index IF NOT EXISTS wordurlidx on wordlocation(wordid)')
		self.con.execute('create index IF NOT EXISTS urltoidx on link(toid)')
		self.con.execute('create index IF NOT EXISTS urlfrom on link(fromid)')
		self.dbcommit()

	def calculatepagerank(self,iterations=20):
		self.con.execute('drop table if exists pagerank')
		self.con.execute('create table pagerank(urlid primary key,score)')
		self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
		self.dbcommit()
		for i in range(iterations):
			print "Iteration %d" % (i)
			for (urlid,) in self.con.execute('select rowid from urllist'):
				pr=0.15
				for (linker,) in self.con.execute('select distinct fromid from link where toid=%d' % urlid):
					linkingpr=self.con.execute('select score from pagerank where urlid=%d' % linker).fetchone()[0]
					linkingcount=self.con.execute('select count(*) from link where fromid=%d' % linker).fetchone()[0]
					pr+=0.85*(linkingpr/linkingcount)
					self.con.execute('update pagerank set score=%f where urlid=%d' % (pr,urlid))
		self.dbcommit()
Example #42
#!/usr/bin/env python
#coding:utf-8

from __future__ import print_function

import sys
from mmseg import seg_txt
for line in sys.stdin:
  blks = str.split(line)
  out_line = blks[0]
  for i in range(1, len(blks)):
    if blks[i] == "[VOCALIZED-NOISE]" or blks[i] == "[NOISE]" or blks[i] == "[LAUGHTER]":
      out_line += " " + blks[i]
      continue
    for j in seg_txt(blks[i]):
      out_line += " " + j
  print(out_line)
Example #43
 def append(self, txt):
     for i in set(seg_txt(str(txt.lower()))):
         self._idf[i] += 1
     self._count += 1
Example #44
#!/usr/bin/env python
# coding:utf-8
import sys
from mmseg import seg_txt
for line in sys.stdin:
  blks = str.split(line)
  out_line = blks[0]
  for i in range(1, len(blks)):
    if blks[i] == "[VOCALIZED-NOISE]" or blks[i] == "[NOISE]" or blks[i] == "[LAUGHTER]":
      out_line += " " + blks[i]
      continue
    for j in seg_txt(blks[i]):
      out_line += " " + j
  print out_line     
Example #45
#!/usr/bin/env python3
# coding:utf-8

import sys

from mmseg import seg_txt

for line in sys.stdin:
    blks = str.split(line)
    out_line = blks[0]
    for i in range(1, len(blks)):
        if (blks[i] == "[VOCALIZED-NOISE]" or blks[i] == "[NOISE]"
                or blks[i] == "[LAUGHTER]"):
            out_line += " " + blks[i]
            continue
        for j in seg_txt(blks[i].encode()):
            out_line += " " + j.decode()
    print(out_line)