Example #1
def do_category():

  print "[1] %s - loading titles" % (diff())

  cat_titles = load_json("categories", "all_cats_3000")

  n_docs = sum(len(v) for k,v in cat_titles.iteritems())

  print "[2] %s - loading pos tags, n_docs = %s" % (diff(), n_docs)

  cat_titles_pos = list( (cat, t, rpos.value_by_title(cat,t)) for cat, ts in cat_titles.iteritems() for t in ts)

  print "[3] %s - extractings words" % diff()

  def calc_idf(w):
    n = count_occurrences(w, cat_titles_pos)
    try:
      return (w, math.log(n_docs / float(n)))
    except ZeroDivisionError:
      return (w, 0)
  
  words = []
  for c_t_p in cat_titles_pos:
    words += c_t_p[2].keys()
  words = set(words)

  print "[4] %s - calculating idfs, n_words = %s" % (diff(), len(words))

  words_idfs = dict(map(calc_idf, words))

  print "[5] %s - saving idfs" % diff()

  ridf.puts("all", words_idfs)

  print "[6] %s - method finisheed" % (diff())
Example #2
def update_symspell_dict(input_file, output_file):
    '''
    @description: Build the symspell key-value dictionary
    @param {type} 
    @return:
    '''
    dict_tmp = symspell(
        input_file=input_file,
        res_dict=load_json(
            "/home/zixiang/Projects/text_correction/codebase/data/symspell_dict.json"
        ))
    save_json(dict_tmp, output_file)
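# A minimal usage sketch for update_symspell_dict; both paths below are
# placeholders, not the original project layout:
update_symspell_dict(
    input_file="data/sample_words.txt",
    output_file="data/symspell_dict_updated.json")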
Example #3
def test():
    '''
    @description: Test the symspell functionality
    @param {type} 
    @return:
    '''
    #dict_tmp = symspell("/home/zixiang/ExtraData/xmnlp/tests/sample_words.txt")
    #save_json(dict_tmp,"/home/zixiang/Projects/text_correction/codebase/data/symspell.json")

    tmp_dict = load_json(config.new_symspell_json)

    #print("tmp_dict", tmp_dict)
    print(cheaksmyspell(tmp_dict, "小孩同学"))
Example #4
def get_tmp_result():
    global count
    tmp_dict = load_json(
        "/home/zixiang/Projects/text_correction/codebase/data/symspell_dict.json"
    )
    print("symspell_dict loaded")
    #print("tmp_dict", tmp_dict)
    #print(cheaksmyspell(tmp_dict,"新兴肥业"))
    import pandas as pd

    df = pd.read_excel("/home/zixiang/Downloads/热词评测20200123.xlsx",
                       sheet_name=2)

    #print(df["query"])
    df["new_result"] = df["query"].apply(cheaksmyspell_apply)

    print(df.head())
    df.to_excel("/home/zixiang/Downloads/热词评测20200123_new.xlsx",
                index=None,
                columns=["query", "rewrite_query", "new_result"],
                encoding="utf_8")
    print("count 影响面", count)
Example #5
def cheaksmyspell_apply(x):
    '''
    @description: Apply symspell correction to a single query (used with DataFrame.apply)
    @param {type} 
    @return: 
    '''
    global count
    pattern_type = 0
    symspell_dict = load_json(
        "/home/zixiang/Projects/text_correction/codebase/data/symspell_sample_dict.json"
    )
    tmp_condidate = set()
    matched = find_pattern(x)
    if matched is not None:
        print(matched)
        x, pattern_type = matched
    deletes = get_deletes(x)

    # Look up the symspell dictionary
    for d in deletes:
        if d in symspell_dict:
            for i in symspell_dict[d][0]:
                tmp_condidate.add(i)

    if len(tmp_condidate) == 0:
        return x
    else:
        count += 1
        if pattern_type == 0:
            return str(list(tmp_condidate)[0])
        elif pattern_type == 1:
            return "怎么预防" + str(list(tmp_condidate)[0])
        elif pattern_type == 2:
            return "什么是" + str(list(tmp_condidate)[0])
        elif pattern_type == 3:
            return str(list(tmp_condidate)[0]) + "怎么预防"
        elif pattern_type == 4:
            return str(list(tmp_condidate)[0]) + "是什么"
        elif pattern_type == 5:
            return "播放" + str(list(tmp_condidate)[0]) + "的新闻"
Example #6
def test_pattern_symspell(query: list) -> list:
    '''
    @description: Use the extracted pattern trie to correct erroneous sentences.
    @param {type} 
    @return: 
    '''
    trie = load_pattern_trie_tree()
    symspell_dict = load_json(
        "/home/zixiang/Projects/text_correction/codebase/data/symspell_dict.json"
    )
    base_path = "/home/zixiang/DataSets/berttestdata/xiaoaiquerylog"

    tmp_condidate = set()
    isword, word_path, pattern = trie.search(query)

    #logger.debug("pattern {} wororigin_patternd_path {}".format(pattern,word_path))
    if not isword and len(word_path) == 0:
        return []

    add_flag = True
    origin_words_list = [i for i in word_path.split("*") if i]
    tmp_string = ""

    for i in pattern:
        if i == "*":
            if add_flag:
                tmp_string += i
                add_flag = False
        else:
            tmp_string += i
            add_flag = True

    origin_pattern = tmp_string.replace("*", "{}")

    #logger.debug("origin_pattern {}".format(origin_pattern))
    #logger.debug("origin_word_list {}".format(origin_words_list))
    length = len(origin_words_list)

    tmp_list = []
    for i in range(length):
        tmp_list.append([])

    # Handle the case where errors occur at multiple positions
    for i in range(length):
        deletes = get_deletes(origin_words_list[i])
        #查询symspell词典
        for d in deletes:
            if d in symspell_dict:
                for j in symspell_dict[d][0]:
                    if j in tmp_list[i]:
                        continue
                    tmp_list[i].append(j)

    if len(tmp_list) == 0:
        return []

    # Build every combination that picks one candidate per slot, joined with
    # commas (on Python 3 this needs `from functools import reduce`).
    fn = lambda x, code=',': reduce(
        lambda x, y: [str(i) + code + str(j) for i in x for j in y], x)
    res = fn(tmp_list)

    for line in res:
        tmp_condidate.add((origin_pattern.format(*(line.split(",")))))

    return list(tmp_condidate)
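# The reduce-based fn above enumerates every combination that picks one
# candidate per slot and joins it into a comma-separated string. An equivalent
# sketch with itertools.product (not the original code), on a hypothetical input:
from itertools import product

tmp_list = [["甲", "乙"], ["丙"]]  # hypothetical per-slot candidate lists
res = [",".join(str(c) for c in combo) for combo in product(*tmp_list)]
# res == ["甲,丙", "乙,丙"]; each entry can be split on "," and passed to
# origin_pattern.format(*...) exactly like the output of fn above.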
Example #7
import enchant

from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk import data
from nltk.corpus import stopwords

from utils import persistence_path
from io_utils import read, load_json
from ast import literal_eval

# Pos Tagging
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger


STOPS = literal_eval(read(persistence_path() + "/love_the_data/stop_words.txt"))
SPECIAL = literal_eval(read(persistence_path() + "/love_the_data/special_characters.txt"))
LETTER_FREQ = dict(load_json("love_the_data","english-letter-frequencies")["letters"])
EN_US_DICT = enchant.Dict("en_US")
EN_GB_DICT = enchant.Dict("en_GB")
TAGGER = PerceptronTagger()
PORTER = PorterStemmer()
WN_LEMMATIZER = WordNetLemmatizer()
SENTENCE_DETECTOR = data.load('tokenizers/punkt/english.pickle')


'''
  CC Coordinating conjunction
  CD Cardinal number
  DT Determiner
  EX Existential there
  FW Foreign word
  IN Preposition or subordinating conjunction
Example #8
def load_corpus(corpus_path):
    corpus = load_json(corpus_path)

    return corpus
from io_utils import flatten_hash, load_json, save_json
from utils import good_title
from rediss import RFeature
import re


rfeature = RFeature()
pattern = re.compile(r"(Category:|List of|File:).*")
categories = ["biology", "physics", "chemistry"]

r = {}
for cat in categories:
    r[cat] = []
    l = set(good_title(title) for title in flatten_hash(load_json("categories", "%s_titles_1" % cat)) if not pattern.search(title))
    for title, fvector in rfeature.key_value_by_titles(cat, l):
        s = sum(map(lambda x: x[1], fvector)) 
        if s > 500:
            r[cat].append(title)
    

save_json("categories", "all_cats_3000", r)
#print len(r)